In [0]:
import os
import pygal
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats import chisquare
from datetime import datetime, timedelta

%matplotlib inline

ModuleNotFoundError: ignored

In [0]:
def plot_barstack(data):
    """Build a pygal stacked-bar chart of the relative frequency of each category.

    Args:
        data: pandas Series holding a categorical variable.

    Returns:
        pygal.StackedBar titled 'Categorical variable distribution' with one
        single-value series per category, added in sorted category order.
    """
    # Relative frequencies (normalize=True), ordered by category label.
    shares = data.value_counts(True).sort_index()
    chart = pygal.StackedBar()
    chart.title = 'Categorical variable distribution'
    # Each row of the flattened frame is a (category, relative frequency) pair.
    for row in shares.reset_index().values:
        label, share = row
        chart.add(label, share)
    return chart

def plot_spc(df, date_col, cont_col, freq="month", n_sigmas = 3):
    """Build a pygal control chart of the per-period mean of a continuous column.

    Rows are bucketed by ``date_col`` truncated to ``freq``, ``cont_col`` is
    averaged inside each bucket, and three flat reference lines are added: the
    mean of those bucket averages, and that mean minus/plus ``n_sigmas`` times
    their standard deviation.

    Args:
        df: DataFrame containing ``date_col`` and ``cont_col``.
        date_col: name of a datetime-like column (it is read through ``.dt``).
        cont_col: name of the numeric column to monitor.
        freq: bucketing granularity, one of "day", "month" or "year".
        n_sigmas: half-width of the control band, in standard deviations.

    Returns:
        pygal.Line whose x labels are the bucket labels and whose series are
        ``cont_col``, "μ", "μ-{n_sigmas}σ" and "μ+{n_sigmas}σ", in that order.

    Raises:
        ValueError: if ``freq`` is not one of the supported granularities.
    """
    # strftime pattern that collapses a timestamp onto the start of its bucket.
    date_formats = {
        "month": "%Y-%m-01",
        "day": "%Y-%m-%d",
        "year": "%Y-01-01",
    }
    if freq not in date_formats:
        # Previously an unknown freq left the format unbound and failed later
        # with an UnboundLocalError that did not mention the bad argument.
        raise ValueError(
            f"Unsupported freq {freq!r}; expected one of {sorted(date_formats)}"
        )

    aux = df[[date_col, cont_col]].copy()
    aux[freq] = aux[date_col].dt.strftime(date_formats[freq])
    aux = aux[[freq, cont_col]].groupby(freq).mean()

    # Centre line and band half-width are derived from the bucket means.
    center = aux[cont_col].mean()
    half_width = n_sigmas * aux[cont_col].std()
    aux[u"μ"] = center
    aux[f"μ-{n_sigmas}σ"] = center - half_width
    aux[f"μ+{n_sigmas}σ"] = center + half_width

    line_chart = pygal.Line()
    line_chart.title = 'Statistical Control Process'
    # Hand pygal plain Python lists: a pandas Index with several labels raises
    # when evaluated as a boolean, so it is unsafe as a drop-in for a list.
    line_chart.x_labels = list(aux.index)
    for col in aux.columns:
        line_chart.add(col, aux[col].tolist())
    return line_chart

def plot_histogram(data, n_bins = 10):
    """Build a pygal histogram of ``data``.

    Args:
        data: array-like of numeric values, binned with ``numpy.histogram``.
        n_bins: value forwarded as ``bins`` to ``numpy.histogram``.

    Returns:
        pygal.Histogram holding a single series named 'Wide bars' whose items
        are ``(count, bin_start, bin_end)`` tuples, one per bin.
    """
    counts, edges = np.histogram(data, bins=n_bins)
    # Pair every count with the left and right edge of its bin.
    bars = [
        (count, left, right)
        for count, left, right in zip(counts, edges[:-1], edges[1:])
    ]
    chart = pygal.Histogram()
    chart.add('Wide bars', bars)
    return chart

In [0]:
# Folder holding the churn CSV files. It defaults to the original location and
# can be redirected with the CHURN_DATA_DIR environment variable, so the
# notebook can run on a machine that lacks this home directory.
data_dir = os.environ.get("CHURN_DATA_DIR", "/home/oscar/Desktop/Ejercicio")

# One DataFrame per CSV file.
df = pd.read_csv(os.path.join(data_dir, "churn.csv"))
train = pd.read_csv(os.path.join(data_dir, "churn_train.csv"))
test = pd.read_csv(os.path.join(data_dir, "churn_test.csv"))
val = pd.read_csv(os.path.join(data_dir, "churn_val.csv"))

NameError: ignored

In [0]:
# Synthetic date column for the training set: the row whose index label is i is
# dated i days before one shared reference timestamp. Taking datetime.now()
# once (instead of once per row) gives every row the same time of day.
reference_time = datetime.now()
train["fecha"] = train.index.map(lambda days_back: reference_time - timedelta(days_back))

# Cast these two columns to str in all three data sets.
for frame in (train, test, val):
    for column in ("churn", "area code"):
        frame[column] = frame[column].astype(str)

In [0]:
# Root folder for every spreadsheet and chart written by this notebook. It
# defaults to the original location and can be redirected with the
# CHURN_OUTPUT_DIR environment variable.
path_to_save = os.environ.get("CHURN_OUTPUT_DIR", "/home/oscar/Desktop/Equipo Oscar")

In [0]:
# Columns compared with the Kolmogorov-Smirnov test and drawn as histograms.
ls_cont = [
    "account length",
    "number vmail messages",
    # day usage
    "total day minutes",
    "total day calls",
    "total day charge",
    # evening usage
    "total eve minutes",
    "total eve calls",
    "total eve charge",
    # night usage
    "total night minutes",
    "total night calls",
    "total night charge",
    # international usage
    "total intl minutes",
    "total intl calls",
    "total intl charge",
    "customer service calls",
]

# Columns compared with the chi-square test and drawn as stacked bars.
ls_disc = [
    "state",
    "area code",
    "international plan",
    "voice mail plan",
    "churn",
]

In [0]:
# Kolmogorov-Smirnov two-sample test of every continuous feature, train vs. test.
cst = pd.DataFrame(columns=["p-value", "¿Son iguales?", "sets"])
for feature in ls_cont:
    ks_result = ks_2samp(train[feature], test[feature])
    cst.loc[feature, "p-value"] = ks_result.pvalue
# A p-value below 0.05 is reported as "No" (not equal); anything else as "Si".
cst["¿Son iguales?"] = ["No" if p_value < 0.05 else "Si" for p_value in cst["p-value"]]
cst["sets"] = "train/test"

In [0]:
# Kolmogorov-Smirnov two-sample test of every continuous feature, train vs. val.
csv = pd.DataFrame(columns=["p-value", "¿Son iguales?", "sets"])
for feature in ls_cont:
    ks_result = ks_2samp(train[feature], val[feature])
    csv.loc[feature, "p-value"] = ks_result.pvalue
# A p-value below 0.05 is reported as "No" (not equal); anything else as "Si".
csv["¿Son iguales?"] = ["No" if p_value < 0.05 else "Si" for p_value in csv["p-value"]]
csv["sets"] = "train/val"

In [0]:
# Stack the train/test and train/val Kolmogorov-Smirnov tables and export them.
# `cs` was never defined; the train/test table is named `cst`.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([cst, csv]).to_excel(os.path.join(path_to_save, "pruebas.xlsx"))

In [0]:
# Chi-square goodness-of-fit test of every discrete feature: do the category
# counts observed in `test` match the category shares seen in `train`?
dst = pd.DataFrame(columns=["p-value", "¿Son iguales?", "sets"])
for x in ls_disc:
    observed = test[x].value_counts()
    reference_share = train[x].value_counts(normalize=True)
    # Align both sets on the union of their categories so every position refers
    # to the same label, even when a label is missing from one of the sets.
    categories = observed.index.union(reference_share.index)
    f_obs = observed.reindex(categories, fill_value=0)
    # chisquare works on counts, not proportions: scale the train shares to the
    # size of the observed sample so both vectors add up to the same total.
    # A label seen only in `test` gets an expected count of 0, which makes the
    # statistic infinite and the p-value 0.
    f_exp = reference_share.reindex(categories, fill_value=0) * f_obs.sum()
    dst.loc[x, "p-value"] = chisquare(f_obs=f_obs.values, f_exp=f_exp.values).pvalue
# Same reading as the Kolmogorov-Smirnov tables: p < 0.05 rejects equality.
dst["¿Son iguales?"] = dst["p-value"].map(lambda p: "No" if p < 0.05 else "Si")
dst["sets"] = "train/test"

In [0]:
# Chi-square goodness-of-fit test of every discrete feature: do the category
# counts observed in `val` match the category shares seen in `train`?
dsv = pd.DataFrame(columns=["p-value", "¿Son iguales?", "sets"])
for x in ls_disc:
    observed = val[x].value_counts()
    reference_share = train[x].value_counts(normalize=True)
    # Align both sets on the union of their categories so every position refers
    # to the same label, even when a label is missing from one of the sets.
    categories = observed.index.union(reference_share.index)
    f_obs = observed.reindex(categories, fill_value=0)
    # chisquare works on counts, not proportions: scale the train shares to the
    # size of the observed sample so both vectors add up to the same total.
    # A label seen only in `val` gets an expected count of 0, which makes the
    # statistic infinite and the p-value 0.
    f_exp = reference_share.reindex(categories, fill_value=0) * f_obs.sum()
    dsv.loc[x, "p-value"] = chisquare(f_obs=f_obs.values, f_exp=f_exp.values).pvalue
# Same reading as the Kolmogorov-Smirnov tables: p < 0.05 rejects equality.
dsv["¿Son iguales?"] = dsv["p-value"].map(lambda p: "No" if p < 0.05 else "Si")
dsv["sets"] = "train/val"

In [0]:
# Stack the train/test and train/val chi-square tables and export them.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([dst, dsv]).to_excel(os.path.join(path_to_save, "pruebas_disc.xlsx"))

In [0]:
# For every continuous feature, render one histogram per data set plus the
# monthly control chart of the training set.
continuous_dir = os.path.join(path_to_save, "continuas")
# The PNG files are written inside this folder, so make sure it exists first.
os.makedirs(continuous_dir, exist_ok=True)
for feat in ls_cont:
    for set_name, frame in (("train", train), ("test", test), ("val", val)):
        plot_histogram(frame[feat]).render_to_png(os.path.join(continuous_dir, f"{feat}_{set_name}.png"))
    # NOTE(review): this chart is built from `train` but saved with a
    # "_spc_test" suffix; the file name is kept as-is, confirm it is intended.
    plot_spc(cont_col=feat, date_col="fecha", df=train, freq="month", n_sigmas=3).render_to_png(os.path.join(continuous_dir, f"{feat}_spc_test.png"))

In [0]:
# For every discrete feature, render one stacked-bar chart per data set.
discrete_dir = os.path.join(path_to_save, "discretas")
# The PNG files are written inside this folder, so make sure it exists first.
os.makedirs(discrete_dir, exist_ok=True)
for feat in ls_disc:
    for set_name, frame in (("train", train), ("test", test), ("val", val)):
        plot_barstack(frame[feat]).render_to_png(os.path.join(discrete_dir, f"{feat}_{set_name}.png"))