# EDA

In [37]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

In [59]:
data = pd.read_csv('../data/dataset.csv', delimiter=';', decimal=',')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 100 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rev_Mean          99643 non-null   float64
 1   mou_Mean          99643 non-null   float64
 2   totmrc_Mean       99643 non-null   float64
 3   da_Mean           99643 non-null   float64
 4   ovrmou_Mean       99643 non-null   float64
 5   ovrrev_Mean       99643 non-null   float64
 6   vceovr_Mean       99643 non-null   float64
 7   datovr_Mean       99643 non-null   float64
 8   roam_Mean         99643 non-null   float64
 9   change_mou        99109 non-null   float64
 10  change_rev        99109 non-null   float64
 11  drop_vce_Mean     100000 non-null  float64
 12  drop_dat_Mean     100000 non-null  float64
 13  blck_vce_Mean     100000 non-null  float64
 14  blck_dat_Mean     100000 non-null  float64
 15  unan_vce_Mean     100000 non-null  float64
 16  unan_dat_Mean     10

In [60]:
data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


In [61]:
def correct_missings(df, miss_pct_th = 33, th_num = 0.05, th_chi=0.05):
    
    missings_pct = (df.isnull().sum()/len(df)) * 100
    
    # Eliminamos directamente las columnas con un % de missing superior al 33%

    cols_to_drop = missings_pct[(missings_pct > miss_pct_th)].index.tolist()
    
    print("Se van a dropear las siguientes por excesivos nulos:\n", cols_to_drop)

    df = df.drop(columns = cols_to_drop)
    
    # Las columnas con missings entre 0 y 33 las dividiremos en 2 grupos, numéricas y categóricas:
    
    columns_missings = missings_pct[(missings_pct < miss_pct_th) & (missings_pct > 0)].index.tolist()
    df_missings = df[columns_missings + ['churn']]
    
    df_num_missings = df_missings.select_dtypes(include=[np.number])
    df_cat_missings = df_missings.select_dtypes(include=[object])
    
    # Numéricas
    #print("Las columnas numéricas con nulos son las siguientes:\n", df_num_missings.index )
    
    corr_with_churn = df_num_missings.corrwith(df_num_missings['churn'])
    
    cols_to_keep = corr_with_churn[abs(corr_with_churn) >= th_num].index.tolist()
    
    cols_to_impute = [col for col in cols_to_keep if col != 'churn']

    print("Se van a imputar con la mediana las siguientes columnas:\n", cols_to_impute)

    for col in cols_to_impute:
        median = df_num_missings[col].median()
        df[col] = df[col].fillna(median)
    
    cols_to_drop = corr_with_churn[abs(corr_with_churn) < th_num].index.tolist()
    
    print("Se van a dropear las siguientes por baja correlación con la columna churn:\n", cols_to_drop)
    
    df = df.drop(columns = cols_to_drop)
    
    # Categóricas
    
    if 'churn' not in df_cat_missings.columns:
        df_cat_missings['churn'] = df['churn']
        
    def chi2_test(cols, target):
        cont_table = pd.crosstab(cols, target)
        res = chi2_contingency(cont_table)
        return res.pvalue
    
    chi2_res = df_cat_missings.apply(lambda x: chi2_test(x, df['churn'])).sort_values()
    
    #print(chi2_res)
    
    cols_to_keep = chi2_res[chi2_res <= th_chi].index.tolist()

    cols_to_impute = [col for col in cols_to_keep if col != 'churn']
    
    print("Se van a imputar con la moda las siguientes columnas:\n", cols_to_impute)
    
    for col in cols_to_impute:
        mode = df_cat_missings[col].mode()[0]
        df[col] = df[col].fillna(mode)
            
    cols_to_drop = chi2_res[chi2_res > th_chi].index.tolist()
    
    print("Se van a dropear las siguientes por baja correlación con la columna churn:\n", cols_to_drop)
    
    df = df.drop(columns = cols_to_drop)
    
    return df
    

In [62]:
def correct_outliers(df, outlier_rate = 3):
    df = df.select_dtypes(include=[np.number])
    features = df.columns.to_list()

    for f in features:
        Q1 = np.percentile(df[f],25)
        Q3 = np.percentile(df[f],75)
        IQR = Q3 - Q1

        low_bound = Q1 - (IQR * outlier_rate)
        up_bound = Q3 + (IQR * outlier_rate)

        median = df[f].median()

        df[f] = np.where((df[f] < low_bound) | (df[f] > up_bound), median, df[f])

        df = df.loc[:, (df != df.iloc[0]).any()]

    return df

In [63]:
def feature_engineering(df, th = 0.8):
    df = df.select_dtypes(include=[np.number])
    corr_matrix = df.corr().abs()
    corr_triu = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    cols_to_drop = []
    for column in corr_triu.columns:
        for row in corr_triu.index:
            if corr_triu.loc[row, column] > th:
                print(f"Par: ({row}, {column}) - Correlación: {corr_triu.loc[row, column]:.2f}")
                cols_to_drop.append(column)
                break
    df_red = df.drop(columns = cols_to_drop)

    return df_red

Eliminamos directamente las columnas que tienen unos valores de missing mayores al 33%

In [64]:
data.drop(["Customer_ID"], axis = 1, inplace=True)

data = correct_missings(data, miss_pct_th = 0.33)

data = correct_outliers(data)

Se van a dropear las siguientes por excesivos nulos:
 ['rev_Mean', 'mou_Mean', 'totmrc_Mean', 'da_Mean', 'ovrmou_Mean', 'ovrrev_Mean', 'vceovr_Mean', 'datovr_Mean', 'roam_Mean', 'change_mou', 'change_rev', 'avg6mou', 'avg6qty', 'avg6rev', 'prizm_social_one', 'hnd_price', 'hnd_webcap', 'truck', 'rv', 'ownrent', 'lor', 'dwlltype', 'marital', 'adults', 'infobase', 'income', 'numbcars', 'HHstatin', 'dwllsize', 'forgntvl', 'ethnic', 'kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17', 'creditcd']
Se van a imputar con la mediana las siguientes columnas:
 ['eqpdays']
Se van a dropear las siguientes por baja correlación con la columna churn:
 ['phones', 'models']
Se van a imputar con la moda las siguientes columnas:
 ['dualband', 'area', 'refurb_new']
Se van a dropear las siguientes por baja correlación con la columna churn:
 []


In [65]:
data = feature_engineering(data)

data.to_csv('../data/data_clean.csv', index=False)

Par: (plcd_vce_Mean, comp_vce_Mean) - Correlación: 0.95
Par: (ccrndmou_Mean, cc_mou_Mean) - Correlación: 0.83
Par: (recv_vce_Mean, inonemin_Mean) - Correlación: 0.85
Par: (plcd_vce_Mean, peak_vce_Mean) - Correlación: 0.81
Par: (plcd_vce_Mean, attempt_Mean) - Correlación: 0.99
Par: (plcd_vce_Mean, complete_Mean) - Correlación: 0.95
Par: (totcalls, totmou) - Correlación: 0.81
Par: (totrev, adjrev) - Correlación: 0.99
Par: (totcalls, adjmou) - Correlación: 0.81
Par: (totcalls, adjqty) - Correlación: 0.99
Par: (avgmou, avgqty) - Correlación: 0.81
Par: (avgmou, avg3mou) - Correlación: 0.83
Par: (plcd_vce_Mean, avg3qty) - Correlación: 0.81


# Modelos

## Train-Test Split

In [66]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
X = data[data.columns.difference(['churn'])]
y = data['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77,stratify=y)

## One-Hot Encoding

In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

OHE = OneHotEncoder(handle_unknown='ignore')
scaler = RobustScaler()
RFC = RandomForestClassifier(random_state=77)

cat_features = X_train.select_dtypes(include=['object']).columns
num_features = X_train.select_dtypes(include = ['int64', 'float64']).columns

num_transf = Pipeline(steps=[
    ('scaler', scaler),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])

transformer = ColumnTransformer([('cat', OHE, cat_features), ('num', num_transf, num_features)])

models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=77),
        'params': {
            'classifier__C': [ 1, 10, 100],
            'classifier__max_iter': [256, 512, 1024]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=77),
        'params': {
            'classifier__n_estimators': [128, 256, 512],
            'classifier__max_depth': [None, 10, 20],
            'classifier__criterion':['gini', 'entropy', 'log_loss'],
            'classifier__max_features':['sqrt', 'log2']
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=77),
        'params': {
            'classifier__n_estimators': [128, 256, 512],
            'classifier__learning_rate': [0.01, 0.1, 1]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'classifier__n_neighbors': [5, 7, 9]}
    }
}

best_model = None
best_params = None
best_score = -1
best_model_name = None

for name, model_dict in models.items():
    pipe = Pipeline([
        ('preprocessing', transformer),
        ('feature_selection', SelectKBest(f_classif, k=20)),
        ('classifier', model_dict['model'])
    ])
    
    grid = GridSearchCV(pipe, param_grid=model_dict['params'], cv=2, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], refit='roc_auc', verbose = 3)
    grid.fit(X_train, y_train)
    
    if grid.best_score_ > best_score:
            best_score = grid.best_score_
            best_model = grid.best_estimator_
            best_params = grid.best_params_
            best_model_name = name
    
    print(f"Los mejores parámetros para el modelo {name} son: {grid.best_params_}")
    print(f" La CV score para el modelo {name} es: {grid.best_score_:.2f}")

print(f"\nEl mejor modelo es {best_model_name} con parámetros {best_params} y un score de {best_score:.2f}")


Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END classifier__C=1, classifier__max_iter=256; accuracy: (test=0.557) f1: (test=0.536) precision: (test=0.558) recall: (test=0.515) roc_auc: (test=0.579) total time=   0.5s
[CV 2/2] END classifier__C=1, classifier__max_iter=256; accuracy: (test=0.555) f1: (test=0.536) precision: (test=0.555) recall: (test=0.518) roc_auc: (test=0.578) total time=   0.5s
[CV 1/2] END classifier__C=1, classifier__max_iter=512; accuracy: (test=0.557) f1: (test=0.536) precision: (test=0.558) recall: (test=0.515) roc_auc: (test=0.579) total time=   0.5s
[CV 2/2] END classifier__C=1, classifier__max_iter=512; accuracy: (test=0.555) f1: (test=0.536) precision: (test=0.555) recall: (test=0.518) roc_auc: (test=0.578) total time=   0.6s
[CV 1/2] END classifier__C=1, classifier__max_iter=1024; accuracy: (test=0.557) f1: (test=0.536) precision: (test=0.558) recall: (test=0.515) roc_auc: (test=0.579) total time=   0.5s
[CV 2/2] END classifier__C=1,

In [70]:
final_model = models[best_model_name]['model']
final_params = {f'classifier__{k.split("__")[1]}': v for k, v in best_params.items()}

best_model_pipeline = Pipeline([
    ('preprocessing', transformer),
    ('feature_selection', SelectKBest(f_classif, k=20)),
    ('classifier', final_model)
])

best_model_pipeline.set_params(**final_params)

best_model_pipeline.fit(X_train, y_train)

In [72]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

y_pred = best_model_pipeline.predict(X_test)
y_pred_prob = best_model_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f"Test Accuracy: {accuracy:.2f}")
print(f"Test Precision: {precision:.2f}")
print(f"Test Recall: {recall:.2f}")
print(f"Test F1 Score: {f1:.2f}")
print(f"Test ROC AUC: {roc_auc:.2f}")

Test Accuracy: 0.59
Test Precision: 0.58
Test Recall: 0.66
Test F1 Score: 0.62
Test ROC AUC: 0.64
