# EDA

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [27]:
data = pd.read_csv('../data/dataset.csv', delimiter=';', decimal=',')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 100 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rev_Mean          99643 non-null   float64
 1   mou_Mean          99643 non-null   float64
 2   totmrc_Mean       99643 non-null   float64
 3   da_Mean           99643 non-null   float64
 4   ovrmou_Mean       99643 non-null   float64
 5   ovrrev_Mean       99643 non-null   float64
 6   vceovr_Mean       99643 non-null   float64
 7   datovr_Mean       99643 non-null   float64
 8   roam_Mean         99643 non-null   float64
 9   change_mou        99109 non-null   float64
 10  change_rev        99109 non-null   float64
 11  drop_vce_Mean     100000 non-null  float64
 12  drop_dat_Mean     100000 non-null  float64
 13  blck_vce_Mean     100000 non-null  float64
 14  blck_dat_Mean     100000 non-null  float64
 15  unan_vce_Mean     100000 non-null  float64
 16  unan_dat_Mean     10

In [52]:
data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,239975,21925,225,2475,0,0,0,0,0,-15725,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,574925,48275,37425,2475,2275,91,91,0,0,53225,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,1699,1025,1699,0,0,0,0,0,0,-425,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38,75,38,0,0,0,0,0,0,-15,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,5523,5705,7198,0,0,0,0,0,0,385,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


In [28]:
def correct_missings(df, miss_pct_th = 33, threshold_num = 0.1, threshold_chi=0.05):
    
    missings_pct = (df.isnull().sum()/len(df)) * 100
    
    # Eliminamos directamente las columnas con un % de missing superior al 33%
    df = df.drop(columns = missings_pct[missings_pct > miss_pct_th].index)
    
    # Las columnas con missings entre 0 y 33 las dividiremos en 2 grupos, numéricas y categóricas:
    
    columns_missings = missings_pct[(missings_pct < miss_pct_th) & (missings_pct > 0)].index.tolist()
    df_missings = df[columns_missings + ['churn']]
    
    df_num_missings = df_missings.select_dtypes(include=[np.number])
    df_cat_missings = df_missings.select_dtypes(include=[object])
    
    # Numéricas
    print("Las columnas numéricas con nulos son las siguientes:\n", df_num_missings.index )
    
    corr_with_churn = df_num_missings.corrwith(df_num_missings['churn'])
    
    cols_to_keep = corr_with_churn[abs(corr_with_churn) >= threshold_num].index.tolist()
    
    print("Se van a imputar con la mediana las siguientes columnas:\n", cols_to_keep)

    
    if 'churn' not in cols_to_keep:
        cols_to_keep.append('churn')
        
    #print(cols_to_keep)

    for col in cols_to_keep:
        if col != 'churn':
            median = df_num_missings[col].median()
            df[col] = df[col].fillna(median)
    
    cols_to_drop = corr_with_churn[abs(corr_with_churn) < threshold_num].index.tolist()
    
    print("Se van a dropear las siguientes por baja correlación con la columna churn:\n", cols_to_drop)
    
    
    
    df = df.drop(columns = cols_to_drop)
    
    # Categóricas
    
    print("Las columnas numéricas con nulos son las siguientes:\n", df_cat_missings.index )

    
    if 'churn' not in df_cat_missings.columns:
        df_cat_missings['churn'] = df['churn']
        
    def chi2_test(cols, target):
        cont_table = pd.crosstab(cols, target)
        res = chi2_contingency(cont_table)
        return res.pvalue
    
    chi2_res = df_cat_missings.apply(lambda x: chi2_test(x, df['churn'])).sort_values()
    
    #print(chi2_res)
    
    cols_to_keep = chi2_res[chi2_res <= threshold_chi].index.tolist()
    
    print("Se van a imputar con la moda las siguientes columnas:\n", cols_to_keep)
    
    if 'churn' not in cols_to_keep:
        cols_to_keep.append('churn')
    
    for col in cols_to_keep:
        if col != 'churn':
            mode = df_cat_missings[col].mode()[0]
            df[col] = df[col].fillna(mode)
            
    cols_to_drop = chi2_res[chi2_res > threshold_chi].index.tolist()
    
    print("Se van a dropear las siguientes por baja correlación con la columna churn:\n", cols_to_drop)
    
    df = df.drop(columns = cols_to_drop)
    
    return df
    

Eliminamos directamente las columnas que tienen unos valores de missing mayores al 33%

In [29]:
data = correct_missings(data)

data.to_csv('../data/data_clean.csv', index=False)

Las columnas numéricas con nulos son las siguientes:
 RangeIndex(start=0, stop=100000, step=1)
Se van a imputar con la mediana las siguientes columnas:
 ['hnd_price', 'eqpdays', 'churn']
Se van a dropear las siguientes por baja correlación con la columna churn:
 ['rev_Mean', 'mou_Mean', 'totmrc_Mean', 'da_Mean', 'ovrmou_Mean', 'ovrrev_Mean', 'vceovr_Mean', 'datovr_Mean', 'roam_Mean', 'change_mou', 'change_rev', 'avg6mou', 'avg6qty', 'avg6rev', 'phones', 'models', 'truck', 'rv', 'lor', 'adults', 'income', 'forgntvl']
Las columnas numéricas con nulos son las siguientes:
 RangeIndex(start=0, stop=100000, step=1)
Se van a imputar con la moda las siguientes columnas:
 ['churn', 'hnd_webcap', 'ethnic', 'dualband', 'area', 'refurb_new', 'marital', 'prizm_social_one', 'creditcd', 'infobase', 'dwlltype', 'kid0_2']
Se van a dropear las siguientes por baja correlación con la columna churn:
 ['kid3_5', 'kid16_17', 'kid11_15', 'kid6_10']


# Modelos

## Train-Test Split

In [52]:
from sklearn.model_selection import train_test_split, cross_val_score

X = data[data.columns.difference(['churn'])]
y = data['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77,stratify=y)

## One-Hot Encoding

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


OHE = OneHotEncoder()
scaler = StandardScaler()
RFC = RandomForestClassifier(random_state=77)

cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(include = ['int64', 'float64']).columns

transformer = ColumnTransformer([('cat', OHE, cat_cols), ('num', scaler, num_cols)])

pipe = Pipeline([("preprocessing", transformer), ("classifier", RFC)])

pipe.fit(X_train,y_train)

cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Scores: {cv_scores}")

print(f"Mean Cross-Validation Score: {cv_scores.mean():.2f}")

Traceback (most recent call last):
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 211, in _get_response_values
    y_pred = prediction_method(X)
             ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 60

Cross-Validation Scores: [    nan 0.5709  0.51095     nan     nan]
Mean Cross-Validation Score: nan


Traceback (most recent call last):
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 211, in _get_response_values
    y_pred = prediction_method(X)
             ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ignacio.correcher\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 60