# 1. Importando bibliotecas

In [9]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# para encoding
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
# para cálculo del VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# para balanceo de clases
from imblearn.over_sampling import SMOTE
# para el entrenamiento del modelo
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# para validación
from sklearn.model_selection import cross_val_score ,train_test_split, StratifiedKFold
from imblearn.pipeline import Pipeline
# para evaluar métricas
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import PrecisionRecallDisplay, average_precision_score
from sklearn.metrics import classification_report
# para optimización de hiperparámetros
from sklearn.model_selection import GridSearchCV

# 2. Carga de datos

In [4]:
# Mount Google Drive inside the Colab runtime so files under it can be read
# through the regular filesystem.
from google.colab import drive

punto_montaje = '/content/drive'
drive.mount(punto_montaje)

Mounted at /content/drive


In [5]:
# Read the dataset CSV from the mounted Drive folder into `df`.
ruta_dataset = (
    '/content/drive/MyDrive/ONE/HACKATHON/DATASET/'
    'DF_FEATURE_ENGINNERING/dataset_feature_enginnering_final.csv'
)
df = pd.read_csv(ruta_dataset)

In [11]:
# Preview five random rows. The fixed random_state makes the preview
# reproducible: the same rows are shown on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,antiguedad,plan,metodo_pago,tipo_contrato,frecuencia_uso,cambios_plan,facturas_impagas,tickets_soporte,canal_adquisicion,friccion_del_servicio,engagement_score,valor_plan_num,ratio_valor_uso,riesgo_financiero,cliente_problematico,early_churn_risk,premium_mensual,churn
3208,26,premium,tarjeta_credito,mensual,8,0,0,5,call_center,0.555556,0.471667,14.99,1.665556,0.0,0,0,1,0
496,2,basico,transferencia_bancaria,mensual,15,2,1,0,call_center,0.0,0.385,4.99,0.311875,4.99,0,1,0,1
3187,53,basico,efectivo,anual,9,0,3,3,call_center,0.3,0.5525,4.99,0.499,14.97,0,0,0,1
1465,6,estandar,tarjeta_debito,mensual,15,1,2,10,call_center,0.625,0.455,9.99,0.624375,19.98,0,0,0,0
1829,25,premium,tarjeta_debito,mensual,29,2,2,10,referido,0.333333,0.629167,14.99,0.499667,29.98,0,0,1,0


In [12]:
# Print a structural summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4251 entries, 0 to 4250
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   antiguedad             4251 non-null   int64  
 1   plan                   4251 non-null   object 
 2   metodo_pago            4251 non-null   object 
 3   tipo_contrato          4251 non-null   object 
 4   frecuencia_uso         4251 non-null   int64  
 5   cambios_plan           4251 non-null   int64  
 6   facturas_impagas       4251 non-null   int64  
 7   tickets_soporte        4251 non-null   int64  
 8   canal_adquisicion      4251 non-null   object 
 9   friccion_del_servicio  4251 non-null   float64
 10  engagement_score       4251 non-null   float64
 11  valor_plan_num         4251 non-null   float64
 12  ratio_valor_uso        4251 non-null   float64
 13  riesgo_financiero      4251 non-null   float64
 14  cliente_problematico   4251 non-null   int64  
 15  earl

In [14]:
# Relative frequency of each churn class (normalize=True returns shares, not counts).
proporcion_churn = df['churn'].value_counts(normalize=True)
proporcion_churn

Unnamed: 0_level_0,proportion
churn,Unnamed: 1_level_1
0,0.746177
1,0.253823


# 3. Selección de variables explicativas y variable objetivo

In [6]:
# Separate the target column from the explanatory columns.
columna_objetivo = 'churn'
y = df[columna_objetivo]
X = df.drop(columna_objetivo, axis=1)

# Keep the original feature names; the encoding step passes them to
# get_feature_names_out().
columnas = X.columns

# 4. Encoding

In [13]:
# One-hot encode the four categorical columns and pass every other column
# through unchanged. drop='first' omits the first category of each feature,
# and sparse_threshold=0 makes the transformer always return a dense array.
preprocesador = make_column_transformer((OneHotEncoder(drop='first'),
                                         ['plan', 'metodo_pago', 'tipo_contrato', 'canal_adquisicion']),
                                        remainder='passthrough',
                                        sparse_threshold = 0)

# Fit on the untouched feature columns of `df` rather than on `X`: `X` is
# rebound to the encoded array on this very line, so reading it as the input
# would make the cell fail when executed a second time (string column
# selectors only work on DataFrames).
X = preprocesador.fit_transform(df[columnas])

df_codificado = pd.DataFrame(X, columns= preprocesador.get_feature_names_out(columnas))

label_encoder = LabelEncoder()

# Encode the target straight from the source column so that re-running the
# cell always starts from the same input.
y = label_encoder.fit_transform(df['churn'])

df_codificado['churn'] = y

print(df_codificado)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df_codificado.info()

      onehotencoder__plan_estandar  onehotencoder__plan_premium  \
0                              1.0                          0.0   
1                              1.0                          0.0   
2                              0.0                          1.0   
3                              1.0                          0.0   
4                              1.0                          0.0   
...                            ...                          ...   
4246                           0.0                          1.0   
4247                           1.0                          0.0   
4248                           0.0                          0.0   
4249                           1.0                          0.0   
4250                           1.0                          0.0   

      onehotencoder__metodo_pago_tarjeta_credito  \
0                                            0.0   
1                                            0.0   
2                                      