In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import os
import joblib

In [6]:
ruta_dataset = os.path.join("..", "data", "raw", "Telco_churn.csv")
df = pd.read_csv(ruta_dataset)

In [7]:
# Eliminar filas con TotalCharges vacías y convertir a numérico
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(subset=['TotalCharges'], inplace=True)

# Convertir la variable objetivo a binaria
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Eliminar columnas que no aportan o que son IDs
df.drop(['customerID'], axis=1, inplace=True)

# Codificar variables categóricas
categorical_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Dividir en X (features) e y (target)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Dividir en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo XGBoost
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

# Predicciones
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Evaluación
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1033
           1       0.65      0.51      0.57       374

    accuracy                           0.80      1407
   macro avg       0.74      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407

Matriz de Confusión:
[[931 102]
 [184 190]]
ROC AUC Score: 0.8296897567440247


Pruebo con GridSearch y clases balanceadas para mejorarlo

In [11]:
df = pd.read_csv(ruta_dataset)

# Eliminar columnas irrelevantes y convertir target
df.drop("customerID", axis=1, inplace=True)
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# Convertir variables categóricas a dummies
df = pd.get_dummies(df)

# Separar variables predictoras y variable objetivo
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Dividir en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Calcular el ratio para balanceo de clases
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# Modelo base XGBoost
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# Parámetros para Grid Search
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Entrenar
grid_search.fit(X_train, y_train)

# Predicciones
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Resultados
print(" Best Parameters:", grid_search.best_params_)
print("\n Reporte de Clasificación:")
print(classification_report(y_test, y_pred))
print("\n Matriz de confusion:")
print(confusion_matrix(y_test, y_pred))
print("\n ROC AUC Score:", roc_auc_score(y_test, y_proba))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
 Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

 Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.91      0.69      0.79      1552
           1       0.49      0.82      0.61       561

    accuracy                           0.73      2113
   macro avg       0.70      0.76      0.70      2113
weighted avg       0.80      0.73      0.74      2113


 Matriz de confusion:
[[1075  477]
 [ 101  460]]

 ROC AUC Score: 0.8409653692779829


In [12]:
ruta_modelo = r"C:\Users\Merche\REPOSITORIO_MI_PORTFOLIO\Predictor de Churn\ML\models\modelo_XGBoost.joblib"
joblib.dump(best_model, ruta_modelo)

['C:\\Users\\Merche\\REPOSITORIO_MI_PORTFOLIO\\Predictor de Churn\\ML\\models\\modelo_XGBoost.joblib']

El modelo ha mejorado. Voy a mirar la importancia de cada variable por si consigo que mejore mas

In [None]:
# Obtener importancia de características del modelo final
importances = best_model.feature_importances_
features = X_train.columns

# Crear DataFrame ordenado
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df.head(21))

                                    Feature  Importance
35                  Contract_Month-to-month    0.540802
15              InternetService_Fiber optic    0.075994
26                           TechSupport_No    0.058568
17                        OnlineSecurity_No    0.048403
42           PaymentMethod_Electronic check    0.038236
37                        Contract_Two year    0.024415
34                      StreamingMovies_Yes    0.024098
36                        Contract_One year    0.023268
16                       InternetService_No    0.022773
38                      PaperlessBilling_No    0.016876
1                                    tenure    0.016455
2                            MonthlyCharges    0.010449
20                          OnlineBackup_No    0.010146
3                             gender_Female    0.008874
11                         MultipleLines_No    0.007884
7                             Dependents_No    0.007423
29                           StreamingTV_No    0

In [None]:
# Obtener la importancia de características en formato dict
importance_dict = xgb_model.get_booster().get_score(importance_type='weight')

# Convertir a DataFrame para ordenarlo y verlo más fácil
importance_df = pd.DataFrame(list(importance_dict.items()), columns=['Feature', 'Importance'])

# Ordenar de mayor a menor importancia
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

print(importance_df)

                                  Feature  Importance
0                          MonthlyCharges       631.0
1                            TotalCharges       398.0
2                                  tenure       346.0
3          PaymentMethod_Electronic check        89.0
4                      OnlineSecurity_Yes        78.0
5                             gender_Male        69.0
6                    PaperlessBilling_Yes        65.0
7                       Contract_Two year        50.0
8                     StreamingMovies_Yes        50.0
9                           SeniorCitizen        50.0
10                        TechSupport_Yes        50.0
11  PaymentMethod_Credit card (automatic)        47.0
12                      Contract_One year        45.0
13                       PhoneService_Yes        42.0
14                            Partner_Yes        36.0
15                         Dependents_Yes        36.0
16                        StreamingTV_Yes        33.0
17                      Mult

In [None]:
importance_dict = xgb_model.get_booster().get_score(importance_type='gain')
importance_sorted = dict(sorted(importance_dict.items(), key=lambda item: item[1], reverse=True))
importance_sorted

{'InternetService_Fiber optic': 123.69844818115234,
 'Contract_Two year': 49.460548400878906,
 'Contract_One year': 34.622589111328125,
 'InternetService_No': 15.434347152709961,
 'tenure': 9.430861473083496,
 'StreamingMovies_Yes': 6.667798042297363,
 'TechSupport_Yes': 4.163204193115234,
 'OnlineSecurity_Yes': 3.843449592590332,
 'PaymentMethod_Electronic check': 3.8424930572509766,
 'MultipleLines_Yes': 3.4845619201660156,
 'StreamingTV_Yes': 3.461530923843384,
 'PaperlessBilling_Yes': 3.3694989681243896,
 'SeniorCitizen': 3.1292738914489746,
 'PhoneService_Yes': 3.0361385345458984,
 'TotalCharges': 2.7324793338775635,
 'PaymentMethod_Mailed check': 2.655231475830078,
 'MonthlyCharges': 2.651841640472412,
 'OnlineBackup_Yes': 2.630007266998291,
 'Dependents_Yes': 2.3971357345581055,
 'gender_Male': 2.1613852977752686,
 'PaymentMethod_Credit card (automatic)': 2.056190252304077,
 'Partner_Yes': 1.9514861106872559,
 'DeviceProtection_Yes': 1.8693760633468628}

In [None]:
importance_dict = {
    'InternetService_Fiber optic': 123.69844818115234,
    'Contract_Two year': 49.460548400878906,
    'Contract_One year': 34.622589111328125,
    'InternetService_No': 15.434347152709961,
    'tenure': 9.430861473083496,
    'StreamingMovies_Yes': 6.667798042297363,
    'TechSupport_Yes': 4.163204193115234,
    'OnlineSecurity_Yes': 3.843449592590332,
    'PaymentMethod_Electronic check': 3.8424930572509766,
    'MultipleLines_Yes': 3.4845619201660156,
    'StreamingTV_Yes': 3.461530923843384,
    'PaperlessBilling_Yes': 3.3694989681243896,
    'SeniorCitizen': 3.1292738914489746,
    'PhoneService_Yes': 3.0361385345458984,
    'TotalCharges': 2.7324793338775635,
    'PaymentMethod_Mailed check': 2.655231475830078,
    'MonthlyCharges': 2.651841640472412,
    'OnlineBackup_Yes': 2.630007266998291,
    'Dependents_Yes': 2.3971357345581055,
    'gender_Male': 2.1613852977752686,
    'PaymentMethod_Credit card (automatic)': 2.056190252304077,
    'Partner_Yes': 1.9514861106872559,
    'DeviceProtection_Yes': 1.8693760633468628
}

# Suma total de las importancias
total_gain = sum(importance_dict.values())

# Convertir a porcentaje
importance_percent = {k: (v / total_gain) * 100 for k, v in importance_dict.items()}

# Ordenar de mayor a menor
importance_percent_sorted = dict(sorted(importance_percent.items(), key=lambda item: item[1], reverse=True))

for feature, percent in importance_percent_sorted.items():
    print(f"{feature}: {percent:.2f}%")

InternetService_Fiber optic: 42.84%
Contract_Two year: 17.13%
Contract_One year: 11.99%
InternetService_No: 5.35%
tenure: 3.27%
StreamingMovies_Yes: 2.31%
TechSupport_Yes: 1.44%
OnlineSecurity_Yes: 1.33%
PaymentMethod_Electronic check: 1.33%
MultipleLines_Yes: 1.21%
StreamingTV_Yes: 1.20%
PaperlessBilling_Yes: 1.17%
SeniorCitizen: 1.08%
PhoneService_Yes: 1.05%
TotalCharges: 0.95%
PaymentMethod_Mailed check: 0.92%
MonthlyCharges: 0.92%
OnlineBackup_Yes: 0.91%
Dependents_Yes: 0.83%
gender_Male: 0.75%
PaymentMethod_Credit card (automatic): 0.71%
Partner_Yes: 0.68%
DeviceProtection_Yes: 0.65%


Para ver si mejora el modelo, decido quitar las features menos relevantes para el entreno

In [None]:
# Cargar el dataset (ajusta la ruta si es necesario)
df = pd.read_csv(ruta_dataset)

# Eliminar columnas irrelevantes y convertir target
df.drop("customerID", axis=1, inplace=True)
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# Convertir variables categóricas a dummies
df = pd.get_dummies(df)

# Columnas a eliminar (baja importancia)
cols_to_drop = [
    'PaymentMethod_Credit card (automatic)',
    'Partner_Yes',
    'DeviceProtection_Yes',
    'gender_Male',
    'Dependents_Yes',
    'OnlineBackup_Yes',
    'MonthlyCharges',
    'PaymentMethod_Mailed check',
    'TotalCharges',
    'PhoneService_Yes'
]

# Eliminar esas columnas si existen en el dataframe
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

# Separar variables predictoras y variable objetivo
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Dividir en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Calcular el ratio para balanceo de clases
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# Modelo base XGBoost
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False
)

# Parámetros para Grid Search
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Entrenar
grid_search.fit(X_train, y_train)

# Predicciones
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Resultados
print("Best Parameters:", grid_search.best_params_)
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))
print("\nMatriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1552
           1       0.50      0.82      0.62       561

    accuracy                           0.73      2113
   macro avg       0.70      0.76      0.70      2113
weighted avg       0.80      0.73      0.75      2113


Matriz de Confusión:
[[1085  467]
 [ 102  459]]

ROC AUC Score: 0.8373836530863518


Ha empeorado, me quedo con la metrica del modelo anterior