# MLflow + Optuna + GridSearch para Churn (Telco)

Este cuaderno demuestra cómo registrar experimentos en MLflow mientras se ajustan modelos como XGBoost, LightGBM y MLPClassifier usando GridSearchCV, RandomizedSearchCV y Optuna.

In [None]:
#! pip install mlflow

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import optuna

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Telco churn dataset and turn it into a fully numeric/boolean frame.
url = '../Data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = (
    pd.read_csv(url)
    .drop(columns=['customerID'])
    # Non-numeric TotalCharges entries become NaN here and are dropped next.
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
)
# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)

# 'Churn_Yes' is the target; every other column is a feature.
y = df['Churn_Yes']
X = df.drop('Churn_Yes', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first rows of the one-hot-encoded frame.
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True


## GridSearchCV con XGBoost

In [None]:
# Exhaustive search space: 2 x 2 x 2 = 8 candidates, each scored with 3-fold CV.
params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
}

xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid = GridSearchCV(estimator=xgb_base, param_grid=params, cv=3)

with mlflow.start_run(run_name='XGBoost GridSearch'):
    grid.fit(X_train, y_train)

    # Score the refit best estimator on the held-out test split.
    preds = grid.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Record the winning configuration, its test accuracy and the model itself.
    mlflow.log_params(grid.best_params_)
    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(grid.best_estimator_, 'model')

## RandomizedSearchCV con LightGBM

In [None]:
from scipy.stats import randint, uniform

# Sampling distributions for the random search.
# randint(low, high) draws integers from [low, high).
# uniform(loc, scale) draws from [loc, loc + scale], so learning_rate spans
# [0.01, 0.21] rather than [0.01, 0.2].
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2)
}

# Seed both the estimator and the sampler so the 10 sampled candidates (and
# therefore the logged run) are reproducible, matching the random_state=42
# used for the train/test split and the MLP.
rand_search = RandomizedSearchCV(
    LGBMClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
)
with mlflow.start_run(run_name='LightGBM RandomizedSearch'):
    rand_search.fit(X_train, y_train)

    # Score the refit best estimator on the held-out test split.
    preds = rand_search.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Record the winning configuration, its test accuracy and the model itself.
    mlflow.log_params(rand_search.best_params_)
    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(rand_search.best_estimator_, 'model')

## Optuna con MLPClassifier

In [None]:
from sklearn.model_selection import cross_val_score


def _build_mlp_pipeline(hidden_layer_sizes, alpha, learning_rate_init):
    """Return an unfitted StandardScaler + MLPClassifier pipeline for the given hyperparameters."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(max_iter=500, random_state=42,
                              hidden_layer_sizes=hidden_layer_sizes,
                              alpha=alpha, learning_rate_init=learning_rate_init))
    ])


def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of an MLP pipeline on the training split.

    Args:
        trial: Optuna trial used to sample ``hidden_layer_sizes``, ``alpha``
            and ``learning_rate_init``.

    Returns:
        Mean cross-validated accuracy (float) for the sampled configuration.
    """
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(64,), (128,), (64, 32)])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-2, log=True)
    learning_rate_init = trial.suggest_float('learning_rate_init', 0.001, 0.1)

    model = _build_mlp_pipeline(hidden_layer_sizes, alpha, learning_rate_init)
    # Score on the training split only. Selecting hyperparameters by their
    # accuracy on X_test would leak the test set into tuning and make the
    # reported test accuracy optimistically biased. cv=3 mirrors the
    # GridSearchCV / RandomizedSearchCV runs.
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return float(scores.mean())


study = optuna.create_study(direction='maximize')
with mlflow.start_run(run_name='MLP Optuna'):
    study.optimize(objective, n_trials=20)
    best_params = study.best_params
    for k, v in best_params.items():
        mlflow.log_param(k, v)
    # The value Optuna optimized (cross-validated accuracy on the training split).
    mlflow.log_metric('cv_accuracy', study.best_value)

    # Refit the winning configuration on the full training split and evaluate
    # once on the untouched test split, so 'accuracy' is comparable to the
    # metric logged by the XGBoost and LightGBM runs.
    final_model = _build_mlp_pipeline(
        best_params['hidden_layer_sizes'],
        best_params['alpha'],
        best_params['learning_rate_init'],
    )
    final_model.fit(X_train, y_train)
    mlflow.log_metric('accuracy', accuracy_score(y_test, final_model.predict(X_test)))

## Visualización y Comparación de Resultados

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import seaborn as sns

# Función para visualizar métricas de un modelo
def evaluar_modelo(nombre, modelo, X_test, y_test):
    """Print the ROC AUC of a fitted classifier and plot its diagnostics.

    Produces three figures: the confusion matrix, the ROC curve, and a
    histogram of the predicted probabilities split by true label.

    Args:
        nombre: Display name used in the printed line and the plot titles.
        modelo: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to evaluate on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        None. Output is printed and shown via matplotlib.
    """
    y_pred = modelo.predict(X_test)
    # Second predict_proba column; for a binary classifier this is the
    # probability of the positive class.
    y_proba = modelo.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # Fixed: the original formatted an undefined `name`, raising NameError.
    print(f"\n{nombre}: AUC = {roc_auc:.3f}")

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f"Matriz de Confusión - {nombre}")
    plt.show()

    plt.figure()
    # Fixed: the legend label was missing its closing parenthesis.
    plt.plot(fpr, tpr, label=f'{nombre} (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('Tasa de Falsos Positivos')
    plt.ylabel('Tasa de Verdaderos Positivos')
    plt.title(f'Curva ROC - {nombre}')
    plt.legend()
    plt.grid(True)
    plt.show()

    sns.histplot(y_proba, kde=True, hue=y_test, bins=30)
    plt.title(f'Distribución de probabilidades - {nombre}')
    plt.xlabel('Probabilidad predicha')
    plt.grid(True)
    plt.show()

In [None]:
# Evaluate the tuned tree-based models on the held-out test split.
for label, search in (('XGBoost', grid), ('LightGBM', rand_search)):
    evaluar_modelo(label, search.best_estimator_, X_test, y_test)

# Rebuild the final MLP from the best hyperparameters found by Optuna.
from sklearn.pipeline import Pipeline
mlp_final = MLPClassifier(
    max_iter=500,
    random_state=42,
    hidden_layer_sizes=study.best_params['hidden_layer_sizes'],
    alpha=study.best_params['alpha'],
    learning_rate_init=study.best_params['learning_rate_init'],
)
best_mlp = Pipeline([('scaler', StandardScaler()), ('mlp', mlp_final)])
best_mlp.fit(X_train, y_train)
evaluar_modelo('MLP Optuna', best_mlp, X_test, y_test)