In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Load the preprocessed training data. The directory defaults to the original
# author's local folder but can be redirected with the CHURN_DATA_DIR
# environment variable so the notebook runs on other machines.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("CHURN_DATA_DIR", "/home/ander/Documentos/Universidad/ML/proyecto"))
df = pd.read_csv(DATA_DIR / "train_processed.csv")

def one_hot_encode_categorical(df, column):
    """One-hot encode a single column and return the new frame plus its encoder.

    Args:
        df: DataFrame that contains `column`.
        column: Name of the categorical column to expand.

    Returns:
        A tuple ``(encoded_df, encoder)`` where ``encoded_df`` is `df` without
        `column` and with the one-hot columns appended at the end, and
        ``encoder`` is the fitted ``OneHotEncoder`` (unknown categories are
        ignored at transform time).
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_column = encoder.fit_transform(df[[column]])
    # Reuse df's index: with a default RangeIndex, pd.concat(axis=1) aligns on
    # labels and would misalign rows / introduce NaNs whenever df's index is
    # not exactly 0..n-1 (e.g. after filtering or sampling).
    encoded_df = pd.DataFrame(
        encoded_column,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )
    result = pd.concat([df.drop(column, axis=1), encoded_df], axis=1)
    return result, encoder

# Columns to one-hot encode. The fitted encoder of each column is stored in
# `encoders` so the identical mapping can be applied to other data later.
categorical_columns = ['Geography']
df = df.copy()

encoders = {}
for column in categorical_columns:
    df, encoder = one_hot_encode_categorical(df, column)
    encoders[column] = encoder

# Binary-encode Gender; any value other than 'Male'/'Female' becomes NaN.
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Target is the "Exited" column; every other column is used as a feature.
y = df["Exited"]
X = df.drop("Exited", axis=1)

# Hold out 20% of the rows for testing, keeping the class ratio (stratified)
# and a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

OPTUNA

In [None]:
import optuna
from optuna import create_study
from optuna.samplers import TPESampler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler

# Standardize features using statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def _suggest_sampler(trial):
    """Let the trial pick the resampling strategy for the fitting data.

    Returns a SMOTE or ADASYN instance (seeded with 42), or None when the
    trial selects 'None' (no resampling).
    """
    choice = trial.suggest_categorical('sampler', ['None', 'SMOTE', 'ADASYN'])
    if choice == 'SMOTE':
        return SMOTE(random_state=42)
    if choice == 'ADASYN':
        return ADASYN(random_state=42)
    return None


def _validation_auc(model, sampler, scale=False):
    """Fit `model` on part of the training split and return ROC AUC on the rest.

    Hyper-parameters must not be selected on X_test/y_test: doing so leaks the
    test set into model selection and makes the final test metrics
    optimistically biased. A fixed, stratified validation fold is therefore
    carved out of X_train/y_train and the test split is left untouched.

    Args:
        model: Unfitted classifier exposing ``fit`` and ``predict_proba``.
        sampler: Resampler with ``fit_resample`` applied to the fitting fold
            only, or None for no resampling.
        scale: When True, standardize features with a scaler fitted on the
            fitting fold only (before resampling).

    Returns:
        ROC AUC of the positive-class probabilities on the validation fold.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    if scale:
        fold_scaler = StandardScaler()
        X_fit = fold_scaler.fit_transform(X_fit)
        X_val = fold_scaler.transform(X_val)
    if sampler is not None:
        # Only the fitting fold is resampled; validation keeps its real class ratio.
        X_fit, y_fit = sampler.fit_resample(X_fit, y_fit)
    model.fit(X_fit, y_fit)
    return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])


def objective_rf(trial):
    """Optuna objective: validation ROC AUC of a RandomForestClassifier."""
    sampler = _suggest_sampler(trial)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    model = RandomForestClassifier(**params, random_state=42)
    return _validation_auc(model, sampler)


def objective_gb(trial):
    """Optuna objective: validation ROC AUC of a GradientBoostingClassifier."""
    sampler = _suggest_sampler(trial)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0)
    }
    model = GradientBoostingClassifier(**params, random_state=42)
    return _validation_auc(model, sampler)


def objective_xgb(trial):
    """Optuna objective: validation ROC AUC of an XGBClassifier."""
    sampler = _suggest_sampler(trial)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    return _validation_auc(model, sampler)


def objective_lgb(trial):
    """Optuna objective: validation ROC AUC of an LGBMClassifier."""
    sampler = _suggest_sampler(trial)
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` > 0 — confirm whether row bagging is actually intended.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2)
    }
    model = LGBMClassifier(**params, random_state=42)
    return _validation_auc(model, sampler)


def objective_lr(trial):
    """Optuna objective: validation ROC AUC of a LogisticRegression.

    Features are standardized (scaler fitted on the fitting fold) before any
    resampling, since this is the only scale-sensitive model in the search.
    """
    sampler = _suggest_sampler(trial)
    params = {
        'C': trial.suggest_float('C', 0.01, 10.0),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }
    model = LogisticRegression(**params, random_state=42, max_iter=2000)
    return _validation_auc(model, sampler, scale=True)


def objective_et(trial):
    """Optuna objective: validation ROC AUC of an ExtraTreesClassifier."""
    sampler = _suggest_sampler(trial)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    model = ExtraTreesClassifier(**params, random_state=42)
    return _validation_auc(model, sampler)



In [None]:
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeClassifier


def _ensemble_validation_auc(model):
    """Fit `model` on part of the training split and return ROC AUC on the rest.

    The score used for hyper-parameter selection must not come from
    X_test/y_test, otherwise the test set leaks into model selection and the
    final test metrics are optimistically biased. A fixed, stratified
    validation fold is carved out of X_train/y_train instead.

    Args:
        model: Unfitted classifier exposing ``fit`` and ``predict_proba``.

    Returns:
        ROC AUC of the positive-class probabilities on the validation fold.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    model.fit(X_fit, y_fit)
    return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])


def objective_easyensemble(trial):
    """Optuna objective: validation ROC AUC of an EasyEnsembleClassifier."""
    model = EasyEnsembleClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        random_state=42, n_jobs=8
    )
    return _ensemble_validation_auc(model)


def objective_rusboost(trial):
    """Optuna objective: validation ROC AUC of a RUSBoostClassifier.

    `max_depth` is a parameter of the base DecisionTreeClassifier, not of the
    booster itself.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    model = RUSBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        estimator=DecisionTreeClassifier(max_depth=max_depth, random_state=42),
        random_state=42
    )
    return _ensemble_validation_auc(model)


def objective_balancedbagging(trial):
    """Optuna objective: validation ROC AUC of a BalancedBaggingClassifier.

    `max_depth` is a parameter of the base DecisionTreeClassifier; the other
    suggested values configure the bagging ensemble.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_samples = trial.suggest_float('max_samples', 0.5, 1.0)
    max_features = trial.suggest_float('max_features', 0.5, 1.0)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    model = BalancedBaggingClassifier(
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        estimator=DecisionTreeClassifier(max_depth=max_depth, random_state=42),
        random_state=42, n_jobs=8
    )
    return _ensemble_validation_auc(model)


def objective_balancedrf(trial):
    """Optuna objective: validation ROC AUC of a BalancedRandomForestClassifier."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    model = BalancedRandomForestClassifier(**params, random_state=42, n_jobs=8)
    return _ensemble_validation_auc(model)

In [None]:
# Optuna search for EasyEnsembleClassifier: 250 trials, seeded TPE sampler,
# maximizing the value returned by the objective.
study_easyensemble = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_easyensemble.optimize(objective_easyensemble, n_trials=250)
print("EasyEnsembleClassifier best params:", study_easyensemble.best_params)

In [None]:
# Optuna search for RUSBoostClassifier: 250 trials, seeded TPE sampler,
# maximizing the value returned by the objective.
study_rusboost = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_rusboost.optimize(objective_rusboost, n_trials=250)
print("RUSBoostClassifier best params:", study_rusboost.best_params)

In [None]:
# Optuna search for BalancedBaggingClassifier: 250 trials, seeded TPE sampler,
# maximizing the value returned by the objective.
study_balancedbagging = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_balancedbagging.optimize(objective_balancedbagging, n_trials=250)
print("BalancedBaggingClassifier best params:", study_balancedbagging.best_params)


In [None]:

# Optuna search for BalancedRandomForestClassifier: 250 trials, seeded TPE
# sampler, maximizing the value returned by the objective.
study_balancedrf = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_balancedrf.optimize(objective_balancedrf, n_trials=250)
print("BalancedRandomForestClassifier best params:", study_balancedrf.best_params)

In [None]:

# Each study runs in its own cell. RandomForest: 100 trials, seeded TPE
# sampler, maximizing the value returned by the objective.
study_rf = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_rf.optimize(objective_rf, n_trials=100)
print("RandomForest:", study_rf.best_params)



In [None]:
# GradientBoosting: 100 trials, seeded TPE sampler, maximizing the objective.
study_gb = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_gb.optimize(objective_gb, n_trials=100)
print("GradientBoosting:", study_gb.best_params)

In [None]:

# XGBoost: 100 trials, seeded TPE sampler, maximizing the objective.
study_xgb = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_xgb.optimize(objective_xgb, n_trials=100)
print("XGBoost:", study_xgb.best_params)


In [None]:
# LightGBM: 500 trials (more than the other studies), seeded TPE sampler,
# maximizing the objective.
study_lgb = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_lgb.optimize(objective_lgb, n_trials=500)
print("LightGBM:", study_lgb.best_params)



In [None]:
# LogisticRegression: 100 trials, seeded TPE sampler, maximizing the objective.
study_lr = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_lr.optimize(objective_lr, n_trials=100)
print("LogisticRegression:", study_lr.best_params)


In [None]:
# ExtraTrees: 100 trials, seeded TPE sampler, maximizing the objective.
study_et = create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study_et.optimize(objective_et, n_trials=100)
print("ExtraTrees:", study_et.best_params)

(IMBLEARN_EEC)

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Refit EasyEnsembleClassifier with the hard-coded best parameters.
best_eec = EasyEnsembleClassifier(random_state=42, n_estimators=59)
best_eec.fit(X_train, y_train)

# Positive-class probabilities feed the AUC; hard labels feed the other metrics.
y_pred_proba = best_eec.predict_proba(X_test)[:, 1]
y_pred = best_eec.predict(X_test)

# Evaluate on the test split.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(IMBLEARN_RUSB)

In [None]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Refit RUSBoostClassifier with the hard-coded n_estimators / learning_rate.
# NOTE(review): objective_rusboost also tuned `max_depth` of a
# DecisionTreeClassifier base estimator, but no `estimator` is passed here, so
# the classifier uses its own default base learner — confirm the tuned depth
# and pass estimator=DecisionTreeClassifier(max_depth=..., random_state=42).
best_rus = RUSBoostClassifier(
    random_state=42,
    learning_rate=0.03169408226050428,
    n_estimators=295,
)
best_rus.fit(X_train, y_train)

# Positive-class probabilities feed the AUC; hard labels feed the other metrics.
y_pred_proba = best_rus.predict_proba(X_test)[:, 1]
y_pred = best_rus.predict(X_test)

# Evaluate on the test split.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(IMBLEARN_BBC)

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Refit BalancedBaggingClassifier with the hard-coded best ensemble parameters.
# NOTE(review): max_depth belongs to the base DecisionTreeClassifier and was
# tuned in objective_balancedbagging, but no `estimator` is passed here, so the
# classifier uses its own default base learner — confirm the tuned depth and
# pass estimator=DecisionTreeClassifier(max_depth=..., random_state=42).
best_bbc = BalancedBaggingClassifier(
    random_state=42,
    max_features=0.8868546463344018,
    max_samples=0.7683980382109316,
    n_estimators=293,
)
best_bbc.fit(X_train, y_train)

# Positive-class probabilities feed the AUC; hard labels feed the other metrics.
y_pred_proba = best_bbc.predict_proba(X_test)[:, 1]
y_pred = best_bbc.predict(X_test)

# Evaluate on the test split.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(IMBLEARN_BRFC)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Refit BalancedRandomForestClassifier with the hard-coded best parameters.
brf_params = dict(
    n_estimators=181,
    max_depth=7,
    min_samples_split=6,
    min_samples_leaf=9,
    max_features=None,
    random_state=42,
)
best_brf = BalancedRandomForestClassifier(**brf_params)
best_brf.fit(X_train, y_train)

# Positive-class probabilities feed the AUC; hard labels feed the other metrics.
y_pred_proba = best_brf.predict_proba(X_test)[:, 1]
y_pred = best_brf.predict(X_test)

# Evaluate on the test split.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Build the RandomForest with its hard-coded hyper-parameters and fit it on
# the training split (no resampling).
rf_params = dict(
    n_estimators=136,
    max_depth=10,
    min_samples_split=6,
    min_samples_leaf=5,
    max_features='log2',
    bootstrap=False,
    random_state=42,
)
best_rf = RandomForestClassifier(**rf_params)
best_rf.fit(X_train, y_train)

# Score the test split: probabilities for AUC, hard labels for the rest.
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_GB)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# 1. Pipeline that applies SMOTE during fit and then trains the
#    GradientBoostingClassifier with its hard-coded hyper-parameters.
gb_classifier = GradientBoostingClassifier(
    n_estimators=261,
    learning_rate=0.03643803195448914,
    max_depth=5,
    min_samples_split=8,
    min_samples_leaf=6,
    subsample=0.7970512036903703,
    random_state=42,
)
pipeline_gb = Pipeline(steps=[
    ('sampler', SMOTE(random_state=42)),
    ('classifier', gb_classifier),
])

# 2. Fit the whole pipeline on the training split.
pipeline_gb.fit(X_train, y_train)

# 3. Score the original test split through the pipeline.
y_pred_proba = pipeline_gb.predict_proba(X_test)[:, 1]
y_pred = pipeline_gb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_XGB)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Build the XGBClassifier with its hard-coded hyper-parameters and fit it on
# the training split (no resampling).
xgb_best_params = dict(
    n_estimators=108,
    learning_rate=0.09830372052041464,
    max_depth=10,
    min_child_weight=2,
    subsample=0.7373717489359468,
    colsample_bytree=0.7679078375986028,
    gamma=4.519428792014974,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
best_xgb = XGBClassifier(**xgb_best_params)
best_xgb.fit(X_train, y_train)

# Score the test split: probabilities for AUC, hard labels for the rest.
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]
y_pred = best_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_LGBM)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

# 1. Pipeline that applies ADASYN during fit and then trains the
#    LGBMClassifier with its hard-coded hyper-parameters.
lgbm_classifier = LGBMClassifier(
    n_estimators=157,
    learning_rate=0.05320213755207298,
    max_depth=6,
    min_child_samples=17,
    subsample=0.9669128821218812,
    colsample_bytree=0.7484949223442707,
    reg_alpha=0.6488953630452698,
    reg_lambda=0.8732792139771435,
    random_state=42,
)
pipeline_lgbm = Pipeline(steps=[
    ('sampler', ADASYN(random_state=42)),
    ('classifier', lgbm_classifier),
])

# 2. Fit the whole pipeline on the training split.
pipeline_lgbm.fit(X_train, y_train)

# 3. Score the original test split through the pipeline.
y_pred_proba = pipeline_lgbm.predict_proba(X_test)[:, 1]
y_pred = pipeline_lgbm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_LR)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

from sklearn.preprocessing import StandardScaler

# 1. Pipeline: standardize -> ADASYN -> logistic regression.
#    objective_lr tuned these hyper-parameters on standardized features with
#    max_iter=2000, so the final model must be trained the same way; fitting
#    on raw features with the default iteration budget would evaluate a
#    different model from the one that was selected.
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('sampler', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(
        C=9.913728258413286,
        penalty='l1',
        solver='liblinear',
        class_weight=None,
        random_state=42,
        max_iter=2000
    ))
])

# 2. Fit the whole pipeline; the scaler learns its statistics from X_train only.
pipeline_lr.fit(X_train, y_train)

# 3. Score the original test split (the pipeline scales it internally and
#    does not resample at prediction time).
y_pred = pipeline_lr.predict(X_test)
y_pred_proba = pipeline_lr.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
# Sensitivity and specificity fall back to 0 when their denominator is empty.
sensibilidad = tp / (tp + fn) if (tp + fn) > 0 else 0
especificidad = tn / (tn + fp) if (tn + fp) > 0 else 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

(OPTUNA_ExTr)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

# Build the ExtraTreesClassifier with its hard-coded hyper-parameters and fit
# it on the training split (no resampling).
et_params = dict(
    n_estimators=375,
    max_depth=14,
    min_samples_split=15,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42,
)
best_et = ExtraTreesClassifier(**et_params)
best_et.fit(X_train, y_train)

# Score the test split: probabilities for AUC, hard labels for the rest.
y_pred_proba = best_et.predict_proba(X_test)[:, 1]
y_pred = best_et.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

# Flatten the confusion matrix into its four counts.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity and specificity fall back to 0 when their denominator is empty.
if (tp + fn) > 0:
    sensibilidad = tp / (tp + fn)
else:
    sensibilidad = 0
if (tn + fp) > 0:
    especificidad = tn / (tn + fp)
else:
    especificidad = 0
# G-mean: geometric mean of sensitivity and specificity.
gmean = (sensibilidad * especificidad) ** 0.5

print(f"Sensibilidad: {sensibilidad:.4f}")
print(f"Especificidad: {especificidad:.4f}")
print(f"G-mean: {gmean:.4f}")

### Submissions

In [None]:
# Build the submission file from the competition test set.
import os
from pathlib import Path

# The data directory defaults to the original author's local folder but can be
# redirected with the CHURN_DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("CHURN_DATA_DIR", "/home/ander/Documentos/Universidad/ML/proyecto"))
# Single source of truth for the output name, so the confirmation message
# below always reports the file that was actually written.
SUBMISSION_CSV = "brf_250.csv"

test = pd.read_csv(DATA_DIR / "test.csv")
test1 = test.drop(columns=["Surname", "id", "CustomerId"])

# Re-apply the encoders fitted on the training data (transform only, no refit).
test_encoded = test1.copy()
for column in categorical_columns:
    one = encoders[column]
    encoded = one.transform(test_encoded[[column]])
    encoded_df = pd.DataFrame(encoded, columns=one.get_feature_names_out([column]), index=test_encoded.index)
    test_encoded = pd.concat([test_encoded.drop(column, axis=1), encoded_df], axis=1)

# Gender column: same binary mapping used for the training data.
test_encoded['Gender'] = test_encoded['Gender'].map({'Male': 0, 'Female': 1})

# Keep exactly the training feature columns, in the training order.
test_encoded = test_encoded[X_train.columns]

# Predict with the balanced random forest; only the probabilities are submitted.
test_pred = best_brf.predict(test_encoded)
test_pred_proba = best_brf.predict_proba(test_encoded)[:, 1]

# Assemble the submission: one probability of "Exited" per test id.
submit = pd.DataFrame({
    "id": test["id"],
    "Exited": test_pred_proba
})

# Write the file and report the name it was saved under.
submit.to_csv(SUBMISSION_CSV, index=False)
print(f"Archivo de submit guardado como {SUBMISSION_CSV}")


In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_all_roc_with_xgb_variants(X_train, X_test, y_train, y_test):
    """Plot ROC curves for the notebook's models plus three XGBoost variants.

    The ten models defined in earlier cells are read from the notebook
    namespace. Three extra XGBoost models sharing one parameter set are
    trained here: without resampling, with SMOTE and with ADASYN. A curve
    stored in 'roc_modelo.csv' is overlaid when that file can be read.

    Args:
        X_train, y_train: Data used to fit the XGBoost variants (and any model
            in the dictionary that does not expose `classes_` yet).
        X_test, y_test: Data every curve is computed on.
    """
    from xgboost import XGBClassifier
    import pandas as pd

    fitted_models = {
        'RandomForest': best_rf,
        'GradientBoosting': pipeline_gb,
        'XGBoost (Optuna)': best_xgb,
        'LightGBM': pipeline_lgbm,
        'LogisticRegression': pipeline_lr,
        'ExtraTrees': best_et,
        'EasyEnsemble': best_eec,
        'RUSBoost': best_rus,
        'BalancedBagging': best_bbc,
        'BalancedRF': best_brf
    }

    # Shared parameter set for the three XGBoost variants.
    xgb_params = dict(
        n_estimators=200,
        learning_rate=0.1107,
        max_depth=9,
        min_child_weight=10,
        subsample=0.8266,
        colsample_bytree=0.7236,
        gamma=4.514,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    # Train one XGBoost per resampling strategy (None = train on the data as is)
    # and keep the positive-class probabilities on the test data.
    variant_specs = [
        ('XGBoost (N)', None),
        ('XGBoost (P - SMOTE)', SMOTE(random_state=42)),
        ('XGBoost (P - ADASYN)', ADASYN(random_state=42)),
    ]
    variant_scores = []
    for variant_label, resampler in variant_specs:
        if resampler is None:
            X_fit, y_fit = X_train, y_train
        else:
            X_fit, y_fit = resampler.fit_resample(X_train, y_train)
        booster = XGBClassifier(**xgb_params)
        booster.fit(X_fit, y_fit)
        variant_scores.append((variant_label, booster.predict_proba(X_test)[:, 1]))

    plt.figure(figsize=(12, 9))

    # Curves for the models from earlier cells; a failure in one model is
    # reported and does not stop the others from being drawn.
    for label, estimator in fitted_models.items():
        try:
            # Fit only when the model has no `classes_` attribute yet.
            needs_fit = hasattr(estimator, 'fit') and not hasattr(estimator, 'classes_')
            if needs_fit:
                estimator.fit(X_train, y_train)
            if hasattr(estimator, 'predict_proba'):
                scores = estimator.predict_proba(X_test)[:, 1]
            elif hasattr(estimator, 'decision_function'):
                # No probabilities: min-max scale the decision scores instead.
                from sklearn.preprocessing import MinMaxScaler
                raw_scores = estimator.decision_function(X_test).reshape(-1, 1)
                scores = MinMaxScaler().fit_transform(raw_scores).flatten()
            else:
                scores = estimator.predict(X_test)
            fpr, tpr, _ = roc_curve(y_test, scores)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'{label} (AUC = {roc_auc:.5f})')
        except Exception as err:
            print(f'No se pudo graficar {label}: {err}')

    # Overlay the LightGBM(TPOT) curve stored in a CSV file, if it can be read.
    try:
        tpot_curve = pd.read_csv('roc_modelo.csv')
        tpot_fpr = tpot_curve['fpr']
        tpot_tpr = tpot_curve['tpr']
        # Use the stored AUC when the file has one; otherwise compute it.
        if 'auc' in tpot_curve.columns:
            tpot_auc = tpot_curve['auc'].iloc[0]
        else:
            tpot_auc = auc(tpot_fpr, tpot_tpr)
        plt.plot(tpot_fpr, tpot_tpr, lw=2, color='orange', linestyle='-.', label=f'LightGBM(TPOT) (AUC = {tpot_auc:.5f})')
    except Exception as err:
        print(f'No se pudo cargar roc_modelo.csv: {err}')

    # Dashed curves for the three XGBoost variants.
    for variant_label, scores in variant_scores:
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, linestyle='--', label=f'{variant_label} (AUC = {roc_auc:.5f})')

    # Diagonal reference line, then axes, labels and legend.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle=':')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves de los Modelos Optimizados y XGBoost variantes')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

# Draw every ROC curve, including the XGBoost variants, using the notebook's train/test split.
plot_all_roc_with_xgb_variants(X_train, X_test, y_train, y_test)