In [1]:
# ---------------------------
# 1. Imports
# ---------------------------
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from catboost import CatBoostClassifier
import joblib

# ---------------------------
# 2. Custom transformer para limpiar numéricos
# ---------------------------
class NumericCleaner(BaseEstimator, TransformerMixin):
    """Coerce selected DataFrame columns to ``float64``.

    Values that cannot be parsed as numbers (for example blank or
    whitespace-only strings) become ``NaN`` so that a later step can
    impute them.

    Parameters
    ----------
    columns : sequence of str or None, default=None
        Names of the columns to clean. ``None`` cleans every column of the
        frame passed to :meth:`transform`.
    """

    def __init__(self, columns=None):
        # Stored untouched: scikit-learn's get_params()/clone() expect
        # __init__ to do nothing but assign its arguments.
        self.columns = columns

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the target columns cast to ``float64``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is copied, never modified in place.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` whose cleaned columns are ``float64`` with
            unparseable entries replaced by ``NaN``.

        Raises
        ------
        KeyError
            If any configured column is missing from ``X``.
        """
        X = X.copy()
        columns = list(X.columns) if self.columns is None else list(self.columns)

        # Report every missing column at once instead of failing on the
        # first lookup with a context-free KeyError.
        missing = [col for col in columns if col not in X.columns]
        if missing:
            raise KeyError(
                f"NumericCleaner: columns not found in input: {missing}"
            )

        for col in columns:
            # errors='coerce' already maps blanks and any other non-numeric
            # text to NaN, so no separate replace() pass is needed.
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('float64')
        return X

# ---------------------------
# 3. Load data
# ---------------------------
df = pd.read_csv('./ML/data/Telco_churn.csv')

# Target: encode churn as 1 ("Yes") / 0 ("No").
y = df["Churn"].map({"Yes": 1, "No": 0})

# Series.map() turns any label outside the mapping into NaN. Stop here with a
# clear message instead of letting NaN targets reach the split and the model.
if y.isna().any():
    bad_labels = df.loc[y.isna(), "Churn"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Churn' column: {bad_labels}")

# Drop the columns that are not used as features.
cols_drop = [
    'customerID', 'DeviceProtection', 'StreamingTV', 'gender',
    'PhoneService', 'Dependents', 'TechSupport', 'StreamingMovies',
]
df = df.drop(columns=cols_drop)

# Feature matrix: everything that is left except the target column.
X = df.drop(columns=["Churn"])

# Feature groups consumed by the preprocessing step.
# Columns to be one-hot encoded:
cat_cols = [
    'SeniorCitizen', 'Partner', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'Contract', 'PaperlessBilling',
    'PaymentMethod',
]
# Columns to be coerced to float and median-imputed:
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# ---------------------------
# 4. Train/test split
# ---------------------------
# Hold out 20% of the rows, stratified on the target, with a fixed seed so
# the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=82,
    test_size=0.2,
)

# ---------------------------
# 5. Class weights
# ---------------------------
# 'balanced' weights are computed from the training labels only; the array
# follows the sorted label order produced by np.unique.
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Label -> weight mapping in the form handed to the classifier.
class_weights = {label: class_weights_array[label] for label in (0, 1)}

# ---------------------------
# 6. Preprocessing
# ---------------------------
# Numeric branch: coerce the columns to float, then fill gaps with the median.
num_pipeline = Pipeline(steps=[
    ('cleaner', NumericCleaner(columns=num_cols)),
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: dense one-hot encoding; categories not seen during fit
# are ignored rather than raising.
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Full preprocessor: route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_encoder, cat_cols),
])

# ---------------------------
# 7. CatBoost model
# ---------------------------
# Hyperparameters are gathered in one mapping so they can be read (and
# logged) in a single place before being handed to the classifier.
catboost_params = {
    'iterations': 100,
    'depth': 4,
    'learning_rate': 0.1,
    'l2_leaf_reg': 1,
    'random_seed': 82,
    'verbose': 0,
    'class_weights': class_weights,
}
model = CatBoostClassifier(**catboost_params)

# ---------------------------
# 8. Final pipeline
# ---------------------------
# Preprocessing and the classifier are chained so both are fitted together.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', model),
])

# Train the full pipeline on the training partition.
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk.
# NOTE(review): the artifact presumably needs NumericCleaner to be importable
# wherever it is loaded, since pickling stores classes by reference - confirm.
joblib.dump(pipeline, "pipeline_churn.joblib")

# Evaluate on the held-out partition: hard labels plus the probability of
# the positive class (column 1).
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

print("✅ Pipeline entrenado y guardado correctamente.")
print("\nMatriz de Confusión:")
print(conf_matrix)
print("\nClassification Report:")
print(clf_report)
print(f"AUC-ROC: {auc_roc:.3f}")


✅ Pipeline entrenado y guardado correctamente.

Matriz de Confusión:
[[763 272]
 [ 75 299]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.74      0.81      1035
           1       0.52      0.80      0.63       374

    accuracy                           0.75      1409
   macro avg       0.72      0.77      0.72      1409
weighted avg       0.81      0.75      0.77      1409

AUC-ROC: 0.852
