In [1]:
# -*- coding: utf-8 -*-
# Árbol de Decisión para predecir `exited` (churn) sobre Churn_Modelling-ETL 2.csv

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
    precision_recall_curve,
    auc,
)

# ---------------------
# 1) Data loading and preparation
# ---------------------
BASE = Path("../../")              # adjust to your project layout if needed
DATA = BASE / "Churn_Modelling-ETL 2.csv"

df = pd.read_csv(DATA)

target_col = "exited"
assert target_col in df.columns, "No se encontró la columna objetivo 'exited'."

# Drop identifier columns, but only the ones actually present in the file.
known_id_cols = ("row_number", "customer_id", "surname")
drop_ids = [col for col in known_id_cols if col in df.columns]
X = df.drop(columns=[target_col, *drop_ids], errors="ignore")
y = df[target_col].astype(int)

# Split the features by dtype: object columns are treated as categorical,
# every other column as numeric (column order is preserved in both lists).
is_object_col = X.dtypes == "object"
cat_cols = X.columns[is_object_col].tolist()
num_cols = X.columns[~is_object_col].tolist()

# ---------------------
# 2) Preprocessing + model (Pipeline)
# ---------------------
# Note: a decision tree does NOT require scaled inputs; the scaler is kept so the
# same pipeline structure is ready to be reused with other models.
numeric_step = ("num", StandardScaler(), num_cols)  # (optional for a tree)
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    cat_cols,
)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Initial hyperparameters with light regularisation to limit overfitting.
tree_params = {
    "criterion": "gini",
    "max_depth": 6,            # caps the depth of the tree
    "min_samples_split": 50,   # samples required before a node may be split
    "min_samples_leaf": 25,    # samples that every leaf must keep
    "random_state": 42,
}
tree = DecisionTreeClassifier(**tree_params)

pipe = Pipeline(steps=[("prep", preprocess), ("clf", tree)])

# ---------------------
# 3) Train/Test split
# ---------------------
# Stratified hold-out split (25% test) with a fixed seed.
split_options = {"test_size": 0.25, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the preprocessing steps and the classifier on the training portion.
pipe.fit(X_train, y_train)

# ---------------------
# 4) Prediction and metrics (hold-out)
# ---------------------
# The tree's predict_proba is not calibrated the way an ensemble's is, but the
# leaf probability can still be used as a score.
y_pred = pipe.predict(X_test)
positive_col = 1
y_score = pipe.predict_proba(X_test)[:, positive_col]

cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
report = classification_report(y_test, y_pred, digits=4)

for line in (
    "== MÉTRICAS (holdout) – Decision Tree ==",
    report,
    f"ROC-AUC: {roc_auc:.4f}",
):
    print(line)
print("Matriz de confusión:\n", cm)

# ---------------------
# 5) Evaluation plots (Matplotlib)
# ---------------------
# Every plot owns an explicit Figure/Axes pair and is closed by handle, so no
# empty figure is left open for the notebook backend to render after the cell.

# ROC
# The axes are passed explicitly: when `ax` is not given, from_predictions
# creates its own figure, and a figure opened beforehand would stay empty.
fig_roc, ax_roc = plt.subplots()
RocCurveDisplay.from_predictions(y_test, y_score, ax=ax_roc)
ax_roc.set_title(f"ROC – DecisionTree (AUC={roc_auc:.3f})")
fig_roc.tight_layout()
fig_roc.savefig("tree_roc.png", dpi=120)
plt.close(fig_roc)

# Precision–Recall
# The thresholds array is not used; it gets a private name so it does not share
# a name with the scalar decision threshold defined later in this cell.
prec, rec, _pr_thresholds = precision_recall_curve(y_test, y_score)
pr_auc = auc(rec, prec)
fig_pr, ax_pr = plt.subplots()
ax_pr.plot(rec, prec)
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title(f"Precision–Recall – DecisionTree (AUC={pr_auc:.3f})")
fig_pr.tight_layout()
fig_pr.savefig("tree_precision_recall.png", dpi=120)
plt.close(fig_pr)

# Confusion matrix
fig_cm, ax_cm = plt.subplots()
heat = ax_cm.imshow(cm, interpolation="nearest")
ax_cm.set_title("Matriz de confusión – DecisionTree")
fig_cm.colorbar(heat, ax=ax_cm)
ticks = np.arange(2)
ax_cm.set_xticks(ticks)
ax_cm.set_xticklabels(["Pred 0", "Pred 1"])
ax_cm.set_yticks(ticks)
ax_cm.set_yticklabels(["Real 0", "Real 1"])
# Write each count in its cell; imshow puts columns on x and rows on y.
for (i, j), count in np.ndenumerate(cm):
    ax_cm.text(j, i, count, ha="center", va="center")
ax_cm.set_xlabel("Predicho")
ax_cm.set_ylabel("Real")
fig_cm.tight_layout()
fig_cm.savefig("tree_confusion_matrix.png", dpi=120)
plt.close(fig_cm)

# ---------------------
# 6) Feature importance (interpretability)
# ---------------------
# Rebuild the post-one-hot feature names so they can be paired with the tree's
# importances: numeric columns first, then the encoded categorical columns.
ohe = pipe.named_steps["prep"].named_transformers_.get("cat")
num_names = list(num_cols)
cat_names = []
# Only query the encoder when it actually received columns: with an empty
# column selection ColumnTransformer keeps the encoder unfitted, and calling
# get_feature_names_out on it would raise.
if cat_cols and ohe is not None and hasattr(ohe, "get_feature_names_out"):
    cat_names = ohe.get_feature_names_out(cat_cols).tolist()
feature_names = num_names + cat_names

importances = pipe.named_steps["clf"].feature_importances_
# Defensive alignment: pair names and importances up to the shorter length.
n = min(len(importances), len(feature_names))
imp_df = (
    pd.DataFrame({"feature": feature_names[:n], "importance": importances[:n]})
    .sort_values("importance", ascending=False)
    .head(20)
)

# Horizontal bar chart, most important feature at the top.
fig_imp, ax_imp = plt.subplots(figsize=(7, 6))
ax_imp.barh(imp_df["feature"], imp_df["importance"])
ax_imp.invert_yaxis()
ax_imp.set_xlabel("Importancia")
ax_imp.set_title("Top 20 features – DecisionTree")
fig_imp.tight_layout()
fig_imp.savefig("tree_feature_importance.png", dpi=120)
plt.close(fig_imp)

# ---------------------
# 7) (Optional) Visualise the tree structure
# ---------------------
# Careful: with many columns after one-hot encoding the plot can get large.
fig_tree, ax_tree = plt.subplots(figsize=(16, 9))
plot_tree(
    pipe.named_steps["clf"],
    max_depth=3,  # sample view: only the first 3 levels are drawn
    feature_names=feature_names,
    class_names=["stay(0)", "churn(1)"],
    filled=True,
    rounded=True,
    proportion=True,
    ax=ax_tree,
)
fig_tree.tight_layout()
fig_tree.savefig("tree_structure_top3.png", dpi=120)
plt.close(fig_tree)

# ---------------------
# 8) (Optional) Threshold adjustment
# ---------------------
thr = 0.40  # example: move the cut-off to gain recall on the churn class
y_pred_thr = np.where(y_score >= thr, 1, 0)

report_thr = classification_report(y_test, y_pred_thr, digits=4)
cm_thr = confusion_matrix(y_test, y_pred_thr)

print(f"\n== Métricas con umbral personalizado = {thr:.2f} ==")
print(report_thr)
print("Matriz de confusión (umbral custom):\n", cm_thr)


== MÉTRICAS (holdout) – Decision Tree ==
              precision    recall  f1-score   support

           0     0.8759    0.9674    0.9193      1991
           1     0.7841    0.4637    0.5827       509

    accuracy                         0.8648      2500
   macro avg     0.8300    0.7155    0.7510      2500
weighted avg     0.8572    0.8648    0.8508      2500

ROC-AUC: 0.8496
Matriz de confusión:
 [[1926   65]
 [ 273  236]]

== Métricas con umbral personalizado = 0.40 ==
              precision    recall  f1-score   support

           0     0.8828    0.9568    0.9183      1991
           1     0.7485    0.5029    0.6016       509

    accuracy                         0.8644      2500
   macro avg     0.8156    0.7299    0.7600      2500
weighted avg     0.8554    0.8644    0.8538      2500

Matriz de confusión (umbral custom):
 [[1905   86]
 [ 253  256]]


<Figure size 640x480 with 0 Axes>

In [None]:
# %%
import nbformat as nbf
from pathlib import Path

# Destination of the generated notebook.
base = Path("/mnt/data")
nb_path = base / "01_logistic_regression_churn.ipynb"

nb = nbf.v4.new_notebook()

# Opening markdown cell: title and overview of what the generated notebook does.
intro_markdown = """
# Churn – Regresión Logística (Notebook 01)

Este notebook entrena y evalúa **Regresión Logística** para predecir la variable binaria **`exited`** sobre el dataset procesado `Churn_Modelling-ETL 2.csv`.

Incluye:
- Pipeline de preprocesamiento (OneHot + StandardScaler)
- Train/Test Split (estratificado)
- Métricas de **clasificación**: Accuracy, Precision, Recall, F1, ROC-AUC, Matriz de Confusión
- Gráficas con **Seaborn**: matriz de confusión, curva ROC, curva Precision–Recall y barras de métricas
"""

# First code cell of the generated notebook: imports, seaborn style and the
# BASE / DATA paths.
setup_source = """
# Config y librerías
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve
)

sns.set(style="whitegrid")
BASE = Path("/mnt/data")
DATA = BASE / "Churn_Modelling-ETL 2.csv"
"""

# Cells are collected in order; later statements append to this list.
cells = [
    nbf.v4.new_markdown_cell(intro_markdown),
    nbf.v4.new_code_cell(setup_source),
]

# Data-loading cell of the generated notebook: reads the CSV, normalises the
# column names, drops identifier columns and splits features by dtype.
# The generated source ends with display(...) instead of a bare tuple, so the
# feature preview and the class balance are rendered as separate rich outputs
# rather than as the plain-text repr of a tuple. The in-source comment on the
# renaming line states what the code does (strip + replace spaces), since it
# does not lower-case names.
cells.append(nbf.v4.new_code_cell("""
# Carga y preparación
df = pd.read_csv(DATA)
df.columns = [c.strip().replace(" ", "_") for c in df.columns]  # quita espacios sobrantes; espacios internos -> "_"

target_col = "exited" if "exited" in df.columns else "Exited"
assert target_col in df.columns, "No se encontró la columna objetivo ('exited' o 'Exited')."

drop_ids = [c for c in ["row_number", "customer_id", "surname", "RowNumber", "CustomerId", "Surname"] if c in df.columns]
X = df.drop(columns=[target_col] + drop_ids, errors="ignore")
y = df[target_col].astype(int)

cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if c not in cat_cols]

display(X.head(), y.value_counts(normalize=True))
"""))

# Source of the cell that assembles the preprocessing + logistic-regression pipeline.
pipeline_source = """
# Pipeline: preprocesamiento + modelo
preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
])

logreg = LogisticRegression(max_iter=1000, random_state=42)

pipe = Pipeline([
    ("prep", preprocess),
    ("clf", logreg),
])
"""

# Source of the cell that splits the data, fits the pipeline and reports metrics.
# The doubled backslash keeps a literal "\n" escape inside the generated code.
training_source = """
# Train/Test split y entrenamiento
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

pipe.fit(X_train, y_train)

# Predicciones
y_prob = pipe.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.50).astype(int)

# Métricas
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec  = recall_score(y_test, y_pred, zero_division=0)
f1   = f1_score(y_test, y_pred, zero_division=0)
auc  = roc_auc_score(y_test, y_prob)
cm   = confusion_matrix(y_test, y_pred)

print(f"Accuracy     : {acc:.4f}")
print(f"Precision(1) : {prec:.4f}")
print(f"Recall(1)    : {rec:.4f}")
print(f"F1(1)        : {f1:.4f}")
print(f"ROC-AUC      : {auc:.4f}")
print("Matriz de confusión:\\n", cm)

metrics_df = pd.DataFrame({
    "Métrica": ["Accuracy","Precision","Recall","F1","ROC-AUC"],
    "Valor":   [acc, prec, rec, f1, auc]
})
metrics_df
"""

# Source of the cell that draws the confusion matrix as a seaborn heatmap.
confusion_plot_source = """
# Gráfica: Matriz de Confusión (Seaborn)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Pred 0","Pred 1"], yticklabels=["Real 0","Real 1"])
plt.title("Matriz de Confusión – Logistic Regression")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.tight_layout()
plt.show()
"""

# Append the three code cells in notebook order.
cells.extend(
    nbf.v4.new_code_cell(source)
    for source in (pipeline_source, training_source, confusion_plot_source)
)

# ROC-curve cell of the generated notebook.
# sns.lineplot aggregates y over repeated x values by default (mean plus an
# error band). A ROC curve has repeated FPR values on its vertical segments, so
# the default would average them away; estimator=None draws every point and
# sort=False keeps the points in the order roc_curve returned them.
cells.append(nbf.v4.new_code_cell("""
# Gráfica: Curva ROC (Seaborn/Matplotlib)
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(6,4))
sns.lineplot(x=fpr, y=tpr, estimator=None, sort=False)
sns.lineplot(x=[0,1], y=[0,1], linestyle="--")
plt.title(f"Curva ROC (AUC={auc:.3f}) – Logistic Regression")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.tight_layout()
plt.show()
"""))

# Precision–Recall cell of the generated notebook.
# Recall values repeat along the curve, and sns.lineplot would by default
# average the precisions that share a recall value and add an error band.
# estimator=None plots every point; sort=False keeps the order returned by
# precision_recall_curve.
cells.append(nbf.v4.new_code_cell("""
# Gráfica: Curva Precision–Recall (Seaborn)
precisions, recalls, _ = precision_recall_curve(y_test, y_prob)
plt.figure(figsize=(6,4))
sns.lineplot(x=recalls, y=precisions, estimator=None, sort=False)
plt.title("Curva Precision–Recall – Logistic Regression")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.tight_layout()
plt.show()
"""))

# Source of the cell that plots the metrics table as a bar chart.
metrics_bar_source = """
# Gráfica: Barras de métricas (Seaborn)
plt.figure(figsize=(6,4))
sns.barplot(data=metrics_df, x="Métrica", y="Valor")
plt.ylim(0,1)
plt.title("Métricas de Clasificación – Logistic Regression")
plt.tight_layout()
plt.show()
"""

# Source of the optional cell that re-scores the test set at a custom threshold
# and shows the resulting metrics and confusion matrix.
threshold_source = """
# (Opcional) Ajuste de umbral para priorizar Recall
thr = 0.35  # ejemplo
y_pred_thr = (y_prob >= thr).astype(int)

acc2  = accuracy_score(y_test, y_pred_thr)
prec2 = precision_score(y_test, y_pred_thr, zero_division=0)
rec2  = recall_score(y_test, y_pred_thr, zero_division=0)
f12   = f1_score(y_test, y_pred_thr, zero_division=0)
cm2   = confusion_matrix(y_test, y_pred_thr)

display(pd.DataFrame({
    "Métrica": ["Accuracy","Precision","Recall","F1"],
    "Valor":   [acc2, prec2, rec2, f12]
}))

plt.figure(figsize=(5,4))
sns.heatmap(cm2, annot=True, fmt="d", cmap="Oranges",
            xticklabels=["Pred 0","Pred 1"], yticklabels=["Real 0","Real 1"])
plt.title(f"Matriz de Confusión – Umbral {thr:.2f}")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.tight_layout()
plt.show()
"""

# Append both code cells in notebook order.
cells.extend(
    nbf.v4.new_code_cell(source)
    for source in (metrics_bar_source, threshold_source)
)

# Attach the collected cells and write the notebook to disk.
nb['cells'] = cells
# Create the target directory first so a missing folder does not make the
# write fail with FileNotFoundError.
nb_path.parent.mkdir(parents=True, exist_ok=True)
nbf.write(nb, nb_path)
# Last expression of the cell: shows the path of the written notebook.
str(nb_path)
