# Fundamento del modelo

La Regresión Logística es un modelo estadístico supervisado lineal que predice la probabilidad de pertenecer a una clase (por ejemplo, si un cliente se va o permanece).

## Proceso aplicado al dataset

Variable objetivo:

  - exited → 1 = el cliente abandonó, 0 = permaneció.

División entrenamiento/prueba: 75 % / 25 % (estratificada por la variable objetivo)

Preprocesamiento:

  - Escalado numérico (StandardScaler)
  - Codificación de variables categóricas (OneHotEncoder)

In [1]:
# -*- coding: utf-8 -*-
# Regresión Logística para predecir `exited` (churn) sobre Churn_Modelling-ETL 2.csv

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
    precision_recall_curve,
    auc
)

In [None]:
# ---------------------
# 1) Load and prepare the data
# ---------------------
# Location of the CSV relative to the current working directory; adjust if needed.
BASE = Path("../../")
DATA = BASE / "Churn_Modelling-ETL 2.csv"

df = pd.read_csv(DATA)

target_col = "exited"
# Validate with an explicit raise: `assert` statements are removed when Python
# runs with -O, which would let a missing target slip through to later cells.
if target_col not in df.columns:
    raise ValueError("No se encontró la columna objetivo 'exited'.")

In [3]:
# Drop identifier columns that carry no predictive signal (only those present).
drop_ids = [c for c in ["row_number", "customer_id", "surname"] if c in df.columns]
X = df.drop(columns=[target_col] + drop_ids, errors="ignore")
y = df[target_col].astype(int)

# Categorical vs. numeric columns.
# Text columns are not always stored as `object`: they may use a dedicated
# pandas string dtype or `category`. All of these must be routed to the
# one-hot encoder; sending them to the scaler would fail on non-numeric data.
cat_cols = [
    c
    for c in X.columns
    if pd.api.types.is_object_dtype(X[c])
    or pd.api.types.is_string_dtype(X[c])
    or isinstance(X[c].dtype, pd.CategoricalDtype)
]
# Everything that is not categorical is treated as numeric.
num_cols = [c for c in X.columns if c not in cat_cols]


In [4]:
# ---------------------
# 2) Preprocessing + model pipeline
# ---------------------
# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level of each feature dropped.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    cat_cols,
)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

logreg = LogisticRegression(max_iter=1000, random_state=42)

# Step names "prep" and "clf" are looked up by name in later cells.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", logreg)])


In [5]:
# ---------------------
# 3) Train/test split
# ---------------------
# 75/25 hold-out, stratified on the target, with a fixed seed.
splits = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

# Fit preprocessing and classifier together on the training portion only.
pipe.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [6]:
# ---------------------
# 4) Predictions, metrics and curves
# ---------------------
# Column 1 of predict_proba is used as the score for the ROC/PR curves;
# predict() supplies the hard labels used in the report and confusion matrix.
y_score = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

# Metrics
report = classification_report(y_test, y_pred, digits=4)
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("== MÉTRICAS (holdout) – Regresión Logística ==")
print(report)
print(f"ROC-AUC: {roc_auc:.4f}")
print("Matriz de confusión:\n", cm)

# ROC curve.
# Pass an explicit Axes: without `ax`, RocCurveDisplay.from_predictions opens
# its own figure, so a preceding plt.figure() is left behind empty and never
# closed (it shows up as a stray "Figure ... with 0 Axes" in the notebook).
roc_fig, roc_ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, y_score, ax=roc_ax)
roc_ax.set_title(f"ROC – LogisticRegression (AUC={roc_auc:.3f})")
roc_fig.tight_layout()
roc_fig.savefig("logreg_roc.png", dpi=120)
plt.close(roc_fig)

# Precision–Recall curve; the area is the trapezoidal integral of the curve.
prec, rec, thr = precision_recall_curve(y_test, y_score)
pr_auc = auc(rec, prec)
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(rec, prec)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title(f"Precision–Recall – LogisticRegression (AUC={pr_auc:.3f})")
pr_fig.tight_layout()
pr_fig.savefig("logreg_precision_recall.png", dpi=120)
plt.close(pr_fig)

# Confusion matrix (simple Matplotlib heatmap)
cm_fig, cm_ax = plt.subplots()
cm_image = cm_ax.imshow(cm, interpolation="nearest")
cm_ax.set_title("Matriz de confusión – LogisticRegression")
cm_fig.colorbar(cm_image, ax=cm_ax)
ticks = np.arange(2)  # binary problem: classes 0 and 1
cm_ax.set_xticks(ticks)
cm_ax.set_xticklabels(["Pred 0", "Pred 1"])
cm_ax.set_yticks(ticks)
cm_ax.set_yticklabels(["Real 0", "Real 1"])
# Annotate every cell with its count; rows are true labels, columns predictions.
for (i, j), count in np.ndenumerate(cm):
    cm_ax.text(j, i, count, ha="center", va="center")
cm_ax.set_xlabel("Predicho")
cm_ax.set_ylabel("Real")
cm_fig.tight_layout()
cm_fig.savefig("logreg_confusion_matrix.png", dpi=120)
plt.close(cm_fig)


== MÉTRICAS (holdout) – Regresión Logística ==
              precision    recall  f1-score   support

           0     0.8441    0.9684    0.9020      1991
           1     0.7083    0.3006    0.4221       509

    accuracy                         0.8324      2500
   macro avg     0.7762    0.6345    0.6620      2500
weighted avg     0.8165    0.8324    0.8043      2500

ROC-AUC: 0.8067
Matriz de confusión:
 [[1928   63]
 [ 356  153]]


<Figure size 640x480 with 0 Axes>

In [7]:
# ---------------------
# 5) Importance (coefficients) – interpretation
# ---------------------
# Map each coefficient to the name of the transformed feature. The
# ColumnTransformer lists "num" before "cat", so names are built in that order.
ohe = pipe.named_steps["prep"].named_transformers_.get("cat")
num_names = num_cols
cat_names = []
# Skip the encoder when there are no categorical columns: in that case it was
# never fitted and cannot report output feature names.
if cat_cols and ohe is not None and hasattr(ohe, "get_feature_names_out"):
    cat_names = ohe.get_feature_names_out(cat_cols).tolist()

feature_names = num_names + cat_names

coefs = pipe.named_steps["clf"].coef_.ravel()  # positive class
# Best-effort alignment: truncate to the shorter length.
# NOTE(review): a length mismatch here would mean names and coefficients are
# misaligned; worth investigating if it ever happens.
n = min(len(coefs), len(feature_names))
coef_df = pd.DataFrame({"feature": feature_names[:n], "coef": coefs[:n]})
coef_df = coef_df.sort_values("coef", ascending=False)

# Up to 15 strongest positive and 15 strongest negative coefficients.
# Filter by sign first: with fewer than 30 features, plain head(15)/tail(15)
# overlap and each chart would mix positive and negative coefficients.
top_pos = coef_df[coef_df["coef"] > 0].head(15)
top_neg = coef_df[coef_df["coef"] < 0].tail(15).sort_values("coef")

# Horizontal bars (positive coefficients)
plt.figure(figsize=(6, 6))
plt.barh(top_pos["feature"], top_pos["coef"])
plt.gca().invert_yaxis()  # largest coefficient at the top
plt.title("Coeficientes (+) – LogisticRegression")
plt.tight_layout()
plt.savefig("logreg_top_positive_coefs.png", dpi=120)
plt.close()

# Horizontal bars (negative coefficients)
plt.figure(figsize=(6, 6))
plt.barh(top_neg["feature"], top_neg["coef"])
plt.gca().invert_yaxis()  # most negative coefficient at the top
plt.title("Coeficientes (−) – LogisticRegression")
plt.tight_layout()
plt.savefig("logreg_top_negative_coefs.png", dpi=120)
plt.close()


In [8]:
# ---------------------
# 6) (Optional) Threshold tuning
# ---------------------
# When the business prioritises recall on churners (exited=1), lower the cut-off.
custom_threshold = 0.35  # example value; tune it against the PR curve

# Label as 1 every sample whose score reaches the custom cut-off.
y_pred_thr = np.where(y_score >= custom_threshold, 1, 0)
report_thr = classification_report(y_test, y_pred_thr, digits=4)
cm_thr = confusion_matrix(y_test, y_pred_thr)

print(f"\n== Métricas con umbral personalizado = {custom_threshold:.2f} ==")
print(report_thr)
print("Matriz de confusión (umbral custom):\n", cm_thr)


== Métricas con umbral personalizado = 0.35 ==
              precision    recall  f1-score   support

           0     0.8806    0.8850    0.8828      1991
           1     0.5411    0.5305    0.5357       509

    accuracy                         0.8128      2500
   macro avg     0.7108    0.7077    0.7092      2500
weighted avg     0.8114    0.8128    0.8121      2500

Matriz de confusión (umbral custom):
 [[1762  229]
 [ 239  270]]


| Métrica                 | Valor      | Interpretación                                                          |
| ----------------------- | ---------- | ----------------------------------------------------------------------- |
| **ROC-AUC**             | ≈ **0.81** | Buena capacidad para distinguir clientes que se van vs. los que no      |
| **Accuracy**            | ≈ **0.83** | 83% de aciertos totales                                                 |
| **Precision (clase 1)** | ≈ **0.71** | 71% de los clientes predichos como “se va” realmente se van             |
| **Recall (clase 1)**    | ≈ **0.30** | Detecta solo 30% de los clientes que realmente se van (falla en recall) |
| **F1-score (weighted)** | ≈ **0.80** | Balance global de rendimiento                                           |
