In [1]:
# LIBRERIAS
from pathlib import Path

import fastapi
import joblib as jl
import pandas as pd
import uvicorn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the raw customer-churn dataset; the path is relative to the notebook's folder.
DATA_PATH = '../data/landing_customer_churn.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440833 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 40.4+ MB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0
mean,225398.667955,39.373153,31.256336,15.807494,3.604437,12.965722,631.616223,14.480868,0.567107
std,129531.91855,12.442369,17.255727,8.586242,3.070218,8.258063,240.803001,8.596208,0.495477
min,2.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,113621.75,29.0,16.0,9.0,1.0,6.0,480.0,7.0,0.0
50%,226125.5,39.0,32.0,16.0,3.0,12.0,661.0,14.0,1.0
75%,337739.25,48.0,46.0,23.0,6.0,19.0,830.0,22.0,1.0
max,449999.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [5]:
# ======================================
# 1. DATA CLEANING
# ======================================

# Keep only the rows whose target (Churn) is present; a row without a label
# cannot be used for supervised training or evaluation.
has_target = df["Churn"].notna()
df = df[has_target]

# (Optional) Report how many missing values remain in each column.
print("Valores nulos por columna:\n", df.isnull().sum())

# ======================================
# 2. DEFINE VARIABLES
# ======================================

# Features: every column except the target and the customer identifier.
non_feature_cols = ["Churn", "CustomerID"]
X = df.drop(non_feature_cols, axis="columns")

# Target: the Churn column.
y = df.loc[:, "Churn"]

# ======================================
# 3. PREPROCESSING
# ======================================

# Partition the feature columns by dtype so each group gets its own transformer.
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Numeric columns are standardised; object columns are one-hot encoded, and
# handle_unknown="ignore" keeps transform from raising on categories not seen during fit.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

# Columns matching neither dtype filter are not passed to any transformer here.
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# ======================================
# 4. MODEL
# ======================================

# Random forest with a fixed seed for reproducibility; n_jobs=-1 uses all available cores.
forest = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
)

# Bundle preprocessing and classifier so both are fitted, applied and saved together.
model = Pipeline([("preprocessor", preprocessor), ("classifier", forest)])

# ======================================
# 5. TRAIN/TEST SPLIT
# ======================================

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ======================================
# 6. TRAINING
# ======================================

# Fit the whole pipeline (preprocessing + classifier) on the training partition only.
model.fit(X=X_train, y=y_train)

# ======================================
# 7. EVALUATION
# ======================================

# Hard class predictions plus the probability of the second class (column 1),
# which ROC-AUC needs as a continuous score.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# NOTE(review): the recorded run shows ROC-AUC ~0.9998; a score this high is
# worth double-checking for target leakage in the features - TODO confirm.
evaluation_outputs = (
    ("\n🔹 Reporte de clasificación:\n", classification_report(y_test, y_pred)),
    ("\n🔹 Matriz de confusión:\n", confusion_matrix(y_test, y_pred)),
    ("\n🔹 ROC-AUC:", roc_auc_score(y_test, y_proba)),
)
for heading, value in evaluation_outputs:
    print(heading, value)

Valores nulos por columna:
 CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

🔹 Reporte de clasificación:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     38167
         1.0       1.00      0.98      0.99     50000

    accuracy                           0.99     88167
   macro avg       0.99      0.99      0.99     88167
weighted avg       0.99      0.99      0.99     88167


🔹 Matriz de confusión:
 [[38164     3]
 [  864 49136]]

🔹 ROC-AUC: 0.9998304074200225


In [6]:
# ======================================
# 8. SAVE THE TRAINED MODEL
# ======================================

# Single source of truth for the artifact location, shared by dump and load below
# so the two can never point at different files.
MODEL_PATH = Path("../models/modelo_churn.joblib")

# joblib.dump does not create missing directories; make sure the target folder
# exists so this cell also works on a fresh checkout.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (preprocessing + classifier) to disk.
jl.dump(model, MODEL_PATH)

print("✅ Modelo guardado como 'modelo_churn.joblib'")

# ======================================
# 9. LOAD THE MODEL LATER
# ======================================

# NOTE: joblib artifacts are pickle-based, so only load files from a trusted source.
modelo_cargado = jl.load(MODEL_PATH)

# Quick round-trip check (optional): the reloaded model is scored on the test set.
y_pred_test = modelo_cargado.predict(X_test)
print("\n🔹 Accuracy verificación:", (y_pred_test == y_test).mean())

✅ Modelo guardado como 'modelo_churn.joblib'

🔹 Accuracy verificación: 0.9901663887849196


In [8]:
# Load the persisted model artifact.
modelo = jl.load("../models/modelo_churn.joblib")

# Show what kind of object was stored, followed by its full repr.
print("Tipo de modelo:", type(modelo))
print(modelo)

# If the object exposes pipeline steps, list their names.
pipeline_steps = getattr(modelo, "named_steps", None)
if pipeline_steps is not None:
    print("\nPasos del pipeline:")
    for step_name in pipeline_steps:
        print("-", step_name)

# If the object records the input feature names it was fitted with, show them.
expected_features = getattr(modelo, "feature_names_in_", None)
if expected_features is not None:
    print("\nVariables que espera el modelo:")
    print(expected_features)

Tipo de modelo: <class 'sklearn.pipeline.Pipeline'>
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay',
       'Total Spend', 'Last Interaction'],
      dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Gender', 'Subscription Type', 'Contract Length'], dtype='object'))])),
                ('classifier',
                 RandomForestClassifier(max_depth=10, n_estimators=200,
                                        n_jobs=-1, random_state=42))])

Pasos del pipeline:
- preprocessor
- classifier

Variables que espera el modelo:
['Age' 'Gender' 'Tenure' 'Usage Frequency' 'Support Calls' 'Payment Delay'
 'Subscription Type' 'Contract Length' 

In [9]:
# Display the DataFrame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.00,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.00,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.00,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.00,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.00,20.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
440828,449995.0,42.0,Male,54.0,15.0,1.0,3.0,Premium,Annual,716.38,8.0,0.0
440829,449996.0,25.0,Female,8.0,13.0,1.0,20.0,Premium,Annual,745.38,2.0,0.0
440830,449997.0,26.0,Male,35.0,27.0,1.0,5.0,Standard,Quarterly,977.31,9.0,0.0
440831,449998.0,28.0,Male,55.0,14.0,2.0,0.0,Standard,Quarterly,602.55,2.0,0.0
