In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [119]:
# Load the cleaned transactions, truncate InvoiceDate to midnight (date-only
# resolution) and keep only the rows that carry a CustomerID.
df = (
    pd.read_csv('../data/online_retail_cleaned_2009-2011.csv')
    .assign(InvoiceDate=lambda t: pd.to_datetime(t["InvoiceDate"]).dt.normalize())
    .dropna(subset=['CustomerID'])
)
print(df.shape)
df.head(3)

(824364, 10)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom,False,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.0


In [120]:
# Remove rows whose Description is one of these non-product entries.
descripciones_excluidas = ['Manual', 'Discount', 'This is a test product.']
df = df.loc[~df['Description'].isin(descripciones_excluidas)]
df.shape

(823059, 10)

In [125]:
# Split by the sign of Quantity: positive rows are purchases, negative rows are
# returns. Rows with Quantity == 0 end up in neither frame.
df_compras = df.loc[df['Quantity'].gt(0)].copy()
df_devoluciones = df.loc[df['Quantity'].lt(0)].copy()

In [153]:
# Total returned quantity (a negative number) per customer / day / product.
df_devs_agg = (
    df_devoluciones
    .groupby(['CustomerID', 'InvoiceDate', 'StockCode', 'Description'], as_index=False)['Quantity']
    .sum()
    .rename(columns={'Quantity': 'Devoluciones'})
)
df_devs_agg.head(20)

Unnamed: 0,CustomerID,InvoiceDate,StockCode,Description,Devoluciones
0,12346.0,2010-01-26,ADJUST,Adjustment by john on 26/01/2010 17,-1
1,12346.0,2011-01-18,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215
2,12349.0,2009-12-04,20747,PICCADILLY TEA SET,-1
3,12349.0,2009-12-04,20914,SET/5 RED SPOTTY LID GLASS BOWLS,-1
4,12349.0,2009-12-04,21231,SWEETHEART CERAMIC TRINKET BOX,-1
5,12349.0,2009-12-04,21232,STRAWBERRY CERAMIC TRINKET BOX,-1
6,12349.0,2009-12-04,22072,TEA CUP AND SAUCER RETRO SPOT,-1
7,12352.0,2011-03-22,21914,BLUE HARMONICA IN BOX,-12
8,12352.0,2011-03-22,22413,METAL SIGN TAKE IT OR LEAVE IT,-6
9,12352.0,2011-03-22,22645,CERAMIC HEART FAIRY CAKE MONEY BANK,-12


In [140]:
# Attach the same-day returned quantity to every purchase line; purchase lines
# without a matching return get Devoluciones = 0.
# NOTE(review): if a customer has several purchase lines for the same product on
# the same day, each of them receives the full aggregated return — confirm that
# this duplication is acceptable downstream.
df_net = pd.merge(
    df_compras,
    df_devs_agg,
    how='left',
    on=['CustomerID', 'InvoiceDate', 'StockCode', 'Description'],
).fillna({'Devoluciones': 0})
df_net

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice,Devoluciones
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom,False,83.40,0.0
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.00,0.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.00,0.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom,False,100.80,0.0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom,False,30.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...
804882,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France,False,12.60,0.0
804883,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France,False,16.60,0.0
804884,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France,False,16.60,0.0
804885,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France,False,14.85,0.0


In [146]:
# Net each purchase line against its same-day return.
#   ajuste         = units of the purchase cancelled by the return
#                    (never more than the purchased quantity; 0 when no return)
#   Quantity_net   = purchased units that remain after the adjustment
#   qty_devolucion = return quantity left over once the purchase is fully
#                    cancelled (<= 0)
# Vectorised with NumPy instead of a row-wise DataFrame.apply(axis=1): the
# values are identical, but no Python lambda runs once per row.
ajuste = df_net[['Quantity', 'Devoluciones']].copy()
ajuste['ajuste'] = np.where(
    ajuste['Devoluciones'] < 0,
    np.minimum(ajuste['Quantity'], -ajuste['Devoluciones']),
    0,
)
df_net['Quantity_net'] = df_net['Quantity'] - ajuste['ajuste']
df_net['qty_devolucion'] = df_net['Devoluciones'] + ajuste['ajuste']
df_net

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice,Devoluciones,Quantity_net,qty_devolucion
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom,False,83.40,0.0,12.0,0.0
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.00,0.0,12.0,0.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,False,81.00,0.0,12.0,0.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom,False,100.80,0.0,48.0,0.0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom,False,30.00,0.0,24.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
804882,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France,False,12.60,0.0,6.0,0.0
804883,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France,False,16.60,0.0,4.0,0.0
804884,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France,False,16.60,0.0,4.0,0.0
804885,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France,False,14.85,0.0,3.0,0.0


In [152]:
# Spot-check: every netted purchase line of customer 16321.
df_net.loc[df_net['CustomerID'].eq(16321)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice,Devoluciones,Quantity_net,qty_devolucion
178,489450,22087,PAPER BUNTING WHITE LACE,12,2009-12-01,2.95,16321.0,Australia,False,35.4,-12.0,0.0,0.0
179,489450,85206A,CREAM FELT EASTER EGG BASKET,6,2009-12-01,1.65,16321.0,Australia,False,9.9,-6.0,0.0,0.0
180,489450,21895,POTTING SHED SOW 'N' GROW SET,4,2009-12-01,4.25,16321.0,Australia,False,17.0,-4.0,0.0,0.0
181,489450,21896,POTTING SHED TWINE,6,2009-12-01,2.1,16321.0,Australia,False,12.6,-6.0,0.0,0.0
182,489450,22083,PAPER CHAIN KIT RETRO SPOT,12,2009-12-01,2.95,16321.0,Australia,False,35.4,-12.0,0.0,0.0
183,489450,21871,SAVE THE PLANET MUG,12,2009-12-01,1.25,16321.0,Australia,False,15.0,-12.0,0.0,0.0
184,489450,84946,ANTIQUE SILVER TEA GLASS ETCHED,12,2009-12-01,1.25,16321.0,Australia,False,15.0,-12.0,0.0,0.0
185,489450,84970S,HANGING HEART ZINC T-LIGHT HOLDER,24,2009-12-01,0.85,16321.0,Australia,False,20.4,-24.0,0.0,0.0
186,489450,22090,PAPER BUNTING RETRO SPOTS,12,2009-12-01,2.95,16321.0,Australia,False,35.4,-12.0,0.0,0.0
98303,502275,85206A,CREAM FELT EASTER EGG BASKET,12,2010-03-23,1.65,16321.0,Australia,False,19.8,0.0,12.0,0.0


In [136]:
# Spot-check: every purchase line of customer 12349.
df_net.loc[df_net['CustomerID'].eq(12349)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice,Devoluciones
133431,506394,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2010-04-29,1.65,12349.0,Italy,False,19.80,
133432,506394,22555,PLASTERS IN TIN STRONGMAN,12,2010-04-29,1.65,12349.0,Italy,False,19.80,
133433,506394,22553,PLASTERS IN TIN SKULLS,12,2010-04-29,1.65,12349.0,Italy,False,19.80,
133434,506394,16156S,WRAP PINK FAIRY CAKES,25,2010-04-29,0.42,12349.0,Italy,False,10.50,
133435,506394,22052,VINTAGE CARAVAN GIFT WRAP,25,2010-04-29,0.42,12349.0,Italy,False,10.50,
...,...,...,...,...,...,...,...,...,...,...,...
765210,577609,22064,PINK DOUGHNUT TRINKET POT,12,2011-11-21,1.65,12349.0,Italy,False,19.80,
765211,577609,37448,CERAMIC CAKE DESIGN SPOTTED MUG,12,2011-11-21,1.49,12349.0,Italy,False,17.88,
765212,577609,37500,TEA TIME TEAPOT IN GIFT BOX,12,2011-11-21,4.95,12349.0,Italy,False,59.40,
765213,577609,48185,DOORMAT FAIRY CAKE,4,2011-11-21,8.25,12349.0,Italy,False,33.00,


In [None]:
# Net quantity (purchases plus negative returns) per customer / day / product.
# NOTE: this rebinds df_compras, which an earlier cell used for the
# purchases-only frame.
df_compras = (
    df
    .groupby(['CustomerID', 'InvoiceDate', 'StockCode', 'Description'], as_index=False)['Quantity']
    .sum()
)
df_compras.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761674 entries, 0 to 761673
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   CustomerID   761674 non-null  float64       
 1   InvoiceDate  761674 non-null  datetime64[ns]
 2   StockCode    761674 non-null  object        
 3   Description  761674 non-null  object        
 4   Quantity     761674 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 29.1+ MB


In [None]:
# Keep only the customer / day / product groups whose net quantity is exactly
# zero and flag them with Anular = 1.
# .assign builds a new frame, so the flag is never written onto a filtered
# slice of the previous df_compras (which triggers SettingWithCopyWarning).
df_compras = df_compras.loc[df_compras['Quantity'] == 0].assign(Anular=1)
df_compras

In [None]:
# Left-join the flagged groups back to the transaction lines, matching on
# Quantity as well as on the grouping keys, then inspect customer 16321.
df_compras_2 = pd.merge(
    df_compras,
    df,
    how='left',
    on=['CustomerID', 'InvoiceDate', 'StockCode', 'Description', 'Quantity'],
)
df_compras_2.loc[df_compras_2['CustomerID'].eq(16321)]

In [None]:
# Left-join every transaction line to the flagged groups (transactions on the
# left this time), then inspect customer 16321. Non-key columns present in both
# frames receive pandas' default _x / _y suffixes.
df_compras_2 = pd.merge(
    df,
    df_compras,
    how='left',
    on=['CustomerID', 'InvoiceDate', 'StockCode', 'Description'],
)
df_compras_2.loc[df_compras_2['CustomerID'].eq(16321)]

In [None]:
# Column dtypes and non-null counts of the merged frame.
df_compras_2.info()

In [None]:
# Rows of the merged frame that have no InvoiceNo.
df_compras_2.loc[df_compras_2['InvoiceNo'].isna()]

In [None]:
# Spot-check customer 16321 in the merged frame (12347 is another id that was
# looked at here).
df_compras_2.loc[df_compras_2['CustomerID'].eq(16321)]

In [None]:
# How often each Description appears among the negative-quantity rows.
df.loc[df['Quantity'] < 0, 'Description'].value_counts()

In [158]:
# Spot-check: first 30 transaction lines of customer 12352 on 2011-03-22.
filtro_cliente_dia = (df['CustomerID'] == 12352) & (df['InvoiceDate'] == '2011-03-22')
df.loc[filtro_cliente_dia].head(30)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,IsCancelled,TotalPrice
651909,C547388,84050,PINK HEART SHAPE EGG FRYING PAN,-12,2011-03-22,1.65,12352.0,Norway,True,-19.8
651910,C547388,37448,CERAMIC CAKE DESIGN SPOTTED MUG,-12,2011-03-22,1.49,12352.0,Norway,True,-17.88
651911,C547388,22784,LANTERN CREAM GAZEBO,-3,2011-03-22,4.95,12352.0,Norway,True,-14.85
651912,C547388,22701,PINK DOG BOWL,-6,2011-03-22,2.95,12352.0,Norway,True,-17.7
651913,C547388,22645,CERAMIC HEART FAIRY CAKE MONEY BANK,-12,2011-03-22,1.45,12352.0,Norway,True,-17.4
651914,C547388,22413,METAL SIGN TAKE IT OR LEAVE IT,-6,2011-03-22,2.95,12352.0,Norway,True,-17.7
651915,C547388,21914,BLUE HARMONICA IN BOX,-12,2011-03-22,1.25,12352.0,Norway,True,-15.0
651953,547390,POST,POSTAGE,1,2011-03-22,40.0,12352.0,Norway,False,40.0
651954,547390,84050,PINK HEART SHAPE EGG FRYING PAN,12,2011-03-22,1.65,12352.0,Norway,False,19.8
651955,547390,37448,CERAMIC CAKE DESIGN SPOTTED MUG,12,2011-03-22,1.49,12352.0,Norway,False,17.88


In [None]:
# Net quantity per product description, showing the 20 largest totals.
df_grup_products = (
    df
    .groupby('Description', as_index=False)['Quantity']
    .sum()
    .rename(columns={'Quantity': 'Total_Quantity'})
)
df_grup_products.sort_values(by='Total_Quantity', ascending=False).head(20)

In [None]:
# Churn windows: build one feature row per (customer, observation window).
#
# The timeline is cut into consecutive observation windows of `meses_obs`
# months. Each one is followed by a churn window of `meses_churn` months, and a
# customer seen in the observation window is labelled churn = 1 when they have
# no transaction at all in that churn window.
#
# Fixes relative to the previous version of this cell:
#   * total_compras / total_devoluciones count distinct invoices, not distinct
#     quantity values.
#   * productos_unicos_comprados / productos_unicos_devoluciones count products
#     among purchase lines / return lines respectively, instead of both
#     counting every product the customer touched.
#   * Customers are iterated with a single groupby instead of re-filtering the
#     window once per customer, and churn membership uses a set.
meses_obs = 3      # observation window length, in months
meses_churn = 3    # length of the follow-up window used to label churn, in months
obs_ini = df['InvoiceDate'].min()
fecha_max = df['InvoiceDate'].max()  # loop-invariant, so computed once

ventanas = []
ventana_id = 1

while True:
    # Both windows are inclusive on each end; the churn window starts the day
    # after the observation window closes.
    obs_end = obs_ini + pd.DateOffset(months=meses_obs) - pd.DateOffset(days=1)
    churn_ini = obs_end + pd.DateOffset(days=1)
    churn_end = churn_ini + pd.DateOffset(months=meses_churn) - pd.DateOffset(days=1)

    # Stop as soon as the churn window would extend past the last date on record.
    if churn_end > fecha_max:
        break

    df_obs = df[(df['InvoiceDate'] >= obs_ini) & (df['InvoiceDate'] <= obs_end)]
    df_churn = df[(df['InvoiceDate'] >= churn_ini) & (df['InvoiceDate'] <= churn_end)]

    # NOTE(review): any transaction in the churn window counts as activity,
    # including a return-only invoice — confirm that is the intended label.
    customer_churn = set(df_churn['CustomerID'].dropna().unique())

    # groupby skips missing CustomerIDs; sort=False keeps the customers in
    # order of first appearance within the window.
    for customer, df_cliente in df_obs.groupby('CustomerID', sort=False):
        compras = df_cliente[df_cliente['Quantity'] > 0]
        devoluciones = df_cliente[df_cliente['Quantity'] < 0]

        # Purchases
        total_compras = compras['InvoiceNo'].nunique()  # distinct purchase invoices
        total_productos_comprados = compras['Quantity'].sum()
        total_gasto = (compras['UnitPrice'] * compras['Quantity']).sum()
        promedio_monto_factura = total_gasto / total_compras if total_compras > 0 else 0
        promedio_monto_producto = total_gasto / total_productos_comprados if total_productos_comprados > 0 else 0
        productos_unicos_comprados = compras['StockCode'].nunique()

        # Returns (quantities are negated so the features are non-negative)
        total_devoluciones = devoluciones['InvoiceNo'].nunique()  # distinct return invoices
        total_productos_devoluciones = (-devoluciones['Quantity']).sum()
        total_gasto_devoluciones = (devoluciones['UnitPrice'] * -devoluciones['Quantity']).sum()
        productos_unicos_devoluciones = devoluciones['StockCode'].nunique()

        # Most frequent calendar month among the customer's transaction lines
        mes_frecuente = df_cliente['InvoiceDate'].dt.month.mode()

        # Standard deviation of the signed line amount (purchases positive,
        # returns negative); NaN for a single line, replaced by 0 below.
        total_monto_compras_std = (df_cliente['UnitPrice'] * df_cliente['Quantity']).std()

        # Mean gap, in whole days, between the customer's distinct transaction
        # dates (return dates included); NaN when there is only one date.
        fechas = df_cliente['InvoiceDate'].drop_duplicates().sort_values()
        if len(fechas) > 1:
            diffs = fechas.diff().dropna()
            promedio_dias_entre_compras = diffs.mean().days
        else:
            promedio_dias_entre_compras = np.nan

        # Churn label: 1 when the customer is absent from the churn window
        churn = 0 if customer in customer_churn else 1

        ventanas.append({
            'CustomerID': customer,
            'ventana_id': ventana_id,
            'total_compras': total_compras,
            'total_productos_comprados': total_productos_comprados,
            'total_gasto': total_gasto,
            'productos_unicos_comprados': productos_unicos_comprados,
            'promedio_monto_factura': promedio_monto_factura,
            'promedio_monto_producto': promedio_monto_producto,
            'total_devoluciones': total_devoluciones,
            'total_productos_devoluciones': total_productos_devoluciones,
            'total_gasto_devoluciones': total_gasto_devoluciones,
            'productos_unicos_devoluciones': productos_unicos_devoluciones,
            'total_monto_compras_std': total_monto_compras_std if not pd.isna(total_monto_compras_std) else 0,
            'promedio_dias_entre_compras': promedio_dias_entre_compras,
            'mes_frecuente': mes_frecuente.iloc[0] if not mes_frecuente.empty else np.nan,
            'churn': churn
        })

    # The next observation window starts where this one ended.
    obs_ini = obs_ini + pd.DateOffset(months=meses_obs)
    ventana_id += 1

# One row per (customer, window); the most frequent month is one-hot encoded.
df_ventanas = pd.DataFrame(ventanas)
df_ventanas = pd.get_dummies(df_ventanas, columns=['mes_frecuente'], prefix='mes', dtype=int)
df_ventanas



In [None]:
# Number of customer rows in each window.
df_ventanas['ventana_id'].value_counts()

In [None]:
# Column dtypes and non-null counts of the feature table.
df_ventanas.info()

In [None]:
# Correlation matrix over every numeric column of the feature table
# (features, identifiers and the churn label alike).
df_corr = df_ventanas.select_dtypes(include='number')
corr_matrix = df_corr.corr()

# Annotated heatmap, colour scale centred on zero.
plt.figure(figsize=(12, 8))
ax_corr = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=0.5,
)
ax_corr.set_title("Matriz de correlación entre variables", fontsize=16)
plt.setp(ax_corr.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Numeric columns only (churn included), with churn cast to int so it can be
# used as the hue variable.
df_numericas = (
    df_ventanas
    .select_dtypes(include='number')
    .astype({'churn': int})
)

# Lower-triangle pairplot coloured by churn, with KDEs on the diagonal.
sns.pairplot(
    df_numericas,
    hue='churn',
    corner=True,
    diag_kind='kde',
    plot_kws={'alpha': 0.5},
)
plt.suptitle("Pairplot de todas las variables numéricas", y=1.02)
plt.show()

In [None]:
# Histogram of every numeric column except ventana_id, over all windows.
df_hist = (
    df_ventanas
    .select_dtypes(include='number')
    .drop(columns=['ventana_id'], errors='ignore')
)

# Grid layout: three plots per row, with as many rows as needed.
num_cols = df_hist.shape[1]
num_rows = (num_cols + 2) // 3

plt.figure(figsize=(15, num_rows * 4))

for posicion, col in enumerate(df_hist.columns, start=1):
    ax_hist = plt.subplot(num_rows, 3, posicion)
    ax_hist.hist(df_hist[col].dropna(), bins=30, color='skyblue', edgecolor='black')
    ax_hist.set_title(f'Distribución de {col}')
    ax_hist.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Column dtypes and non-null counts of the feature table, re-checked before modelling.
df_ventanas.info()

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Temporal split: window 1 trains, window 2 validates, window 3 tests.
df_train, df_val, df_test = (
    df_ventanas[df_ventanas['ventana_id'] == ventana].copy()
    for ventana in (1, 2, 3)
)

# Features are every numeric column other than the identifiers and the label.
target = 'churn'
features = list(
    df_ventanas
    .drop(columns=['CustomerID', 'ventana_id', 'churn'], errors='ignore')
    .select_dtypes(include='number')
    .columns
)

X_train = df_train[features]
y_train = df_train[target]
X_val = df_val[features]
y_val = df_val[target]

In [None]:
# Baseline XGBoost model, trained on window 1 and evaluated on window 2.
# scale_pos_weight = negatives / positives re-weights the churn class (label 1).
ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Use the imported XGBClassifier directly, as the other cells do. The `xgb`
# module alias is rebound to an estimator instance by the GridSearch cell
# further down, which would break `xgb.XGBClassifier` when this cell is re-run.
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=ratio,
    random_state=42
)

# Train
model.fit(X_train, y_train)

# Evaluate on validation: hard labels for the report, and the second
# predict_proba column (label 1, churn) for ROC AUC.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

print("Classification Report:")
print(classification_report(y_val, y_pred))

print("ROC AUC Score:", roc_auc_score(y_val, y_proba))

In [None]:
# Confusion matrix of the baseline model on the validation window.
cm = confusion_matrix(y_val, y_pred)
ax_cm = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
plt.show()

In [None]:
# Small manual search: every combination of two depths and two learning rates,
# scored by ROC AUC on the validation window.
param_list = [
    {'max_depth': profundidad, 'learning_rate': tasa}
    for tasa in (0.1, 0.3)
    for profundidad in (3, 5)
]

mejores_params, mejor_auc = None, 0

for params in param_list:
    model = XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        objective='binary:logistic',
        **params
    )
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"Params {params} - AUC: {auc:.4f}")

    # Keep the first combination that strictly improves on the best AUC so far.
    if auc > mejor_auc:
        mejor_auc, mejores_params = auc, params

In [None]:
# Refit on train + validation with the best parameters found above.
final_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    objective='binary:logistic',
    **mejores_params
)
df_train_val = pd.concat([df_train, df_val])
X_trainval = df_train_val[features]
y_trainval = df_train_val[target]

final_model.fit(X_trainval, y_trainval)

# Evaluate on the held-out test window.
X_test, y_test = df_test[features], df_test[target]
y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

print("Reporte en Test:")
print(classification_report(y_test, y_pred))
print("ROC AUC Test:", roc_auc_score(y_test, y_proba))

In [None]:
# Confusion matrix of the refitted model on the test window, with class labels.
cm = confusion_matrix(y_test, y_pred)
etiquetas_clase = ['No Churn', 'Churn']
plt.figure(figsize=(6, 4))
ax_cm = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=etiquetas_clase,
    yticklabels=etiquetas_clase,
)
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix')
plt.tight_layout()
plt.show()

In [None]:
# XGBoost with GridSearch: use windows 1-6 as the train/validation pool,
# sorted by window id so the rows are in temporal order.
en_ventanas_1_a_6 = df_ventanas['ventana_id'].between(1, 6)
df_trainval = (
    df_ventanas[en_ventanas_1_a_6]
    .sort_values('ventana_id')
    .reset_index(drop=True)
)

# Features are every numeric column other than the identifiers and the label.
features = list(
    df_trainval
    .drop(columns=['CustomerID', 'ventana_id', 'churn'], errors='ignore')
    .select_dtypes(include='number')
    .columns
)

X, y = df_trainval[features], df_trainval['churn']

In [None]:
from collections import Counter
# Class-imbalance weight for XGBoost: count of label 0 divided by count of label 1.
counter = Counter(y)
neg = counter[0]
pos = counter[1]
scale_pos_weight = neg / pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBClassifier

# Grid search over XGBoost hyper-parameters, scored by ROC AUC with
# forward-chaining splits (each fold trains on earlier rows and validates on
# later ones).
# NOTE(review): TimeSeriesSplit cuts by row position, so fold boundaries do not
# necessarily coincide with ventana_id boundaries — confirm this is acceptable.
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, scale_pos_weight, scale_pos_weight * 1.5],
}

# Named xgb_estimator rather than `xgb`: an earlier cell binds `xgb` to the
# xgboost module (`import xgboost as xgb`), and overwriting it with an
# estimator instance breaks any cell that calls `xgb.XGBClassifier` afterwards.
xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

grid = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
    verbose=2
)

grid.fit(X, y)

In [None]:
# Report the winning parameter combination and its mean cross-validated AUC.
print("Mejores hiperparámetros:", grid.best_params_, sep="\n")
print("Mejor AUC en validación cruzada:", grid.best_score_, sep="\n")

In [None]:
# Refit a fresh classifier on the whole train/validation pool (X, y) using the
# parameters selected by the grid search.
mejores_params = grid.best_params_

final_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    objective='binary:logistic',
    **mejores_params
)

final_model.fit(X, y)

In [None]:
# Window 7 is the held-out test set for the grid-searched model.
df_test = df_ventanas.loc[df_ventanas['ventana_id'].eq(7)].copy()
X_test, y_test = df_test[features], df_test[target]

In [None]:
# Hard class predictions and the second predict_proba column for the test window.
y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]  # used for ROC AUC

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Test-window metrics: confusion matrix, per-class report and ROC AUC.
cm = confusion_matrix(y_test, y_pred)
print("Matriz de Confusión:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC AUC is computed from the predicted probabilities, not the hard labels.
auc = roc_auc_score(y_test, y_proba)
print("\nROC AUC Score:", auc)


In [None]:
# Heatmap of the test-window confusion matrix with class labels on both axes.
etiquetas_clase = ["No Churn", "Churn"]
plt.figure(figsize=(6, 4))
ax_cm = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=etiquetas_clase,
    yticklabels=etiquetas_clase,
)
ax_cm.set_xlabel("Predicción")
ax_cm.set_ylabel("Real")
ax_cm.set_title("Matriz de Confusión")
plt.show()