In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore")

# Load data
csv_path1 = 'C:/Users/games/OneDrive/Desktop/heybanco/data/heybanco/base_clientes_final.csv'
csv_path2 = 'C:/Users/games/OneDrive/Desktop/heybanco/data/heybanco/base_transacciones_final.csv'
df_clientes = pd.read_csv(csv_path1, parse_dates=["fecha_nacimiento", "fecha_alta"])
df_tx = pd.read_csv(csv_path2, parse_dates=["fecha"])

# Group comercios into categories
def group_merchant(comercio):
    comercio = comercio.upper()
    if comercio in ['NETFLIX', 'SPOTIFY', 'DISNEY PLUS', 'AMAZON PRIME', 'GOOGLE YOUTUBEPREMIUM', 'CRUNCHYROLL', 'VIX', 'APPLE', 'GOOGLE AMAZON MOBILE', 'AUDIBLE', 'ROKU', 'GOOGLE YOUTUBE']:
        return 'ENTERTAINMENT'
    elif comercio in ['UBER', 'UBER EATS', 'DIDI', 'DIDI RIDES', 'DIDI FOOD', 'DIDIFOOD', 'METROBUS']:
        return 'TRANSPORT/DELIVERY'
    elif comercio in ['WALMART', 'SORIANA', 'HEB', 'SUPERCENTER', 'COSTCO', 'SAMS CLUB', 'CHEDRAUI', 'ALSUPER', 'SUPERAMA', 'WAL-MART']:
        return 'GROCERIES'
    elif comercio in ['CFE', 'TELMEX', 'TELCEL', 'IZZI', 'TOTALPLAY', 'TOTAL PLAY', 'MI ATT', 'ATT', 'AT&T', 'MEGACABLE', 'TELEFONICA', 'CABLEYCOMUN', 'RENTAMOVISTAR']:
        return 'UTILITIES'
    elif comercio in ['OXXO', '7 ELEVEN', '7ELEVEN', 'OXXO GAS', 'COSTCO GAS']:
        return 'CONVENIENCE'
    elif comercio in ['FARMACIAS DEL AHORRO', 'FARMACIAS GUADALAJARA', 'FARMACIAS SIMILARES']:
        return 'PHARMACY'
    elif comercio in ['LIVERPOOL', 'SEARS', 'COPPEL', 'MELIMAS']:
        return 'RETAIL'
    elif comercio in ['SMARTFIT', 'SMART FIT', 'CRUNCHYROLL']:
        return 'FITNESS'
    elif comercio in ['CARLS JR', 'STARBUCKS']:
        return 'RESTAURANTS'
    elif comercio in ['OPENAI', 'GOOGLE', 'ITUNES', 'FACEBOOK', 'ADOBE', 'CANVA', 'MICROSOFT', 'PLAYSTATION NETWORK']:
        return 'DIGITAL SERVICES'
    elif comercio in ['MERCADO PAGO', 'MERCADOPAGO', 'RAPPI', 'RAPPIPRO', 'SOFT RAPPI']:
        return 'ECOMMERCE'
    elif comercio in ['ALLIANZ MEXICO', 'KUESKI PAY', 'TOTAL PASS', 'BAE', 'APLAZO', 'APLAZ']:
        return 'FINTECH/INSURANCE'
    elif comercio in ['UNDOSTRES', 'TULOTERO', 'CALIENTE', 'BET365']:
        return 'GAMBLING'
    elif comercio in ['ROTOPLAS', 'URBANI', 'SMART']:
        return 'OTHER'
    else:
        return 'OTHER'

df_tx["categoria"] = df_tx["comercio"].apply(group_merchant)

# Weekly features
df_tx["week"] = df_tx["fecha"].dt.to_period("W").apply(lambda r: r.start_time)
weekly_spend = df_tx.groupby(["id", "week", "categoria"])["monto"].sum().reset_index()
weekly_spend_pivot = weekly_spend.pivot_table(index=["id", "week"], columns="categoria", values="monto", fill_value=0).reset_index()
weekly_spend_pivot = weekly_spend_pivot.sort_values(["id", "week"])

# Create rolling features
observation_weeks = 4
label_week_gap = 1
dfs = []
for client_id, group in weekly_spend_pivot.groupby("id"):
    group = group.reset_index(drop=True)
    for i in range(len(group) - observation_weeks - label_week_gap):
        obs = group.iloc[i:i+observation_weeks]
        label = group.iloc[i+observation_weeks+label_week_gap-1]

        # Basic features
        features = obs.drop(columns=["id", "week"]).mean().to_dict()

        # Additional engineered features
        features.update({f"sum_{col}": obs[col].sum() for col in obs.columns if col not in ["id", "week"]})
        features.update({f"max_{col}": obs[col].max() for col in obs.columns if col not in ["id", "week"]})
        features.update({f"trend_{col}": obs[col].iloc[-1] - obs[col].iloc[0] for col in obs.columns if col not in ["id", "week"]})

        features["id"] = client_id
        features["week"] = group.iloc[i+observation_weeks-1]["week"]

        labels = (label.drop(["id", "week"]) > 0).astype(int)
        for cat in labels.index:
            features[f"label_{cat}"] = labels[cat]

        dfs.append(features)

df_final = pd.DataFrame(dfs)

# Split features/labels
label_cols = [c for c in df_final.columns if c.startswith("label_")]
X = df_final.drop(columns=label_cols + ["id", "week"])
Y = df_final[label_cols]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Pipeline with XGBoost
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), numeric_features)
])

xgb_model = OneVsRestClassifier(
    CalibratedClassifierCV(
        XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42
        ), method="sigmoid"
    )
)

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", xgb_model)
])

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=[c.replace("label_", "") for c in label_cols]))

=== Classification Report ===
                    precision    recall  f1-score   support

       CONVENIENCE       0.70      0.63      0.66      3716
  DIGITAL SERVICES       0.64      0.50      0.56      1596
         ECOMMERCE       0.63      0.39      0.48      2331
     ENTERTAINMENT       0.70      0.60      0.65      3421
 FINTECH/INSURANCE       0.62      0.37      0.46       314
           FITNESS       0.52      0.18      0.27        62
          GAMBLING       0.81      0.65      0.72       534
         GROCERIES       0.64      0.41      0.50      2909
             OTHER       0.65      0.65      0.65      4420
          PHARMACY       0.62      0.16      0.26      1355
       RESTAURANTS       0.60      0.12      0.20       768
            RETAIL       0.70      0.26      0.37       990
TRANSPORT/DELIVERY       0.77      0.73      0.75      3848
         UTILITIES       0.61      0.40      0.48      3071

         micro avg       0.68      0.52      0.59     29335
        

In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore")

# Load data
csv_path1 = 'C:/Users/games/OneDrive/Desktop/heybanco/data/heybanco/base_clientes_final.csv'
csv_path2 = 'C:/Users/games/OneDrive/Desktop/heybanco/data/heybanco/base_transacciones_final.csv'
df_clientes = pd.read_csv(csv_path1, parse_dates=["fecha_nacimiento", "fecha_alta"])
df_tx = pd.read_csv(csv_path2, parse_dates=["fecha"])

# Agrupar comercios en categorías
def group_merchant(comercio):
    comercio = comercio.upper()
    if comercio in ['NETFLIX', 'SPOTIFY', 'DISNEY PLUS', 'AMAZON PRIME', 'GOOGLE YOUTUBEPREMIUM', 'CRUNCHYROLL', 'VIX', 'APPLE', 'GOOGLE AMAZON MOBILE', 'AUDIBLE', 'ROKU', 'GOOGLE YOUTUBE']:
        return 'ENTERTAINMENT'
    elif comercio in ['UBER', 'UBER EATS', 'DIDI', 'DIDI RIDES', 'DIDI FOOD', 'DIDIFOOD', 'METROBUS']:
        return 'TRANSPORT/DELIVERY'
    elif comercio in ['WALMART', 'SORIANA', 'HEB', 'SUPERCENTER', 'COSTCO', 'SAMS CLUB', 'CHEDRAUI', 'ALSUPER', 'SUPERAMA', 'WAL-MART']:
        return 'GROCERIES'
    elif comercio in ['CFE', 'TELMEX', 'TELCEL', 'IZZI', 'TOTALPLAY', 'TOTAL PLAY', 'MI ATT', 'ATT', 'AT&T', 'MEGACABLE', 'TELEFONICA', 'CABLEYCOMUN', 'RENTAMOVISTAR']:
        return 'UTILITIES'
    elif comercio in ['OXXO', '7 ELEVEN', '7ELEVEN', 'OXXO GAS', 'COSTCO GAS']:
        return 'CONVENIENCE'
    elif comercio in ['FARMACIAS DEL AHORRO', 'FARMACIAS GUADALAJARA', 'FARMACIAS SIMILARES']:
        return 'PHARMACY'
    elif comercio in ['LIVERPOOL', 'SEARS', 'COPPEL', 'MELIMAS']:
        return 'RETAIL'
    elif comercio in ['SMARTFIT', 'SMART FIT', 'CRUNCHYROLL']:
        return 'FITNESS'
    elif comercio in ['CARLS JR', 'STARBUCKS']:
        return 'RESTAURANTS'
    elif comercio in ['OPENAI', 'GOOGLE', 'ITUNES', 'FACEBOOK', 'ADOBE', 'CANVA', 'MICROSOFT', 'PLAYSTATION NETWORK']:
        return 'DIGITAL SERVICES'
    elif comercio in ['MERCADO PAGO', 'MERCADOPAGO', 'RAPPI', 'RAPPIPRO', 'SOFT RAPPI']:
        return 'ECOMMERCE'
    elif comercio in ['ALLIANZ MEXICO', 'KUESKI PAY', 'TOTAL PASS', 'BAE', 'APLAZO', 'APLAZ']:
        return 'FINTECH/INSURANCE'
    elif comercio in ['UNDOSTRES', 'TULOTERO', 'CALIENTE', 'BET365']:
        return 'GAMBLING'
    elif comercio in ['ROTOPLAS', 'URBANI', 'SMART']:
        return 'OTHER'
    else:
        return 'OTHER'

df_tx["categoria"] = df_tx["comercio"].apply(group_merchant)

# Weekly features
df_tx["week"] = df_tx["fecha"].dt.to_period("W").apply(lambda r: r.start_time)
weekly_spend = df_tx.groupby(["id", "week", "categoria"])["monto"].sum().reset_index()
weekly_spend_pivot = weekly_spend.pivot_table(index=["id", "week"], columns="categoria", values="monto", fill_value=0).reset_index()
weekly_spend_pivot = weekly_spend_pivot.sort_values(["id", "week"])

# Rolling features
observation_weeks = 4
label_week_gap = 1
dfs = []
for client_id, group in weekly_spend_pivot.groupby("id"):
    group = group.reset_index(drop=True)
    for i in range(len(group) - observation_weeks - label_week_gap):
        obs = group.iloc[i:i+observation_weeks]
        label = group.iloc[i+observation_weeks+label_week_gap-1]

        features = obs.drop(columns=["id", "week"]).mean().to_dict()
        features.update({f"sum_{col}": obs[col].sum() for col in obs.columns if col not in ["id", "week"]})
        features.update({f"max_{col}": obs[col].max() for col in obs.columns if col not in ["id", "week"]})
        features.update({f"trend_{col}": obs[col].iloc[-1] - obs[col].iloc[0] for col in obs.columns if col not in ["id", "week"]})

        features["id"] = client_id
        features["week"] = group.iloc[i+observation_weeks-1]["week"]

        labels = (label.drop(["id", "week"]) > 0).astype(int)
        for cat in labels.index:
            features[f"label_{cat}"] = labels[cat]

        dfs.append(features)

df_final = pd.DataFrame(dfs)

# Split features and labels
label_cols = [c for c in df_final.columns if c.startswith("label_")]
X = df_final.drop(columns=label_cols + ["id", "week"])
Y = df_final[label_cols]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Preprocessing pipeline
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), numeric_features)
])

# XGBoost Classifier with calibration
xgb_base = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

# One-vs-Rest Classifier + CalibratedClassifierCV (sigmoid)
classifier = OneVsRestClassifier(
    CalibratedClassifierCV( method='sigmoid', cv=3)
)

# Full pipeline
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", classifier)
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Report
print("=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=[c.replace("label_", "") for c in label_cols]))


=== Classification Report ===
                    precision    recall  f1-score   support

       CONVENIENCE       0.76      0.42      0.54      3716
  DIGITAL SERVICES       0.66      0.10      0.18      1596
         ECOMMERCE       0.73      0.19      0.30      2331
     ENTERTAINMENT       0.68      0.20      0.30      3421
 FINTECH/INSURANCE       0.57      0.16      0.25       314
           FITNESS       0.44      0.13      0.20        62
          GAMBLING       0.85      0.24      0.37       534
         GROCERIES       0.68      0.23      0.34      2909
             OTHER       0.71      0.32      0.44      4420
          PHARMACY       0.56      0.06      0.11      1355
       RESTAURANTS       0.61      0.09      0.15       768
            RETAIL       0.36      0.01      0.01       990
TRANSPORT/DELIVERY       0.83      0.56      0.67      3848
         UTILITIES       0.54      0.09      0.15      3071

         micro avg       0.73      0.26      0.39     29335
        