In [41]:
##### Librerias para manipulacion de datos ######

import pandas as pd
import numpy as np
import polars as pl
import sys
from pathlib import Path
import os

##### Librerias para gráficos ######

import matplotlib.pyplot as plt
import seaborn as sns

# Librerías especiales para machine learning

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

from joblib import dump, load

In [22]:
#Definimos la ruta principal del proyecto
main_path = Path(os.getcwd()).parent.parent

#Definimos rutas relativas
config_path = main_path / "Config"
data_path = main_path / "Data"
log_path = main_path / "Logs"

#Subrutas dentro de Data
input_data_path = data_path / "Input"
output_data_path = data_path / "Output"
other_data_path = data_path / "Other"

#Rutas de modelos
#models_path = other_data_path / "Models_Classification"

In [None]:
pl.scan_csv(input_data_path / "train_data.csv").sink_parquet(input_data_path / "train_data.parquet")
pl.scan_csv(input_data_path / "test_data.csv").sink_parquet(input_data_path / "test_data.parquet")

In [None]:
data = pl.scan_parquet(input_data_path / "train_data.parquet")
labels = pl.read_csv(input_data_path / "train_labels.csv")

In [25]:
data = (
    data
    .with_columns(
        pl.col("S_2").str.strptime(pl.Date, format="%Y-%m-%d")
    )
    .sort(["customer_ID", "S_2"])
)

In [26]:
cat_cols = [
    "B_30", "B_38", "D_114", "D_116", "D_117",
    "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"
]

In [27]:
# Obtenemos todas las columnas del parquet
all_cols = data.collect_schema().names()

# Columnas que no son features
id_cols = ["customer_ID", "S_2"]

num_cols = [c for c in all_cols if c not in id_cols + cat_cols]

In [28]:
#Aqui guardamos las expresiones de agregacion para las columnas numericas

agg_num_exprs = []

for col in num_cols:
    agg_num_exprs.extend([
        pl.col(col).last().alias(f"{col}_last"),
        pl.col(col).mean().alias(f"{col}_mean"),
        pl.col(col).min().alias(f"{col}_min"),
        pl.col(col).max().alias(f"{col}_max"),
        pl.col(col).std().alias(f"{col}_std"),
        pl.col(col).count().alias(f"{col}_count"),  # mismo valor para todas, pero útil
    ])

In [29]:
#Aqui guardamos las expresiones de agregacion para las columnas categoricas

agg_cat_exprs = []

for col in cat_cols:
    agg_cat_exprs.extend([
        pl.col(col).last().alias(f"{col}_last"),
        pl.col(col).mode().alias(f"{col}_mode"),
        pl.col(col).n_unique().alias(f"{col}_nunique"),
    ])

In [30]:
# Algunas versiones de polars no exponen groupby en LazyFrame;
# materializamos primero y luego agrupamos en memoria.
train_features = (
    data
    .group_by("customer_ID")              # <-- aquí el cambio
    .agg(agg_num_exprs + agg_cat_exprs)
    .collect()                            # materializas el DF final
)

In [31]:
train_features.write_parquet(other_data_path / "train_features.parquet")

In [33]:
train_data = train_features.join(labels, on="customer_ID", how="inner")

In [34]:
train_data.write_parquet(other_data_path / "train_data.parquet")

In [37]:
train_data = pd.read_parquet(other_data_path / "train_data.parquet")

In [47]:
X = train_data.drop(columns=["customer_ID", "target"])
y = train_data["target"]

In [48]:
object_cols = X.select_dtypes(include=["object"]).columns
len(object_cols), object_cols

(118,
 Index(['D_49_last', 'D_49_mean', 'D_49_min', 'D_49_max', 'D_49_std',
        'D_73_last', 'D_73_mean', 'D_73_min', 'D_73_max', 'D_73_std',
        ...
        'D_116_mode', 'D_117_mode', 'D_120_mode', 'D_126_mode', 'D_63_last',
        'D_63_mode', 'D_64_last', 'D_64_mode', 'D_66_mode', 'D_68_mode'],
       dtype='object', length=118))

In [49]:
for col in object_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [51]:
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
scale_pos_weight = n_neg / n_pos  # puedes imprimirlo para ver cuánto es

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",      # importante para que sea rápido
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,  # o coméntalo para el primer intento
    random_state=42,
)

model.fit(X_train, y_train)

preds_valid = model.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, preds_valid)
print("AUC valid:", auc)

AUC valid: 0.9614713523776364


In [53]:

importance = model.get_booster().get_score(importance_type="gain")
imp_df = pd.DataFrame({
    "feature": list(importance.keys()),
    "gain": list(importance.values())
}).sort_values("gain", ascending=False)

imp_df.head(30)

Unnamed: 0,feature,gain
0,P_2_last,5566.203613
1,P_2_mean,2943.00293
10,B_1_last,1766.660889
2,P_2_min,1088.468994
126,B_9_last,668.569763
162,B_11_last,659.614197
15,B_2_last,560.092041
43,D_42_min,367.834778
21,R_1_mean,361.364075
40,B_3_std,356.277924


In [54]:
importance_w = model.get_booster().get_score(importance_type="weight")
importance_c = model.get_booster().get_score(importance_type="cover")

In [55]:
importance_w

{'P_2_last': 184.0,
 'P_2_mean': 83.0,
 'P_2_min': 84.0,
 'P_2_max': 60.0,
 'P_2_std': 31.0,
 'D_39_last': 167.0,
 'D_39_mean': 20.0,
 'D_39_min': 19.0,
 'D_39_max': 53.0,
 'D_39_std': 56.0,
 'B_1_last': 63.0,
 'B_1_mean': 17.0,
 'B_1_min': 14.0,
 'B_1_max': 14.0,
 'B_1_std': 18.0,
 'B_2_last': 65.0,
 'B_2_mean': 28.0,
 'B_2_min': 29.0,
 'B_2_max': 18.0,
 'B_2_std': 16.0,
 'R_1_last': 83.0,
 'R_1_mean': 32.0,
 'R_1_min': 20.0,
 'R_1_max': 34.0,
 'R_1_std': 41.0,
 'S_3_last': 90.0,
 'S_3_mean': 73.0,
 'S_3_min': 34.0,
 'S_3_max': 55.0,
 'S_3_std': 30.0,
 'S_3_count': 3.0,
 'D_41_last': 65.0,
 'D_41_mean': 16.0,
 'D_41_min': 23.0,
 'D_41_max': 16.0,
 'D_41_std': 32.0,
 'B_3_last': 93.0,
 'B_3_mean': 9.0,
 'B_3_min': 12.0,
 'B_3_max': 16.0,
 'B_3_std': 43.0,
 'D_42_last': 41.0,
 'D_42_mean': 57.0,
 'D_42_min': 46.0,
 'D_42_max': 46.0,
 'D_42_std': 10.0,
 'D_42_count': 2.0,
 'D_43_last': 67.0,
 'D_43_mean': 63.0,
 'D_43_min': 29.0,
 'D_43_max': 33.0,
 'D_43_std': 36.0,
 'D_43_count': 3.0,


In [56]:
importance_c

{'P_2_last': 19534.0546875,
 'P_2_mean': 8083.716796875,
 'P_2_min': 7235.60693359375,
 'P_2_max': 4678.77490234375,
 'P_2_std': 1459.91259765625,
 'D_39_last': 8058.2275390625,
 'D_39_mean': 4024.274169921875,
 'D_39_min': 493.69439697265625,
 'D_39_max': 4710.67529296875,
 'D_39_std': 6069.63671875,
 'B_1_last': 15373.662109375,
 'B_1_mean': 2734.536865234375,
 'B_1_min': 1452.668701171875,
 'B_1_max': 1430.0450439453125,
 'B_1_std': 1691.861328125,
 'B_2_last': 7768.28662109375,
 'B_2_mean': 5689.0166015625,
 'B_2_min': 5017.37451171875,
 'B_2_max': 1209.4345703125,
 'B_2_std': 3749.94970703125,
 'R_1_last': 9170.275390625,
 'R_1_mean': 7434.2978515625,
 'R_1_min': 1083.301025390625,
 'R_1_max': 1142.41943359375,
 'R_1_std': 4326.25927734375,
 'S_3_last': 4356.65576171875,
 'S_3_mean': 5135.7978515625,
 'S_3_min': 2571.485107421875,
 'S_3_max': 2455.6435546875,
 'S_3_std': 5004.81884765625,
 'S_3_count': 13845.60546875,
 'D_41_last': 6752.92724609375,
 'D_41_mean': 807.6165161132812