<a href="https://colab.research.google.com/github/MauricioHM-git/fraccionamiento_transaccional/blob/main/Prueba_tecnica_nequi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🚀 Instalar RAPIDS cuML para entorno CUDA 12.x (como A100 en Colab)
!pip install --upgrade --no-cache-dir cuml-cu12 cupy-cuda12x rmm-cu12 --extra-index-url=https://pypi.nvidia.com


Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [None]:
# Sanity check after installation: import cuML and print the installed version.
import cuml
print("cuML versión:", cuml.__version__)

cuML versión: 25.06.00


In [None]:
# ==============================
# 🔹 BLOQUE 1: LIBRERÍAS Y CONFIGURACIÓN
# ==============================
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, ConfusionMatrixDisplay
)
import joblib

# 🚀 Librerías GPU
from cuml.linear_model import LogisticRegression as cuLogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from xgboost import XGBClassifier
from cuml.linear_model import LinearRegression as cuLinearRegression
from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from itertools import product

# Report the cuML version that was imported.
# The value printed is a library version string, not a GPU availability check,
# so the label names what is actually shown.
import cuml
print("cuML versión:", cuml.__version__)


GPU disponible: 25.06.00


In [None]:

# ==============================
# 🔹 BLOCK 2: DATA DOWNLOAD AND LOADING
# ==============================
parquet6 = "https://nequi-data.s3.us-east-1.amazonaws.com/sandbox_co/mscarmon/prueba_seleccion_ds/sample_data_0006_part_00.parquet"
parquet7 = "https://nequi-data.s3.us-east-1.amazonaws.com/sandbox_co/mscarmon/prueba_seleccion_ds/sample_data_0007_part_00.parquet"

os.makedirs("sample_data_parquet", exist_ok=True)
os.makedirs("models", exist_ok=True)


def load_parquet_cached(url, cache_dir="sample_data_parquet"):
    """Load a parquet file from ``url``, reusing the local copy when one exists.

    The local copy is stored in ``cache_dir`` under the last path segment of the
    URL. On a cache miss the file is read from ``url`` and written to the cache
    without the index.

    Parameters
    ----------
    url : str
        Location of the remote parquet file.
    cache_dir : str
        Existing directory that holds the local copies.

    Returns
    -------
    pandas.DataFrame
        The contents of the parquet file.
    """
    local_path = os.path.join(cache_dir, os.path.basename(url))
    if os.path.exists(local_path):
        return pd.read_parquet(local_path)

    frame = pd.read_parquet(url)
    # Write to a temporary name and rename afterwards, so an interrupted write
    # cannot leave a truncated file that a later run would trust as the cache.
    tmp_path = local_path + ".tmp"
    frame.to_parquet(tmp_path, index=False)
    os.replace(tmp_path, local_path)
    return frame


df1 = load_parquet_cached(parquet6)
df2 = load_parquet_cached(parquet7)

# Stack both monthly samples into a single frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)
print("Datos cargados:", df.shape)
print("Columnas:", df.columns.tolist())




Datos cargados: (21516918, 8)
Columnas: ['merchant_id', '_id', 'subsidiary', 'transaction_date', 'account_number', 'user_id', 'transaction_amount', 'transaction_type']


In [None]:
# ==============================
# 🔹 BLOCK 3: CLEANING AND PREPROCESSING
# ==============================
# NOTE: this cell overwrites `df` and its identifier columns in place, so it is meant
# to run once after the loading cell; running it twice would encode the codes again.
df["transaction_date"] = pd.to_datetime(df["transaction_date"])

# Order chronologically, drop repeated transactions and incomplete rows, and reset the
# index last so that index labels and row positions agree for the positional split later.
df = (
    df.sort_values("transaction_date")
    .drop_duplicates(subset="_id")
    .dropna(subset=["transaction_amount", "transaction_type"])
    .reset_index(drop=True)
)

# Classification target: transaction type encoded as integers.
le_type = LabelEncoder()
df["transaction_type_encoded"] = le_type.fit_transform(df["transaction_type"])

# Calendar features derived from the transaction timestamp.
df["year"] = df["transaction_date"].dt.year
df["month"] = df["transaction_date"].dt.month
df["day"] = df["transaction_date"].dt.day
df["dayofweek"] = df["transaction_date"].dt.dayofweek

id_columns = ["merchant_id", "subsidiary", "account_number", "user_id"]
features = id_columns + ["year", "month", "day", "dayofweek"]

# Keep the fitted encoder of every identifier column. Without them the mapping from raw
# identifier to integer code is lost and cannot be reproduced at prediction time.
id_encoders = {}
for col in id_columns:
    id_encoders[col] = LabelEncoder()
    df[col] = id_encoders[col].fit_transform(df[col].astype(str))

X = df[features]
y_class = df["transaction_type_encoded"]
y_reg = df["transaction_amount"]

# Fit the scaler only on the chronological training portion (first 80% of the rows, the
# same fraction used by the temporal split) so test-period statistics do not leak into
# the features; the fitted scaler is then applied to every row.
n_fit_rows = int(len(df) * 0.8)
scaler = StandardScaler().fit(X.iloc[:n_fit_rows])
X_scaled = scaler.transform(X)



In [None]:
# ==============================
# 🔹 BLOCK 4: TEMPORAL SPLIT
# ==============================
# The frame was sorted by transaction date, so a positional cut keeps the earliest
# 80% of the rows for training and the most recent 20% for testing.
split_index = int(0.8 * len(df))

X_train = X_scaled[:split_index]
X_test = X_scaled[split_index:]

y_class_train = y_class.iloc[:split_index]
y_class_test = y_class.iloc[split_index:]

y_reg_train = y_reg.iloc[:split_index]
y_reg_test = y_reg.iloc[split_index:]



In [None]:
# ==============================
# 🔹 BLOCK 5: CLASSIFICATION MODELS (GPU)
# ==============================
# Fixed (non-searched) arguments of every estimator. They are kept in one place so that
# each grid candidate is built with them as well as with the searched values.
base_params_class = {
    "LogReg": {"max_iter": 500},
    "RF": {},
    # XGBoost >= 2.0 selects the GPU with `device="cuda"` together with the "hist" tree
    # method; this replaces the deprecated `gpu_hist` / `predictor` arguments.
    "XGB": {"eval_metric": "logloss", "tree_method": "hist", "device": "cuda"},
}

models_class = {
    "LogReg": cuLogisticRegression(**base_params_class["LogReg"]),
    "RF": cuRandomForestClassifier(**base_params_class["RF"]),
    "XGB": XGBClassifier(**base_params_class["XGB"]),
}

# Hyperparameter grid explored for each estimator.
params_class = {
    "LogReg": {"C": [0.1, 1, 10]},
    "RF": {"n_estimators": [100, 200], "max_depth": [5, 10]},
    "XGB": {"n_estimators": [100, 200], "max_depth": [3, 6]}
}

best_class_model = None
# -inf rather than 0 so that a candidate scoring exactly 0 is still recorded.
best_class_score = -np.inf
results_class = []

# ==============================
# 🔹 BLOCK 5.1: GPU HYPERPARAMETER SEARCH
# ==============================
# NOTE(review): candidates are ranked on the test split, so the reported F1 is an
# optimistic estimate; a separate validation split would be needed for an unbiased one.
for model_name, model in models_class.items():
    print(f"\n🔍 Entrenando modelo {model_name} con búsqueda en GPU...")

    param_grid = params_class[model_name]
    best_local_score = -np.inf
    best_local_model = None

    # Plain exhaustive search over the Cartesian product of the grid values.
    for combo in product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), combo))
        # Build a fresh estimator that keeps the fixed arguments. Instantiating the
        # class with only `params` would silently drop them (e.g. the GPU settings).
        m = type(model)(**base_params_class[model_name], **params)
        m.fit(X_train, y_class_train)

        preds = m.predict(X_test)
        score = f1_score(y_class_test, preds, average="macro")

        if score > best_local_score:
            best_local_score = score
            best_local_model = m

    print(f"✅ Mejor F1 {model_name}: {best_local_score:.4f}")
    results_class.append((model_name, best_local_score))

    if best_local_score > best_class_score:
        best_class_score = best_local_score
        best_class_model = best_local_model

print("\n🏆 Mejor modelo global:", best_class_model)



🔍 Entrenando modelo LogReg con búsqueda en GPU...
✅ Mejor F1 LogReg: 0.4466

🔍 Entrenando modelo RF con búsqueda en GPU...
✅ Mejor F1 RF: 0.4466

🔍 Entrenando modelo XGB con búsqueda en GPU...
✅ Mejor F1 XGB: 0.9918

🏆 Mejor modelo global: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, ...)


In [None]:
# ==============================
# 🔹 BLOCK 6: REGRESSION MODELS (GPU)
# ==============================

# Fixed (non-searched) arguments of every estimator. They are kept in one place so that
# each grid candidate is built with them as well as with the searched values.
base_params_reg = {
    "LinReg": {},
    "RF": {},
    # XGBoost >= 2.0 selects the GPU with `device="cuda"` together with the "hist" tree
    # method; this replaces the deprecated `gpu_hist` / `predictor` arguments.
    "XGB": {"eval_metric": "rmse", "tree_method": "hist", "device": "cuda"},
}

models_reg = {
    "LinReg": cuLinearRegression(**base_params_reg["LinReg"]),
    "RF": cuRandomForestRegressor(**base_params_reg["RF"]),
    "XGB": XGBRegressor(**base_params_reg["XGB"]),
}

# Hyperparameter grid explored for each estimator.
params_reg = {
    "LinReg": {},  # no main hyperparameters to search
    "RF": {"n_estimators": [100, 200], "max_depth": [5, 10]},
    "XGB": {"n_estimators": [100, 200], "max_depth": [3, 6]}
}

best_reg_model = None
best_reg_score = -np.inf
results_reg = []

# ==============================
# 🔹 BLOCK 6.1: GPU HYPERPARAMETER SEARCH
# ==============================
# NOTE(review): candidates are ranked on the test split, so the reported R² is an
# optimistic estimate; a separate validation split would be needed for an unbiased one.
for model_name, model in models_reg.items():
    print(f"\n🔍 Entrenando modelo de regresión {model_name} con búsqueda en GPU...")

    param_grid = params_reg[model_name]
    best_local_score = -np.inf
    best_local_model = None

    # Plain exhaustive search. For an empty grid `product()` yields a single empty
    # combination, so estimators without hyperparameters are fitted exactly once
    # and need no special case.
    for combo in product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), combo))
        # Build a fresh estimator that keeps the fixed arguments. Instantiating the
        # class with only `params` would silently drop them (e.g. the GPU settings).
        m = type(model)(**base_params_reg[model_name], **params)
        m.fit(X_train, y_reg_train)

        preds = m.predict(X_test)
        score = r2_score(y_reg_test, preds)

        if score > best_local_score:
            best_local_score = score
            best_local_model = m

    print(f"✅ Mejor R² {model_name}: {best_local_score:.4f}")
    results_reg.append((model_name, best_local_score))

    if best_local_score > best_reg_score:
        best_reg_score = best_local_score
        best_reg_model = best_local_model

print("\n🏆 Mejor modelo global:", best_reg_model)




🔍 Entrenando modelo de regresión LinReg con búsqueda en GPU...
✅ Mejor R² LinReg: -0.0041

🔍 Entrenando modelo de regresión RF con búsqueda en GPU...
✅ Mejor R² RF: 0.0021

🔍 Entrenando modelo de regresión XGB con búsqueda en GPU...
✅ Mejor R² XGB: 0.0124

🏆 Mejor modelo global: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=200,
             n_jobs=None, nu

In [None]:
# ==============================
# 🔹 BLOCK 7: FINAL PREDICTION PIPELINE
# ==============================
def predict_pipeline(X_new: pd.DataFrame, id_encoders=None):
    """Predict transaction type and amount for new transactions.

    Uses the notebook-level ``scaler``, ``features``, ``le_type``,
    ``best_class_model`` and ``best_reg_model``.

    Parameters
    ----------
    X_new : pd.DataFrame
        Must contain ``_id``, ``transaction_date`` and the identifier columns
        ``merchant_id``, ``subsidiary``, ``account_number`` and ``user_id``.
        The input frame is not modified.
    id_encoders : dict, optional
        Mapping from identifier column name to the ``LabelEncoder`` fitted on
        the training data for that column. When given, identifiers receive the
        same integer codes the models were trained on, and identifiers not seen
        during training are encoded as ``-1``. When omitted, the encoders are
        refitted on ``X_new`` (the original behaviour); those codes generally do
        not match the training codes, so a warning is emitted.

    Returns
    -------
    pd.DataFrame
        Columns ``_id``, ``transaction_type_pred`` (decoded label) and
        ``transaction_amount_pred``. The amount is predicted by the regressor
        only for rows whose predicted encoded class is 1; every other row
        gets 0.0.
    """
    import warnings

    id_columns = ["merchant_id", "subsidiary", "account_number", "user_id"]
    X_proc = X_new.copy()

    if id_encoders is None:
        warnings.warn(
            "predict_pipeline: id_encoders not provided; identifier columns are "
            "re-encoded from X_new and may not match the codes used in training.",
            stacklevel=2,
        )
        for col in id_columns:
            X_proc[col] = LabelEncoder().fit_transform(X_proc[col].astype(str))
    else:
        for col in id_columns:
            # Position of each value within the encoder's sorted classes is the code
            # the encoder assigned during fitting; get_indexer returns -1 for unknowns.
            known_values = pd.Index(id_encoders[col].classes_)
            X_proc[col] = known_values.get_indexer(X_proc[col].astype(str))

    # Parse the timestamp once and derive the calendar features from it.
    dates = pd.to_datetime(X_proc["transaction_date"])
    X_proc["year"] = dates.dt.year
    X_proc["month"] = dates.dt.month
    X_proc["day"] = dates.dt.day
    X_proc["dayofweek"] = dates.dt.dayofweek

    X_final = scaler.transform(X_proc[features])

    # Step 1: classification. Cast to int so the labels can index the encoder's
    # classes even if the estimator returns them as floats.
    y_class_pred = np.asarray(best_class_model.predict(X_final)).astype(int)
    y_type_pred = le_type.inverse_transform(y_class_pred)

    # Step 2: conditional regression, done in one batched call for the selected rows
    # instead of one model call per row.
    y_amount_pred = np.zeros(len(y_class_pred), dtype=float)
    needs_amount = y_class_pred == 1
    if needs_amount.any():
        y_amount_pred[needs_amount] = np.asarray(
            best_reg_model.predict(X_final[needs_amount])
        ).ravel()

    # Consolidate the results.
    X_proc["transaction_type_pred"] = y_type_pred
    X_proc["transaction_amount_pred"] = y_amount_pred
    return X_proc[["_id", "transaction_type_pred", "transaction_amount_pred"]]


In [None]:
# ==============================================
# 🔹 BLOCK 8: SAVE TRAINED MODELS AND TRANSFORMERS
# ==============================================

# Create the output folder if it does not exist.
os.makedirs("models", exist_ok=True)

# Persist the fitted objects.
# NOTE(review): only the transaction-type encoder is saved here; the encoders used for
# the identifier columns are not persisted by this cell.
joblib.dump(best_class_model, "models/modelo_clasificador.pkl")
joblib.dump(best_reg_model, "models/modelo_regresor.pkl")
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(le_type, "models/encoder.pkl")

# The message names the folder the files were actually written to.
print("✅ Modelos y transformadores guardados correctamente en models/")


✅ Modelos y transformadores guardados correctamente en sample_data_parquet/


In [None]:
# ==============================================
# 🔹 COMPRESS FOLDERS
# ==============================================

# Folder to be archived.
folder_name = "models"

if os.path.exists(folder_name):
    # Name of the resulting ZIP file (used for the confirmation message).
    zip_name = folder_name + ".zip"

    # Build the archive next to the folder; the folder itself is left in place.
    shutil.make_archive(base_name=folder_name, format='zip', root_dir=folder_name)

    print(f"✅ Carpeta '{folder_name}' comprimida exitosamente como '{zip_name}'")
else:
    # Nothing to archive.
    print(f"❌ La carpeta '{folder_name}' no existe.")

✅ Carpeta 'models' comprimida exitosamente como 'models.zip'


In [None]:
# Display the processed frame; the notebook renders a truncated view (first and last rows).
df

Unnamed: 0,merchant_id,_id,subsidiary,transaction_date,account_number,user_id,transaction_amount,transaction_type,transaction_type_encoded,year,month,day,dayofweek
0,2,4a101c7ee6f4f8f1fcdbd4bc044c59d8,16143,2021-01-01 00:00:40,1743489,574325,5.94445501,CREDITO,0,2021,1,1,4
1,1,47d6c9460c5c27d63d6e66d23e598695,7373,2021-01-01 00:01:08,2995774,980765,59.44455012,DEBITO,1,2021,1,1,4
2,1,8c2df593436163e9cbff2fe3050f0ae1,12758,2021-01-01 00:01:13,1993925,1693394,47.55564009,DEBITO,1,2021,1,1,4
3,2,6ed538998f04c804c299dc9fbc4d6e90,14718,2021-01-01 00:01:17,1242115,2514877,5.94445501,CREDITO,0,2021,1,1,4
4,2,8801042a2192cdb8d96074442e5d4e06,13981,2021-01-01 00:01:45,2327941,1293729,5.94445501,DEBITO,1,2021,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21516913,1,c6f1be268792b5ac30fc0e83da8c2fb8,6427,2021-11-30 23:58:47,1531137,192507,35.66673007,DEBITO,1,2021,11,30,1
21516914,1,6920e4ed0e5204a6e95feb6d8b435272,6925,2021-11-30 23:59:03,234781,1929169,118.88910024,DEBITO,1,2021,11,30,1
21516915,1,2872a531e4ce546a996e5590892d3ad1,16636,2021-11-30 23:59:28,1729070,1906748,59.44455012,DEBITO,1,2021,11,30,1
21516916,1,ee3498c05fa4125e5f81f97578551f7d,14371,2021-11-30 23:59:33,1182975,1299544,35.66673007,DEBITO,1,2021,11,30,1
