In [20]:
from dmeyf2025.modelling.train_model import train_models
from dmeyf2025.experiments.experiments import load_config
from dmeyf2025.etl import ETL
from dmeyf2025.processors.target_processor import BinaryTargetProcessor, CreateTargetProcessor
from dmeyf2025.processors.sampler import SamplerProcessor
from dmeyf2025.processors.feature_processors import DeltaLagTransformer
from dmeyf2025.metrics.revenue import lgb_gan_eval, revenue_from_prob
from run_experiment import get_features
import json
import os
import numpy as np
import lightgbm as lgb
import pandas as pd



# Config

In [None]:
experiment_path = "experiments/delta-lags-percentile_0.1_150_03-04_G_Base_0.0.6"
config = load_config(os.path.join(experiment_path, "config.yaml"))



In [22]:
train_months = config["data"]["train_months"]
eval_month = config["data"]["eval_month"]
positive_class = config["experiment"]["positive_classes"]
seeds = config["experiment"]["seeds"]
data_path = config["experiment"]["raw_data_path"]
sample_ratio = config["experiment"]["sample_ratio"]
test_months = 202105


# Processing

In [23]:
etl = ETL(data_path, CreateTargetProcessor(),
train_months = [202101, 202102, 202103, 202104, 202105, 202106],)
X, y, _,_,_,_ = etl.execute_complete_pipeline()
target_processor = BinaryTargetProcessor(positive_class)
X, y = target_processor.fit_transform(X, y)

19:58:05 | INFO | Iniciando pipeline ETL completo...
19:58:08 | INFO | Archivo leído exitosamente: 978439 filas, 155 columnas
19:58:12 | INFO | Procesamiento completado: 978439 filas, 155 columnas
19:58:13 | INFO | DataFrame train: 978439
19:58:13 | INFO | Pipeline ETL completado exitosamente!


In [24]:
X["target"] = y
X_transformed = get_features(X)
X_transformed.set_index("numero_de_cliente", inplace=True)


In [25]:
X_train = X_transformed[X_transformed["foto_mes"].isin(train_months)]
X_eval = X_transformed[X_transformed["foto_mes"].isin([eval_month])]
X_test = X_transformed[X_transformed["foto_mes"].isin([202105])] # Solo para ver el escalado
y_train = X_train["target"]
X_train = X_train.drop(columns=["target"])
X_eval = X_eval.drop(columns=["target"])

In [26]:
sampler_processor = SamplerProcessor(0.1, random_state=seeds[0])
X_train_sampled, y_train_sampled = sampler_processor.fit_transform(X_train, y_train)


19:58:41 | INFO | ✅ Dataset final: 36320 registros
19:58:41 | INFO |    - Clase positiva: 4075
19:58:41 | INFO |    - Clase negativa: 32245


# Train

In [27]:
#load hyperparams
params = json.load(open(os.path.join(experiment_path, "best_params.json")))
params["min_data_in_leaf"] = int(params["min_data_in_leaf"]/(len(X_train_sampled)/len(X_train)))


In [28]:
preds, models= train_models(X_train, y_train, X_eval, params, seeds[:10], experiment_path)

19:58:42 | INFO | Training dataset shape: (326527, 713)
19:58:42 | INFO | Evaluating dataset shape: (164313, 713)
19:58:42 | INFO | Training final model with seed: 537919
19:58:56 | INFO | Training final model with seed: 923347
19:59:09 | INFO | Training final model with seed: 173629
19:59:23 | INFO | Training final model with seed: 419351
19:59:36 | INFO | Training final model with seed: 287887
19:59:48 | INFO | Training final model with seed: 41
20:00:01 | INFO | Training final model with seed: 42
20:00:14 | INFO | Training final model with seed: 43
20:00:27 | INFO | Training final model with seed: 44
20:00:41 | INFO | Training final model with seed: 45


# Scale eval

In [None]:
cpayroll_pivot = X.pivot(index="numero_de_cliente", columns="foto_mes", values="cpayroll_trx")
# Calcula el salto entre 202106 y 202105
cpayroll_pivot["jump_202106_202105"] = cpayroll_pivot[202106] - cpayroll_pivot[202105]
# Selecciona clientes con salto de 1 o más
clientes_salto = cpayroll_pivot[cpayroll_pivot["jump_202106_202105"] >= 1].index.tolist()

factors = {"cpayroll_trx": 2, "mcuentas_saldo": 2, "mpayroll": 1.6, "mtransferencias_recibidas": 1.6, "cextraccion_autoservicio": 1.5} # Esto salió de una exploración visual

X["target"] = y
X_transformed = get_features(X)
X_transformed.set_index("numero_de_cliente", inplace=True)

X_train = X_transformed[X_transformed["foto_mes"].isin(train_months)]
X_eval = X_transformed[X_transformed["foto_mes"].isin([eval_month])]
X_test = X_transformed[X_transformed["foto_mes"].isin([202105])] # Solo para ver el escalado
y_train = X_train["target"]
X_train = X_train.drop(columns=["target"])
X_eval = X_eval.drop(columns=["target"])



In [30]:
X_eval.loc[clientes_salto, factors.keys()] = X_eval.loc[clientes_salto, factors.keys()] / factors.values()
X_eval.loc[clientes_salto, "cpayroll_trx"] = np.ceil(X_eval.loc[clientes_salto, "cpayroll_trx"])

X_eval.loc[clientes_salto, "cextraccion_autoservicio"] = np.ceil(X_eval.loc[clientes_salto, "cextraccion_autoservicio"])


  X_eval.loc[clientes_salto, factors.keys()] = X_eval.loc[clientes_salto, factors.keys()] / factors.values()
  X_eval.loc[clientes_salto, factors.keys()] = X_eval.loc[clientes_salto, factors.keys()] / factors.values()


In [33]:
from dmeyf2025.modelling.train_model import predict_ensemble_model
preds = predict_ensemble_model(models, X_eval)

In [35]:
from dmeyf2025.modelling.train_model import prob_to_sends

experiment_path = "experiments"
experiment_folder = "delta-lags-percentile_0.1_150_03-04_G_Base_0.0.6"
config["experiment"]["experiment_folder"] = experiment_folder
config["experiment"]["experiments_path"] = experiment_path

_ = prob_to_sends(config["experiment"], preds, 10000, f"hpscaled_factors")

