In [1]:
from dmeyf2025.modelling.train_model import train_ensemble_model
from dmeyf2025.experiments.experiments import load_config
from dmeyf2025.etl import ETL
from dmeyf2025.processors.target_processor import BinaryTargetProcessor, CreateTargetProcessor
from dmeyf2025.processors.sampler import SamplerProcessor
from dmeyf2025.processors.feature_processors import DeltaLagTransformer
from dmeyf2025.metrics.revenue import lgb_gan_eval
import json
import os
import numpy as np

# Config

In [2]:
# Folder of the experiment whose tuned hyper-parameters and config are reused here.
experiment_path = "experiments/DeltaLags2_0.3_150_01-04_G_0.0.1"

# Read the tuned hyper-parameters; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(os.path.join(experiment_path, "best_params.json")) as params_file:
    params = json.load(params_file)

# Experiment configuration (months, seeds, data path, ...) stored next to the params.
config = load_config(os.path.join(experiment_path, "config.yaml"))



In [3]:
# Unpack the settings used by the rest of the notebook from the experiment config.
train_months = config["data"]["train_months"]
eval_month = config["data"]["eval_month"]
positive_class = config["experiment"]["positive_classes"]
seeds = config["experiment"]["seeds"]
data_path = config["experiment"]["raw_data_path"]
sample_ratio = config["experiment"]["sample_ratio"]

# Remove the tuned seed from the hyper-parameters: the ensemble receives its own
# `seeds` list when train_ensemble_model is called. The `None` default keeps this
# cell re-runnable (a bare pop raises KeyError once the key is already gone).
params.pop("seed", None)

# Reference month used further down to compare feature scales against eval_month.
test_months = 202105


# Processing

In [4]:
# Load the raw data for the six months 202101-202106 and build the target column.
etl = ETL(
    data_path,
    CreateTargetProcessor(),
    train_months=[202101, 202102, 202103, 202104, 202105, 202106],
)

# Only the first two of the six values returned by the pipeline are used here.
X, y, _, _, _, _ = etl.execute_complete_pipeline()

# Collapse the target to a binary label using the configured positive classes.
target_processor = BinaryTargetProcessor(positive_class)
X, y = target_processor.fit_transform(X, y)

In [5]:
# Build delta/lag features over the whole frame X (all loaded months at once).
# NOTE(review): the exact meaning of n_deltas / n_lags is defined by DeltaLagTransformer — confirm.
delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2)
X_transformed = delta_lag_transformer.fit_transform(X)
# The printed label says "X_train_transformed", but the shape shown is that of X_transformed.
print(f"X_train_transformed.shape: {X_transformed.shape}")


X_train_transformed.shape: (978439, 714)


In [6]:
# Index the feature frame by client id (in place: re-running this cell fails because
# the column no longer exists after the first run).
X_transformed.set_index("numero_de_cliente", inplace=True)
# Attach the binary target as a "label" column.
# NOTE(review): if y is a pandas Series this assignment aligns on index labels, and the
# index is now numero_de_cliente — confirm that y lines up with the rows as intended.
X_transformed.loc[:, "label"] = y


  X_transformed.loc[:, "label"] = y


In [7]:
# Training rows: every month listed in train_months; the label is split off into y_train.
X_train = X_transformed.loc[X_transformed["foto_mes"].isin(train_months)]
y_train = X_train.loc[:, "label"]
X_train = X_train.drop(columns="label")

# Evaluation rows: the single eval_month, without the label column.
X_eval = (
    X_transformed
    .loc[X_transformed["foto_mes"].isin([eval_month])]
    .drop(columns="label")
)

# Month 202105 is kept (label column included) only to inspect the scaling.
X_test = X_transformed.loc[X_transformed["foto_mes"].isin([202105])]


In [8]:
# Hyper-parameter rescaling (currently disabled): the line below would divide
# min_data_in_leaf by sample_ratio.
#params["min_data_in_leaf"] = int(params["min_data_in_leaf"]/sample_ratio)

In [9]:
"""sampler_processor = SamplerProcessor(experiment_config['SAMPLE_RATIO'])
    X_train_sampled, y_train_sampled = sampler_processor.fit_transform(X_train, y_train)"""

"sampler_processor = SamplerProcessor(experiment_config['SAMPLE_RATIO'])\n    X_train_sampled, y_train_sampled = sampler_processor.fit_transform(X_train, y_train)"

# Train

In [10]:
# Train the ensemble on the training months and score X_eval.
# Judging by later cells, `preds` is a frame with a "numero_de_cliente" column and
# `models` is an iterable of fitted models exposing .predict — confirm against
# train_ensemble_model.
preds, models= train_ensemble_model(X_train, y_train, X_eval, params, seeds, experiment_path)

In [11]:
import numpy as np
# Build a 0/1 vector with 10000 ones followed by zeros, one entry per row of preds.
# NOTE(review): np.zeros raises ValueError if preds has fewer than 10000 rows.
ones = np.ones(10000, dtype=int)
zeros = np.zeros(len(preds)-10000, dtype=int)
predictions = np.concatenate([ones, zeros])



In [None]:
# Flag the first 10000 rows of preds as "send" (1) and the rest as 0, then write the file.
# The vector is assigned positionally, so this presumably relies on preds already being
# ordered by descending score — TODO confirm against train_ensemble_model.
ones = np.ones(10000, dtype=int)
zeros = np.zeros(len(preds)-10000, dtype=int)
predictions = np.concatenate([ones, zeros])
preds["predicted"] = predictions
# NOTE(review): the file name carries the tag G_0.0.2 while experiment_path ends in
# G_0.0.1 — confirm the mismatch is intentional.
preds[["numero_de_cliente", "predicted"]].to_csv(f"{experiment_path}/DeltaLags2_0.3_150_01-04_G_0.0.2_ensemble_10k_predictions.csv", index=False)

In [13]:
# Sanity check: dimensions (rows, columns) of the two-column submission frame.
preds[["numero_de_cliente", "predicted"]].shape

(164313, 2)

# Escalado de X_eval

In [14]:
from dmeyf2025.utils.data_dict import FINANCIAL_COLS
# Reload the raw months from scratch so the rescaling below starts from untouched data.
etl = ETL(
    data_path,
    CreateTargetProcessor(),
    train_months=[202101, 202102, 202103, 202104, 202105, 202106],
)
X, y, _, _, _, _ = etl.execute_complete_pipeline()

# Same binary target as in the first pass.
target_processor = BinaryTargetProcessor(positive_class)
X, y = target_processor.fit_transform(X, y)

# Raw (untransformed) rows of the evaluation month and of 202105; the latter is
# only used to inspect the scaling.
X_eval = X.loc[X["foto_mes"].isin([eval_month])]
X_test = X.loc[X["foto_mes"].isin([202105])]


In [15]:
# Financial columns whose mean in the evaluation month is more than 1.45 times
# their mean in 202105, ordered by ascending ratio.
cols_mayores_145 = (
    X_eval[FINANCIAL_COLS].mean()
    .div(X_test[FINANCIAL_COLS].mean())
    .sort_values()
    .loc[lambda ratio: ratio > 1.45]
    .index.tolist()
)

In [16]:
# Per-client ratio (evaluation month / 202105) for the selected columns. Both frames
# are keyed by client id so the division aligns row by row; clients present in only
# one of the two months end up with NaN.
ratio_df = (
    X_eval.set_index("numero_de_cliente")[cols_mayores_145]
    .div(
        X_test.set_index("numero_de_cliente")[cols_mayores_145],
        fill_value=np.nan,
    )
    .reset_index()
)

# Only the client ids are used by the following cell; the ratio values are not.
scaling_clients = ratio_df["numero_de_cliente"]

In [17]:
# Scale the selected financial columns down by 1.3 for month 202106, for the clients
# collected above. X is modified in place, so running this cell twice divides twice.
# NOTE(review): the columns were selected with a 1.45 threshold but the divisor is 1.3
# (the output file name later says "dd1.3") — confirm the constant.
_rows_to_scale = X["numero_de_cliente"].isin(scaling_clients) & (X["foto_mes"] == 202106)
X.loc[_rows_to_scale, cols_mayores_145] = X.loc[_rows_to_scale, cols_mayores_145] / 1.3

In [18]:
# Recompute the delta/lag features, this time on the frame whose 202106 rows were rescaled.
delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2)
X_transformed = delta_lag_transformer.fit_transform(X)
# The printed label says "X_train_transformed", but the shape shown is that of X_transformed.
print(f"X_train_transformed.shape: {X_transformed.shape}")

X_train_transformed.shape: (978439, 714)


In [19]:
# Index the rescaled feature frame by client id (in place, so not re-runnable as is).
X_transformed.set_index("numero_de_cliente", inplace=True)
# Attach the binary target as a "label" column.
# NOTE(review): if y is a pandas Series this assignment aligns on index labels, and the
# index is now numero_de_cliente — confirm that y lines up with the rows as intended.
X_transformed.loc[:, "label"] = y


  X_transformed.loc[:, "label"] = y


In [20]:
# Same month-based split as before, now taken from the rescaled feature frame.
X_train = X_transformed.loc[X_transformed["foto_mes"].isin(train_months)]
y_train = X_train.loc[:, "label"]
X_train = X_train.drop(columns="label")

# Evaluation month without its label column.
X_eval = (
    X_transformed
    .loc[X_transformed["foto_mes"].isin([eval_month])]
    .drop(columns="label")
)

# Month 202105 (label column included), kept only to inspect the scaling.
X_test = X_transformed.loc[X_transformed["foto_mes"].isin([202105])]

In [21]:
import pandas as pd

# Score the (rescaled) evaluation month with every model already trained.
# The client-id column is created once, up front: it does not depend on the loop,
# and it must exist even when `models` is empty because the next cell drops it.
predictions = pd.DataFrame({"numero_de_cliente": X_eval.index})
for n, model in enumerate(models):
    # One column per ensemble member: pred_0, pred_1, ...
    y_pred = model.predict(X_eval)
    predictions[f"pred_{n}"] = y_pred

In [22]:
import numpy as np

# Average only the per-model columns (pred_0, pred_1, ...). Selecting them by name
# keeps the mean unchanged if this cell is re-run after "pred_ensemble" and
# "predicted" have been added to the frame.
model_cols = predictions.filter(regex=r"^pred_\d+$").columns
predictions["pred_ensemble"] = predictions[model_cols].mean(axis=1)

# Highest ensemble score first, so the positional 1/0 vector below marks the top rows.
predictions = predictions.sort_values(by="pred_ensemble", ascending=False)

# Send to the 12000 best-scored clients (or to all of them if there are fewer rows).
n_top = min(12000, len(predictions))
ones = np.ones(n_top, dtype=int)
zeros = np.zeros(len(predictions) - n_top, dtype=int)
sends = np.concatenate([ones, zeros])

# Write the submission from the frame scored on the rescaled data. The previous code
# wrote `preds` (the scores of the unscaled run), so the rescaled predictions computed
# above never reached the "dd1.3" file.
predictions["predicted"] = sends
predictions[["numero_de_cliente", "predicted"]].to_csv(f"{experiment_path}/DeltaLags2_0.3_150_01-04_G_0.0.2_ensemble_12k_predictions_dd1.3.csv", index=False)

# optimización de envíos

In [23]:
from dmeyf2025.metrics.revenue import sends_optimization

In [25]:
from dmeyf2025.metrics.revenue import revenue_from_prob
def sends_optimization(y_pred, y_true, min_sends, max_sends, steps=100):
    """
    Pick the number of sends that maximises revenue.

    Every candidate in ``range(min_sends, max_sends, steps)`` is scored with
    ``revenue_from_prob`` and the candidate with the highest revenue is returned.
    On ties the smallest candidate wins.

    Note: this definition shadows the ``sends_optimization`` imported from
    ``dmeyf2025.metrics.revenue`` earlier in the notebook.

    Args:
        y_pred: Predicted scores, one per observation.
        y_true: Binary labels (0/1) matching ``y_pred``.
        min_sends: First candidate number of sends (inclusive).
        max_sends: Upper bound of the candidates (exclusive, as in ``range``).
        steps: Distance between consecutive candidates.

    Returns:
        The candidate number of sends with the highest revenue. If no revenue
        compares greater than ``-inf`` (e.g. all NaN), the first candidate.

    Raises:
        ValueError: If the candidate range is empty (or ``steps`` is 0).
    """
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)

    # revenue_from_prob is fed ternary class names, so map the 0/1 labels onto them.
    y_ternaria = ["CONTINUA" if label == 0 else "BAJA+2" for label in y_true]

    candidates = range(min_sends, max_sends, steps)
    if len(candidates) == 0:
        # Previously this fell through to an UnboundLocalError on the return line.
        raise ValueError(
            f"empty range of sends: min_sends={min_sends}, "
            f"max_sends={max_sends}, steps={steps}"
        )

    # Start from the first candidate so the result is always defined.
    best_n_sends = candidates[0]
    max_ganancia = -np.inf
    for n_sends in candidates:
        ganancia = revenue_from_prob(y_pred, y_ternaria, n_sends)
        if ganancia > max_ganancia:
            max_ganancia = ganancia
            best_n_sends = n_sends
    return best_n_sends

In [31]:
# Candidate send counts explored by sends_optimization (upper bound exclusive).
min_sends = 1000
max_sends = 20000

# Re-attach the label to a copy of the training frame, keep month 202104 only,
# then split features and label again.
# NOTE(review): X_train_04 is a subset of X_train, i.e. of the training months — if
# the models were fitted on these rows the optimum found here is optimistic; confirm.
X_train_04 = X_train.assign(label=y_train)
X_train_04 = X_train_04.loc[X_train_04["foto_mes"].isin([202104])]
y_train_04 = X_train_04.loc[:, "label"]
X_train_04 = X_train_04.drop(columns="label")

# Report the best number of sends separately for each ensemble member.
for model in models:
    y_pred = model.predict(X_train_04)
    best_sends = sends_optimization(y_pred, y_train_04, min_sends, max_sends)
    print(f"Best sends: {best_sends}")

Best sends: 2800
Best sends: 2800
Best sends: 2800
Best sends: 2800
Best sends: 2600
