In [None]:
import os
import csv
import time
import joblib
import pandas as pd
import numpy as np
from flaml import AutoML
import lightgbm as lgb
from flaml.default import preprocess_and_suggest_hyperparams
import logging
from dmeyf2025.processors.feature_processors import CleanZerosTransformer, DeltaLagTransformer, PercentileTransformer, PeriodStatsTransformer, TendencyTransformer, IntraMonthTransformer, RandomForestFeaturesTransformer
from dmeyf2025.metrics.revenue import GANANCIA_ACIERTO, COSTO_ESTIMULO
"""import scipy.stats as stats
if not hasattr(stats, 'binom_test'):
    stats.binom_test = stats.binomtest  # parche compatibilidad
    np.NaN = np.nan"""
from BorutaShap import BorutaShap

logger = logging.getLogger(__name__)
debug_mode = False
sampling_rate = 0.25


  from .autonotebook import tqdm as notebook_tqdm


# Read data

In [2]:
df = pd.read_csv('data/competencia_01_target.csv')
df = df.drop(columns=["mprestamos_personales", "cprestamos_personales"])
weight = {"BAJA+1": 1, "BAJA+2": 1.00002, "CONTINUA": 1}
df["target"] = ((df["clase_ternaria"] == "BAJA+2") | (df["clase_ternaria"] == "BAJA+1")).astype(int)

training_months = [202101, 202102, 202103]
eval_month = 202104
test_month = 202106
seeds = [537919, 923347, 173629, 419351, 287887, 1244, 24341, 1241, 4512, 6554, 62325, 6525235, 14, 4521, 474574, 74543, 32462, 12455, 5124, 55678]
if debug_mode:
    # Sample 0.5% of target=0 cases per month, keep all target=1 rows
    df_list = []
    for mes, df_mes in df[df["target"] == 0].groupby("foto_mes"):
        df_sample = df_mes.sample(frac=0.005, random_state=42)
        df_list.append(df_sample)
    df_target_0_sampled = pd.concat(df_list, axis=0)
    df_target_1 = df[df["target"] == 1]
    df = pd.concat([df_target_0_sampled, df_target_1], axis=0).reset_index(drop=True)
    seeds = [42]


# Extra functions

In [3]:
def gan_eval(y_pred, weight, window=2001):
    """
    Evalúa la ganancia máxima usando una media móvil centrada con ventana de tamaño `window`.
    Retorna el mejor valor encontrado.
    """
    ganancia = np.where(weight == 1.00002, GANANCIA_ACIERTO, 0) - np.where(weight < 1.00002, COSTO_ESTIMULO, 0)
    ganancia = ganancia[np.argsort(y_pred)[::-1]]
    ganancia = np.cumsum(ganancia)
    sends = np.argmax(ganancia)
    opt_sends = np.argmax(ganancia)
    if opt_sends - (window-1)/2 < 0:
        min_sends = 0
    else:
        min_sends = int(opt_sends - (window-1)/2)
    if opt_sends + (window-1)/2 > len(ganancia):
        max_sends = len(ganancia)
    else:
        max_sends = int(opt_sends + (window-1)/2)
    
    mean_ganancia = np.mean(ganancia[min_sends:max_sends])
    # Calcula la media móvil centrada con la ventana especificada
    ventana = window
    pad = ventana // 2
    ganancia_padded = np.pad(ganancia, (pad, ventana - pad - 1), mode='edge')
    # Calcula la media móvil centrada
    medias_moviles = np.convolve(ganancia_padded, np.ones(ventana)/ventana, mode='valid')


    # Obtiene el máximo de la media móvil centrada
    mejor_ganancia = np.max(medias_moviles)
    return mejor_ganancia, mean_ganancia
def gan(X_val,
    y_val,
    estimator,
    labels,
    X_train,
    y_train,
    weight_val=None,
    weight_train=None,
    *args,
):  
    y_pred = estimator.predict_proba(X_train)
    ganancia_train, g_mean_train = gan_eval(y_pred, weight_train)
    y_pred = estimator.predict(X_val)
    ganancia_val, g_mean_val = gan_eval(y_pred, weight_val)
    return -ganancia_val, {"ganancia_val": ganancia_val, "ganancia_train":ganancia_train, "g_mean_train":g_mean_train, "g_mean_val":g_mean_val}

In [4]:
def zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval, save_model=True):
    automl = AutoML()
    # Entrenamiento
    print(f"Iniciando experimento {experiment_name}...")
    gains = []
    times = []
    for seed in seeds:
        training_start_time = time.time()
        settings["seed"] = seed
        (
        hyperparams,
        estimator_class,
        X_transformed,
        y_transformed,
        feature_transformer,
        label_transformer,
        ) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "lgbm")
        model = estimator_class(**hyperparams, seed = seed)  # estimator_class is lightgbm.LGBMClassifier

        model.fit(X_transformed, y_train)  # LGBMClassifier can handle raw labels
        X_val = feature_transformer.transform(X_eval)  # preprocess test data
        y_pred = model.predict_proba(X_val)[:,1]
        rev = gan_eval(y_pred, w_eval, window=100)
        training_end_time = time.time()
        training_time = training_end_time - training_start_time
        print(f"Seed: {seed}")
        print("Ganancia:", rev, "Tiempo de entrenamiento:", training_time)
        gains.append(rev)
        times.append(training_time)
        # Prepare row data
        result_row = {
            "experiment_name": experiment_name,
            "gain": rev,
            "seed": seed,
            "training_time": training_time,
            "hyperparameters": repr(model.get_params())
        }

        write_header = not os.path.exists(results_file)
        with open(results_file, "a", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
            writer.writerow(result_row)
        if save_model:
            joblib.dump(model, f"models/{experiment_name}_{seed}.pkl")
            save_model = False
    return np.mean(gains), np.sum(times)

In [10]:
from dmeyf2025.processors.sampler import SamplerProcessor

def prepare_data(df, training_months, eval_month, test_month, get_features):
    df["label"] = ((df["clase_ternaria"] == "BAJA+2") | (df["clase_ternaria"] == "BAJA+1")).astype(int)
    df["weight"] = np.array([weight[item] for item in df["clase_ternaria"]])
    df = df.drop(columns=["clase_ternaria"])
    df_transformed = get_features(df, training_months)
    df_train = df_transformed[df_transformed["foto_mes"].isin(training_months)]
    df_eval = df_transformed[df_transformed["foto_mes"] == eval_month]
    df_test = df_transformed[df_transformed["foto_mes"] == test_month]

    y_eval = df_eval["label"]
    w_eval = df_eval["weight"]
    X_eval = df_eval.drop(columns=["label", "weight"])


    y_test = df_test["label"]
    w_test = df_test["weight"]
    X_test = df_test.drop(columns=["label", "weight"])

    y_train = df_train["label"]
    X_train = df_train.drop(columns=["label"])
    X_train, y_train = SamplerProcessor(sampling_rate).fit_transform(X_train, y_train)
    w_train = X_train["weight"]
    X_train = X_train.drop(columns=["weight"])
    return X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test

In [11]:

settings = {
    "time_budget": None,               # segundos
    "max_iter": 0,
    "starting_points": "data",        # Arrancamos con zero-shot
    "metric": gan,                    # métrica custom
    "task": "classification",         # binaria
    "estimator_list": ["lgbm"],
    "log_file_name": "zero-shot.log",
    "eval_method": "holdout",         
    "verbose": 1,
    "n_jobs": -1,
}

results_file = "results.csv"
fieldnames = ["experiment_name", "gain", "seed", "training_time", "hyperparameters"]

# Baseline

- Sacar prestamos personales
- Lags y Delta Lags de orden 2

In [12]:
experiment_name = "zero_shot_baseline"
def get_features(X, training_months):
    logger.info("Iniciando delta lag transformer...")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight", "clase_ternaria"])
    X_transformed = delta_lag_transformer.fit_transform(X)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")
    return X_transformed

In [13]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [14]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")

Iniciando experimento zero_shot_baseline...
Seed: 42
Ganancia: (np.float64(861326000.0), np.float64(852016969.6969697)) Tiempo de entrenamiento: 24.769181966781616
Ganancia promedio: 856671484.8484849, Tiempo total: 24.769181966781616


# Zero-Clean

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Lags y Delta Lags de orden 2


In [15]:
experiment_name = "zero_shot_zero_clean"

In [16]:
def get_features(X, training_months):
    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    logger.info("Iniciando delta lag transformer...")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")
    return X_transformed

In [17]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [18]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")

Iniciando experimento zero_shot_zero_clean...


KeyboardInterrupt: 

# Percentiles 5

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Lags y Delta Lags de orden 2
- Percentiles discretizados en saltos de 5



In [None]:
experiment_name = "zero_shot_percentiles_5"

In [None]:
def get_features(X, training_months):

    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    logger.info("Iniciando delta lag transformer...")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")

    # Percentiles discretizados en saltos de 5
    percentiles_transformer = PercentileTransformer(n_bins=5)
    X_transformed = percentiles_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de percentiles transformer: {len(X_transformed.columns)}")
    return X_transformed

In [None]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [None]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")
ganancia_intra_month_5 = mean_rev

Iniciando experimento zero_shot_percentiles_5...
Seed: 42
Ganancia: (np.float64(846127000.0), np.float64(845755774.6478873)) Tiempo de entrenamiento: 35.675193786621094
Ganancia promedio: 845941387.3239436, Tiempo total: 35.675193786621094


# Percentiles 1

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Lags y Delta Lags de orden 2
- Percentiles discretizados en saltos de 1



In [None]:
experiment_name = "zero_shot_percentiles_1"

In [None]:
def get_features(X, training_months):

    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    logger.info("Iniciando delta lag transformer...")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")

    # Percentiles discretizados en saltos de 1
    percentiles_transformer = PercentileTransformer(n_bins=1)
    X_transformed = percentiles_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de percentiles transformer: {len(X_transformed.columns)}")
    return X_transformed

In [None]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [None]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")
ganancia_intra_month_1 = mean_rev

Iniciando experimento zero_shot_percentiles_1...
Seed: 42
Ganancia: (np.float64(846103000.0), np.float64(845781194.0298507)) Tiempo de entrenamiento: 37.018494844436646
Ganancia promedio: 845942097.0149254, Tiempo total: 37.018494844436646


# Intra Month F.E

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Feature engineering intra mes
- Lags y Delta Lags de orden 2
- Percentiles discretizados en saltos de 1 o 5, el que de mejores resultados




In [None]:
experiment_name = "zero_shot_intra_month_fe"
n_bins = 5 if ganancia_intra_month_5 > ganancia_intra_month_1 else 1

In [None]:
def get_features(X, training_months):

    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    logger.info("Iniciando delta lag transformer...")
    intra_month_transformer = IntraMonthTransformer()
    X_transformed = intra_month_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de intra month transformer: {len(X_transformed.columns)}")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")

    # Percentiles discretizados en saltos de 5
    percentiles_transformer = PercentileTransformer(n_bins=n_bins)
    X_transformed = percentiles_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de percentiles transformer: {len(X_transformed.columns)}")
    return X_transformed

In [None]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [None]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")

Iniciando experimento zero_shot_intra_month_fe...
Seed: 42
Ganancia: (np.float64(846055000.0), np.float64(845517027.027027)) Tiempo de entrenamiento: 36.579959869384766
Ganancia promedio: 845786013.5135136, Tiempo total: 36.579959869384766


# Historical

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Feature engineering intra mes
- Lags y Delta Lags de orden 2
- Tendencias
- Stats de periodos
- Percentiles discretizados en saltos de 1 o 5, el que de mejores resultados




In [None]:
experiment_name = "zero_shot_historical"
n_bins = 5 if ganancia_intra_month_5 > ganancia_intra_month_1 else 1

In [None]:
def get_features(X, training_months):
    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    initial_columns = X_transformed.columns
    logger.info("Iniciando delta lag transformer...")
    intra_month_transformer = IntraMonthTransformer()
    X_transformed = intra_month_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de intra month transformer: {len(X_transformed.columns)}")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])

    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")
    logger.info("Iniciando tendency transformer...")
    new_columns = set(X_transformed.columns) - set(initial_columns)
    tendency_transformer = TendencyTransformer(exclude_cols=["foto_mes", "numero_de_cliente", "target", "label", "weight"] + list(new_columns))
    X_transformed = tendency_transformer.fit_transform(X_transformed)
    new_columns = set(X_transformed.columns) - set(initial_columns)

    logger.info(f"Cantidad de features después de tendency transformer: {len(X_transformed.columns)}")

    logger.info("Iniciando period stats transformer...")
    period_stats_transformer = PeriodStatsTransformer(periods=[2, 3], exclude_cols=list(new_columns) + ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = period_stats_transformer.fit_transform(X_transformed)
    new_columns = set(X_transformed.columns) - set(initial_columns)
    logger.info(f"Cantidad de features después de period stats transformer: {len(X_transformed.columns)}")
    # Percentiles discretizados en saltos de 5
    percentiles_transformer = PercentileTransformer(n_bins=n_bins)
    X_transformed = percentiles_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de percentiles transformer: {len(X_transformed.columns)}")
    return X_transformed

In [None]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [None]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")

Iniciando experimento zero_shot_historical...
Seed: 42
Ganancia: (np.float64(846063000.0), np.float64(845657567.5675676)) Tiempo de entrenamiento: 65.5582070350647
Ganancia promedio: 845860283.7837838, Tiempo total: 65.5582070350647


# Random Forest Features

- Sacar prestamos personales
- Pasar ceros a Nan en los casos que corresponda
- Feature engineering intra mes
- Lags y Delta Lags de orden 2
- Tendencias
- Stats de periodos
- Percentiles discretizados en saltos de 1 o 5, el que de mejores resultados
- Random Forest Features




In [None]:
experiment_name = "zero_shot_random_forest_features"
n_bins = 5 if ganancia_intra_month_5 > ganancia_intra_month_1 else 1

In [None]:
def get_features(X, training_months):
    clean_zeros_transformer = CleanZerosTransformer()
    X_transformed = clean_zeros_transformer.fit_transform(X)
    initial_columns = X_transformed.columns
    logger.info("Iniciando delta lag transformer...")
    intra_month_transformer = IntraMonthTransformer()
    X_transformed = intra_month_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de intra month transformer: {len(X_transformed.columns)}")
    delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2, exclude_cols= ["foto_mes", "numero_de_cliente", "target", "label", "weight"])

    X_transformed = delta_lag_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de delta lag transformer: {len(X_transformed.columns)}")
    logger.info("Iniciando tendency transformer...")
    new_columns = set(X_transformed.columns) - set(initial_columns)
    tendency_transformer = TendencyTransformer(exclude_cols=["foto_mes", "numero_de_cliente", "target", "label", "weight"] + list(new_columns))
    X_transformed = tendency_transformer.fit_transform(X_transformed)
    new_columns = set(X_transformed.columns) - set(initial_columns)

    logger.info(f"Cantidad de features después de tendency transformer: {len(X_transformed.columns)}")

    logger.info("Iniciando period stats transformer...")
    period_stats_transformer = PeriodStatsTransformer(periods=[2, 3], exclude_cols=list(new_columns) + ["foto_mes", "numero_de_cliente", "target", "label", "weight"])
    X_transformed = period_stats_transformer.fit_transform(X_transformed)
    new_columns = set(X_transformed.columns) - set(initial_columns)
    logger.info(f"Cantidad de features después de period stats transformer: {len(X_transformed.columns)}")
    # Percentiles discretizados en saltos de 5
    percentiles_transformer = PercentileTransformer(n_bins=n_bins)
    X_transformed = percentiles_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de percentiles transformer: {len(X_transformed.columns)}")
    
    logger.info("Iniciando RandomForest Feature Transformer...")
    random_forest_features_transformer = RandomForestFeaturesTransformer(training_months= training_months)  
    X_transformed = random_forest_features_transformer.fit_transform(X_transformed)
    logger.info(f"Cantidad de features después de RandomForest Feature Transformer: {len(X_transformed.columns)}")
    return X_transformed

In [None]:
X_train, y_train, w_train, X_eval, y_eval, w_eval, X_test, y_test = prepare_data(df, training_months, eval_month, test_month, get_features)

In [None]:
mean_rev, total_time = zero_shot_experiment(experiment_name, seeds, results_file, fieldnames, settings, X_train, y_train, w_train, X_eval, y_eval, w_eval)
print(f"Ganancia promedio: {mean_rev}, Tiempo total: {total_time}")

Iniciando experimento zero_shot_random_forest_features...
Seed: 42
Ganancia: (np.float64(861346000.0), np.float64(853895555.5555556)) Tiempo de entrenamiento: 73.48208999633789
Ganancia promedio: 857620777.7777778, Tiempo total: 73.48208999633789
