In [1]:
import logging
import os
import csv
import joblib
import psutil
import gc
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from dmeyf2025.processors.feature_processors import CleanZerosTransformer, DeltaLagTransformer, PercentileTransformer, PeriodStatsTransformer, TendencyTransformer, IntraMonthTransformer, RandomForestFeaturesTransformer, DatesTransformer, HistoricalFeaturesTransformer, AddCanaritos

from dmeyf2025.metrics.revenue import gan_eval
from dmeyf2025.etl.etl import prepare_data
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Root logging configuration: INFO level, timestamped records, console only.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler()],  # console output
)

# Module-level logger used by the rest of the notebook.
logger = logging.getLogger(__name__)



In [3]:
# Experiment settings
VERBOSE = False
experiment_name = "zlgbm-baseline"
training_months = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
       201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
       202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
       202101, 202102, 202103, 202104]
save_model = True
eval_month = 202106
test_month = 202108
seeds = [537919, 923347, 173629, 419351, 287887, 1244, 24341, 1241, 4512, 6554, 62325, 6525235, 14, 4521, 474574, 74543, 32462, 12455, 5124, 55678]
debug_mode = False
sampling_rate = 0.05
# NOTE(review): absolute, user-specific path; consider deriving it from a configurable base directory.
results_file = "/home/martin232009/buckets/b1/results.csv"
fieldnames = ["experiment_name", "seed", "training_time", "moving_average_rev"]
logging.info("comenzando")
features_to_drop = ["cprestamos_prendarios", "mprestamos_prendarios", "cprestamos_personales", "mprestamos_personales"]
canaritos = 10
gradient_bound = 0.01
n_seeds = 5
# Model hyper-parameters handed to train_model.
# NOTE(review): train_model also reads params["seed"], which is not set here;
# presumably it is filled in per seed before training - confirm.
params = {
    "canaritos": canaritos,
    "gradient_bound": gradient_bound,
    "feature_fraction": 0.50,
    "min_data_in_leaf": 20,
}

# Encode the key settings in the experiment name. The "gb" segment must carry
# the gradient bound (it previously repeated the experiment name by mistake).
experiment_name = f"{experiment_name}_c{canaritos}_gb{gradient_bound}_s{sampling_rate}"


2025-11-15 02:30:17,315 - root - INFO - comenzando


In [4]:
def memory_gb(df: pd.DataFrame) -> float:
    """Return the memory reported by ``df.memory_usage()`` expressed in GiB."""
    total_bytes = df.memory_usage().sum()
    bytes_per_gib = 1024 ** 3
    return total_bytes / bytes_per_gib

def apply_transformer(transformer, X, name: str, logger):
    """
    Run ``transformer.fit_transform(X)`` and log how long it took and how the
    frame's memory footprint and shape changed.

    Args:
        transformer: object exposing ``fit_transform``.
        X (pd.DataFrame): input frame.
        name (str): label used as the prefix of every log line.
        logger: logger that receives the progress messages.

    Returns:
        The result of ``transformer.fit_transform(X)``.
    """
    logger.info(f"[{name}] Iniciando…")

    mem_before = memory_gb(X)
    t0 = time.time()

    result = transformer.fit_transform(X)

    elapsed = time.time() - t0
    mem_after = memory_gb(result)

    rows, cols = result.shape

    logger.info(
        f"[{name}] Tiempo: {elapsed:.2f}s | "
        f"Memoria antes: {mem_before:.3f} GB | "
        f"Memoria después: {mem_after:.3f} GB | "
        f"Diferencia: {mem_after - mem_before:+.3f} GB | "
        f"Shape: {rows:,} filas × {cols:,} columnas"
    )

    # Optional inspection output, controlled by the notebook-level VERBOSE flag.
    if VERBOSE:
        display(result.head())
        display(result.describe())
        logger.info(f"Nulos: {result.isna().astype(int).sum()}")

    # Collect garbage before handing the result back.
    gc.collect()
    return result


def get_features(X, training_months):
    """
    Run the feature pipeline on ``X``: zero cleaning, delta/lag features and
    percentile transformation, in that order.

    Args:
        X (pd.DataFrame): input frame.
        training_months: not used by this function.

    Returns:
        The frame produced by the last transformer.
    """
    # Identifier, label and weight columns that must not get lag/delta features.
    lag_excluded = ["foto_mes","numero_de_cliente","target","label","weight","clase_ternaria"]

    features = apply_transformer(CleanZerosTransformer(), X, "CleanZerosTransformer", logger)

    features = apply_transformer(
        DeltaLagTransformer(n_lags=2, exclude_cols=lag_excluded),
        features,
        "DeltaLagTransformer",
        logger,
    )
    logger.info(f"Cantidad de features después de delta lag transformer: {len(features.columns)}")

    features = apply_transformer(
        PercentileTransformer(replace_original=True),
        features,
        "PercentileTransformer",
        logger,
    )

    return features



def train_model(train_set, params):
    """
    Train a LightGBM booster with the notebook's fixed configuration.

    Args:
        train_set: training data passed straight to ``lgb.train``
            (presumably an ``lgb.Dataset`` carrying features, labels and
            weights - confirm against the caller).
        params (dict): must contain:
            - 'seed': random seed.
            - 'min_data_in_leaf'
            - 'feature_fraction'
            - 'canaritos': overfitting control through canary features; per the
              original author's note, a tree whose first split is a canary is
              discarded. Not a stock LightGBM parameter (assumes a modified
              build - TODO confirm).
            - 'gradient_bound': bound applied to the gradient; also assumed to
              be specific to the modified build - TODO confirm.

    Returns:
        The booster returned by ``lgb.train``.

    Raises:
        KeyError: if any required key is missing from ``params``; the message
            lists every missing key.
    """
    required_keys = ("seed", "min_data_in_leaf", "feature_fraction", "canaritos", "gradient_bound")
    missing = [key for key in required_keys if key not in params]
    if missing:
        # Fail before building the config so all missing keys are reported at once.
        raise KeyError(f"train_model: params is missing required keys: {missing}")

    lgb_params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "None",        # no built-in metric; a custom one is used
        "first_metric_only": False,
        "boost_from_average": True,
        "feature_pre_filter": False,
        "force_row_wise": True,
        "verbosity": -100,
        "seed": params["seed"],

        "max_bin": 31,
        "min_data_in_leaf": params["min_data_in_leaf"],

        "num_iterations": 9999,
        "num_leaves": 9999,
        "learning_rate": 1,

        "feature_fraction": params["feature_fraction"],

        # Hyper-parameters specific to the modified LightGBM build
        "canaritos": params["canaritos"],
        "gradient_bound": params["gradient_bound"],
    }

    gbm = lgb.train(
        lgb_params,
        train_set
    )
    return gbm



In [5]:
# Read the dataset
logger.info("Leyendo dataset")
df = pd.read_csv('~/datasets/competencia_02_target.csv')
# Drop the unused features and the rows without a period in a single selection.
# The explicit copy makes the column assignment below act on an independent
# frame instead of a filtered slice (avoids SettingWithCopyWarning).
keep_cols = [col for col in df.columns if col not in features_to_drop]
df = df.loc[df["foto_mes"].notna(), keep_cols].copy()
# Per-class weights. NOTE(review): this mapping is defined but not applied to
# the frame in this cell - confirm whether a "weight" column should be added here.
weight = {"BAJA+1": 1, "BAJA+2": 1.00002, "CONTINUA": 1}
# Binary target: 1 for BAJA+1 / BAJA+2, 0 otherwise
df["target"] = df["clase_ternaria"].isin(["BAJA+1", "BAJA+2"]).astype(int)




2025-11-15 02:30:40,241 - __main__ - INFO - Leyendo dataset


In [16]:
from dmeyf2025.processors.feature_processors import BaseTransformer
from dmeyf2025.utils.data_dict import ALL_CAT_COLS, EXCLUDE_COLS
class TendencyTransformer(BaseTransformer):
    """
    Adds, for each selected column, the slope of a least-squares line fitted
    over every client's trailing window of rows (6 rows by default, one row per
    ``foto_mes``).

    Columns are selected in ``fit``: those whose name starts with ``'m'`` and
    that are neither excluded nor listed in ``ALL_CAT_COLS``. The x coordinate
    of the regression is the row position inside the window, non-finite values
    are skipped, and a slope is produced only when at least two finite values
    fall inside the window (otherwise the result is NaN).

    Note: this notebook-local class shadows the ``TendencyTransformer``
    imported from ``dmeyf2025.processors.feature_processors`` at the top of
    the notebook.

    Args:
        exclude_cols: column names never used as features. ``None`` selects
            ``DEFAULT_EXCLUDE_COLS``.
        window (int): number of trailing rows per client used for each slope.
    """

    # Used when the caller does not pass ``exclude_cols``; a tuple so the
    # default cannot be mutated and shared between instances.
    DEFAULT_EXCLUDE_COLS = ("foto_mes", "numero_de_cliente", "target", "label", "weight")

    def __init__(self, exclude_cols=None, window=6):
        # Parameters are stored untouched; the default is resolved in ``fit``.
        self.exclude_cols = exclude_cols
        self.window = window
        self.numeric_cols_ = None

    def fit(self, X, y=None):
        """Select the columns that will receive a tendency feature.

        Raises:
            ValueError: if ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X debe ser un pandas DataFrame")

        excluded = self.DEFAULT_EXCLUDE_COLS if self.exclude_cols is None else self.exclude_cols
        self.numeric_cols_ = [
            col for col in X.columns
            if col not in excluded and col not in ALL_CAT_COLS and col.startswith('m')
        ]
        return self

    @staticmethod
    def _rolling_slope(y, pos_in_client, window):
        """Vectorised trailing-window regression slope for one column.

        Args:
            y (np.ndarray): float values, rows sorted by client and period.
            pos_in_client (np.ndarray): 0-based position of each row inside
                its client's block.
            window (int): number of trailing rows considered per slope.

        Returns:
            np.ndarray: slope per row, NaN where fewer than two finite values
            are available in the window.
        """
        n_rows = y.shape[0]
        finite = np.isfinite(y)
        # Non-finite values are masked out below; zero them so they never
        # contaminate a product.
        y_clean = np.where(finite, y, 0.0)

        lags = range(max(0, min(window, n_rows)))
        # usable[k][j] tells whether row (j + k) may use the value k rows
        # earlier (row j): same client and finite value.
        usable = [(pos_in_client[k:] >= k) & finite[:n_rows - k] for k in lags]

        # Pass 1: count and sums needed for the window means. The x coordinate
        # of the value k rows back is -k; the slope does not depend on the
        # origin chosen for x.
        n = np.zeros(n_rows)
        sum_x = np.zeros(n_rows)
        sum_y = np.zeros(n_rows)
        for k, ok in zip(lags, usable):
            n[k:] += ok
            sum_x[k:] -= k * ok
            sum_y[k:] += y_clean[:n_rows - k] * ok

        enough = n >= 2
        safe_n = np.where(enough, n, 1.0)
        mean_x = sum_x / safe_n
        mean_y = sum_y / safe_n

        # Pass 2: centred sums of squares / cross products.
        sxx = np.zeros(n_rows)
        sxy = np.zeros(n_rows)
        for k, ok in zip(lags, usable):
            dx = (-k - mean_x[k:]) * ok
            sxx[k:] += dx * dx
            sxy[k:] += dx * (y_clean[:n_rows - k] - mean_y[k:])

        slope = np.full(n_rows, np.nan)
        np.divide(sxy, sxx, out=slope, where=enough & (sxx > 0))
        return slope

    def _transform(self, X):
        """Return ``X`` sorted by client and period with one
        ``<col>_tendency_<window>m`` column added per selected column.

        Raises:
            ValueError: if ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X debe ser un pandas DataFrame")

        X = X.sort_values(['numero_de_cliente', 'foto_mes'])
        clientes = X['numero_de_cliente'].values

        # After sorting, each client occupies one contiguous block of rows.
        _, start_idx, counts = np.unique(clientes, return_index=True, return_counts=True)
        pos_in_client = np.arange(len(clientes)) - np.repeat(start_idx, counts)

        new_cols = {
            f"{col}_tendency_{self.window}m": self._rolling_slope(
                X[col].values.astype(float), pos_in_client, self.window
            )
            for col in self.numeric_cols_
        }

        X_out = X.assign(**new_cols)
        return X_out


In [17]:

# Compute the tendency features on the full frame (label column dropped first).
# The log label names the transformer that is actually being run.
X_transformed = apply_transformer(
    TendencyTransformer(),
    df.drop(columns=["clase_ternaria"]),
    "TendencyTransformer",
    logger
)

2025-11-15 04:26:11,868 - __main__ - INFO - [PeriodStats] Iniciando…


KeyboardInterrupt: 

In [None]:
# Summary statistics of X_transformed; requires the cell that builds it to have completed.
X_transformed.describe()