In [1]:
import argparse
from datetime import datetime
import logging
import time
import os
import random
import numpy as np
from dmeyf2025.experiments import experiment_init, save_experiment_results
from dmeyf2025.processors.feature_processors import CleanZerosTransformer, DeltaLagTransformer, PercentileTransformer, PeriodStatsTransformer, TendencyTransformer
from dmeyf2025.utils.features_check import check_features
from dmeyf2025.utils.data_dict import FINANCIAL_COLS
from dmeyf2025.utils.wilcoxon import compare_with_best_model
from dmeyf2025.utils.scale_params import scale_params
from dmeyf2025.pipelines import load_data, preprocessing_pipeline, optimization_pipeline, evaluation_pipeline, production_pipeline

# Debug switch for this notebook; it is handed to experiment_init(debug=...) later on.
FORCE_DEBUG = True

# Root logging setup: INFO and above, written through a stream handler, with a
# short "HH:MM:SS | LEVEL | message" layout.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Logger used by every cell in this notebook.
logger = logging.getLogger(__name__)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_features(X):
    """Run the feature-engineering chain over ``X`` and return the result.

    The transformers are applied in sequence, each consuming the previous
    step's output: clean zeros -> tendency -> period stats (periods 2 and 3)
    -> delta/lag (2 deltas, 2 lags) -> percentile (``replace_original=True``).

    The identifier/target columns listed in ``reserved_cols`` are excluded from
    the first four steps. The period-stats and delta-lag steps additionally
    exclude every column that was not present in the input ``X`` at the time
    they run (i.e. columns created by earlier steps).

    Args:
        X: Tabular object exposing ``.columns`` (presumably a pandas
            DataFrame -- confirm against the caller).

    Returns:
        Whatever the final ``PercentileTransformer.fit_transform`` call returns.
    """
    # Kept as a tuple and copied into a fresh list for every transformer, so
    # no two transformers ever share the same list object.
    reserved_cols = ("foto_mes", "numero_de_cliente", "target", "label", "weight")

    logger.info(f"Cantidad de features: {len(X.columns)}")
    input_cols = set(X.columns)

    logger.info("Iniciando clean zeros transformer...")
    features = CleanZerosTransformer(exclude_cols=list(reserved_cols)).fit_transform(X)

    logger.info("Iniciando tendency transformer...")
    features = TendencyTransformer(exclude_cols=list(reserved_cols)).fit_transform(features)
    # Columns that did not exist in the input frame.
    derived_cols = set(features.columns) - input_cols

    logger.info(f"Cantidad de features despu√©s de tendency transformer: {len(features.columns)}")

    logger.info("Iniciando period stats transformer...")
    features = PeriodStatsTransformer(
        periods=[2, 3],
        exclude_cols=list(derived_cols) + list(reserved_cols),
    ).fit_transform(features)
    # Refresh the derived set so it also covers the period-stats columns.
    derived_cols = set(features.columns) - input_cols
    logger.info(f"Cantidad de features despu√©s de period stats transformer: {len(features.columns)}")

    logger.info("Iniciando delta lag transformer...")
    features = DeltaLagTransformer(
        n_deltas=2,
        n_lags=2,
        exclude_cols=list(derived_cols) + list(reserved_cols),
    ).fit_transform(features)
    logger.info(f"Cantidad de features despu√©s de delta lag transformer: {len(features.columns)}")

    logger.info("Iniciando percentile transformer...")
    features = PercentileTransformer(variables=None, replace_original=True).fit_transform(features)
    logger.info(f"Cantidad de features despu√©s de percentile transformer: {len(features.columns)}")
    return features

In [3]:
# Initialize the experiment from config.yaml, forwarding the notebook's debug switch.
experiment_config = experiment_init("config.yaml", script_file=None, debug=FORCE_DEBUG)

# DEBUG reflects the DEBUG_MODE environment variable (case-insensitive "true").
DEBUG = os.getenv("DEBUG_MODE", "False").lower() == "true"
date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
seeds = experiment_config["seeds"]

# Seed both the stdlib and NumPy global RNGs with the first configured seed.
random.seed(seeds[0])
np.random.seed(seeds[0])

# Initial log banner: a leading newline, a 70-character rule, four detail lines,
# and a closing rule.
logger.info(
    "\n".join(
        [
            "",
            "=" * 70,
            f"üìÖ {date_time}",
            f"üìù Iniciando experimento: {experiment_config['experiment_name']}",
            f"üéØ Descripci√≥n: {experiment_config['config']['experiment']['description']}",
            f"üîß Experiment folder: {experiment_config['experiment_folder']}",
            "=" * 70,
        ]
    )
)
# Wall-clock reference taken after the banner is logged.
start_time = time.time()


18:23:40 | INFO | üöÄ INICIANDO EXPERIMENTO DEBUG_all_features EN MODO DEBUG
18:23:40 | INFO | 
üìÖ 2025-11-02 18:23:40
üìù Iniciando experimento: DEBUG_all_features
üéØ Descripci√≥n: test
üîß Experiment folder: DEBUG_all_features_sr_0.1-t_70-mt_202010_202104-me_202106_1.0.0


In [4]:
# Load the data for the configured experiment; load_data's result is unpacked
# into X and y (presumably the feature table and the target -- confirm in load_data).
X, y = load_data(experiment_config)


18:23:40 | INFO | Iniciando ETL pipeline...
18:23:40 | INFO | Iniciando pipeline ETL completo...
18:23:43 | INFO | Archivo leído exitosamente: 978439 filas, 153 columnas, se eliminaron 2 columnas
18:23:43 | INFO | Se filtraron 978439 filas, 153 columnas
18:23:47 | INFO | Procesamiento completado: 978439 filas, 155 columnas
18:23:47 | INFO | X shape: (978439, 154), y shape: (978439,)
18:23:47 | INFO | Pipeline ETL completado exitosamente!


In [5]:
# Preprocessing pipeline: get_features is passed in as the feature-building
# callback, and the result is unpacked into (X, y, w) triples for the train,
# eval and prod splits.
(
    X_train, y_train, w_train,
    X_eval, y_eval, w_eval,
    X_prod, y_prod, w_prod,
) = preprocessing_pipeline(X, y, experiment_config, get_features)

18:23:47 | INFO | Iniciando preprocessing pipeline...
18:23:47 | INFO | Iniciando procesamiento de features...
18:23:47 | INFO | Cantidad de features: 156
18:23:47 | INFO | Iniciando clean zeros transformer...
18:23:47 | INFO | Iniciando tendency transformer...
18:23:56 | INFO | Cantidad de features despu√©s de tendency transformer: 201
18:23:56 | INFO | Iniciando period stats transformer...
18:23:56 | INFO | Cantidad de features despu√©s de period stats transformer: 561
18:23:56 | INFO | Iniciando delta lag transformer...
18:23:58 | INFO | Cantidad de features despu√©s de delta lag transformer: 1121
18:23:58 | INFO | Iniciando percentile transformer...
18:24:04 | INFO | Cantidad de features despu√©s de percentile transformer: 1121
18:24:04 | INFO | Iniciando split de datos...
18:24:04 | INFO | X_train.shape: (48631, 1119)
18:24:05 | INFO | X_eval.shape: (16382, 1119)
18:24:05 | INFO | X_prod.shape: (16315, 1118)


In [None]:
# Scratch notes pasted from the preprocessing log (kept verbatim as comments;
# as bare lines they are not valid Python and made the cell raise SyntaxError):
#   INFO | X_train.shape: (16382, 1119)
#   INFO | X_prod.shape: (16315, 1118)
#
# Diagnostic: columns present in X_train but absent from X_prod.
# A previously noted result for this expression was `set()`.
# The expression is deliberately the last line so the cell displays its value.
set(X_train.columns) - set(X_prod.columns)

set()

In [19]:
# For each frame, print two counts on one line: duplicated column labels, then
# duplicated index labels. X_train first, X_prod second.
print(*(axis.duplicated().sum() for axis in (X_train.columns, X_train.index)))
print(*(axis.duplicated().sum() for axis in (X_prod.columns, X_prod.index)))







0 4586
0 0
