## Precarga de librerias y funciones

In [1]:
%%capture
%run "../recurrentes.ipynb"

In [2]:
%%capture
%run "../funciones.ipynb"

In [3]:
# 1 configuración de la base de datos

%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
%sql duckdb:///:memory:


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [4]:
# 2 Preseteo optuna, especificar bucket y nombre del estudio

nombre_archivo = 'optimization_tree.db'
bucket = 'b1/'
estudio_optuna = save_path #definido en recurrentes.ipynb
nombre_estudio = 'competencia_03_final'


In [6]:
# 3 - Cargar datos

# Opciones:

# a) dataset_clase_ternaria

df_train = pd.read_parquet(dataset_competencia_03_lags_deltas_y_clase_ternaria_l)



In [7]:
lista_columnas = df_train.columns
listas_drop = [col for col in lista_columnas if 'lag' in col]
df_train = df_train.drop(columns=listas_drop)
lista_drop_meses = [201901, 201902, 201903, 201905, 201907, 201911, 202011, 202012,
       202101, 202102, 202103, 202104, 202105, 202106, 202107,202108, 202109]
df_train = df_train[df_train['foto_mes'].isin(lista_drop_meses)]
df_train.shape
df_train['foto_mes'].unique()

In [10]:
columnas_num = df_train.select_dtypes(exclude=['bool','object'])
lista_columnas_num = columnas_num.columns

df_train['max'] = df_train[lista_columnas_num].max(axis=1)

In [11]:
variables = [
    'cmobile_app_trx', 'Master_cconsumos', 'ctarjeta_debito_transacciones',
    'ctarjeta_visa_transacciones', 'ctarjeta_master_transacciones', 'cpayroll_trx',
    'cpayroll2_trx', 'ccuenta_debitos_automaticos', 'cpagodeservicios', 'cforex',
    'cforex_buy', 'mforex_sell', 'cextraccion_autoservicio',
    'ccallcenter_transacciones', 'chomebanking_transacciones'
]
if 'ctrx_quarter' not in df_train.columns:
    raise KeyError("La variable 'ctrx_quarter' no existe en el dataset.")
variables_presentes = [var for var in variables if var in df_train.columns]
if variables_presentes:
    for var in variables_presentes:
        denominator = df_train[var].replace(0, 1e-6) ** 2
        df_train['power_ratio_' + var] = df_train['ctrx_quarter'] / denominator
    variables_faltantes = [var for var in variables if var not in df_train.columns]
    if variables_faltantes:
        print(f"Las siguientes variables no existen en el dataset y no se calcularon: {variables_faltantes}")
else:
    print("Ninguna de las variables especificadas existe en el dataset.")

## Formateo pre optimización

In [13]:
if 'T_Visa_normal' in df_train.columns:
    df_train['T_Visa_normal'] = df_train['T_Visa_normal'].astype(bool)
if 'T_Master_normal'in df_train.columns:
    df_train['T_Master_normal'] = df_train['T_Master_normal'].astype(bool)


In [14]:
# 4 - parametros para modelo
ganancia_acierto = 273000
costo_estimulo = 7000
mes_train = [201901, 201902, 201903, 201905, 201907, 201911, 202011, 202012,
       202101, 202102, 202103, 202104, 202105, 202106, 202107]
mes_test = 202109

In [15]:
data = df_train

In [16]:
data['tmobile_app'] = pd.to_numeric(data['tmobile_app'], errors='coerce')  # Convierte no numéricos a NaN
data['tmobile_app'] = data['tmobile_app'].fillna(0).astype('bool')  # Llena los NaN y convierte a booleano



data['cmobile_app_trx'] = pd.to_numeric(data['cmobile_app_trx'], errors='coerce')  # Convierte no numéricos a NaN
data['cmobile_app_trx'] = data['cmobile_app_trx'].fillna(0).astype('bool')  # Llena los NaN y convierte a booleano

In [17]:

data['clase_peso'] = 1.0
data.loc[data['clase_ternaria'] == 'BAJA+2', 'clase_peso'] = 1.00002
data.loc[data['clase_ternaria'] == 'BAJA+1', 'clase_peso'] = 1.00001
data['clase_binaria'] = np.where(data['clase_ternaria']=='BAJA+2', 1, 0)
df_test = data[data['foto_mes'] == mes_test]
df_train = data[data['foto_mes'].isin(mes_train)]

clase_peso = df_train['clase_peso']
X_train = df_train.drop(['clase_ternaria', 'clase_binaria', 'clase_peso'], axis=1)
Y_train =df_train['clase_binaria']
X_test = df_test.drop(['clase_ternaria', 'clase_binaria', 'clase_peso'], axis=1)
Y_test =df_test['clase_binaria']
w_train = df_train.loc[X_train.index, 'clase_peso']


In [18]:
del data

## Optimización

In [19]:
optuna.logging.set_verbosity(optuna.logging.DEBUG)

In [20]:
import optuna
import logging
import sys
import lightgbm as lgb

# Configurar el logging
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
optuna.logging.set_verbosity(optuna.logging.DEBUG)


def objective(trial):
    num_leaves = trial.suggest_int('num_leaves', 200, 10000)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3)  # mas bajo, más iteraciones necesita
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 8000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.3, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.3, 1.0)

    params = {
        'objective': 'binary',
        'metric': 'custom', 
        'boosting_type': 'gbdt',
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,        
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    train_data = lgb.Dataset(X_train,
                             label=Y_train,  # elegir la clase
                             weight=w_train)
                             
    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=1000,  # modificar, subir y subir... y descomentar la línea inferior
        feval=lgb_gan_eval,
        stratified=True,
        nfold=5,
        seed=semillas[0]
        
    )

    max_gan = max(cv_results['valid gan_eval-mean'])
    best_iter = cv_results['valid gan_eval-mean'].index(max_gan) + 1

    # Guardamos cuál es la mejor iteración del modelo
    trial.set_user_attr("best_iter", best_iter)

    # Imprimir información de la prueba actual
    

    return max_gan * 5

# Ajuste de la verbosidad de Optuna ya incluido arriba
# optuna.logging.set_verbosity(optuna.logging.INFO)

storage_name = rf"sqlite:///{estudio_optuna}"

study_name = nombre_estudio

# Crear el callback con 300 pruebas consecutivas sin mejora

# Crear el estudio
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Optimizar el estudio con el callback personalizado
study.optimize(objective, n_trials=1000, show_progress_bar=True)


  0%|          | 0/1000 [00:00<?, ?it/s]

[W 2024-12-04 16:30:15,502] Trial 8 failed with parameters: {'num_leaves': 8246, 'learning_rate': 0.2862039982913327, 'min_data_in_leaf': 7731, 'feature_fraction': 0.6025938649151114, 'bagging_fraction': 0.5376326865495187} because of the following error: LightGBMError('Check failed: (best_split_info.left_count) > (0) at D:\\a\\1\\s\\lightgbm-python\\src\\treelearner\\serial_tree_learner.cpp, line 846 .\n').
Traceback (most recent call last):
  File "c:\Users\Admin\.conda\envs\datascience\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_12208\2862446120.py", line 44, in objective
    cv_results = lgb.cv(
                 ^^^^^^^
  File "c:\Users\Admin\.conda\envs\datascience\Lib\site-packages\lightgbm\engine.py", line 826, in cv
    cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\.conda\envs

LightGBMError: Check failed: (best_split_info.left_count) > (0) at D:\a\1\s\lightgbm-python\src\treelearner\serial_tree_learner.cpp, line 846 .


In [None]:
best_iter =  study.best_trial.user_attrs["best_iter"]
best_iter

104

In [None]:
best_params = study.best_params
best_params

{'num_leaves': 7403,
 'learning_rate': 0.04206182379229533,
 'min_data_in_leaf': 191,
 'feature_fraction': 0.4078072640925111,
 'bagging_fraction': 0.300397113625344}