## Precarga de librerías y funciones

In [1]:
%%capture
%run "../../recurrentes.ipynb"

In [2]:
%%capture
%run "../../funciones.ipynb"

In [3]:
# 1 - Database configuration: load the `sql` IPython extension, set its
#     SqlMagic options and connect it to an in-memory DuckDB database.

%load_ext sql
# Presumably makes SQL cells return pandas DataFrames — TODO confirm against the extension docs.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
%sql duckdb:///:memory:


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [4]:
# 2 - Optuna preset: choose the bucket and the name of the study

# File name of the SQLite database used as Optuna storage, and the bucket
# folder that contains it.
nombre_archivo = 'optimization_tree.db'
bucket = 'b2/'

# Full path of the storage file. It is built from `nombre_archivo` so that
# editing the variable above really changes which file is opened.
# `base_path` is not defined in this cell; it must already exist in the kernel.
estudio_optuna = base_path + 'buckets/' + bucket + nombre_archivo


# Study to load. Names listed by the author as alternatives:
# a) competencia_02
# b) competencia_02_lags
# c) competencia_02_lags_y_deltas
# b1 -- competencia_02_lags_deltas_psi_fe
# b1 - 'competencia_02_lags_deltas_psi_fe_sorted'

nombre_estudio = 'competencia_03_solo_deltas_max_div_sin_pandemia'


In [5]:
# 3 - Load data

# Options:

# a) dataset_clase_ternaria

# `dataset_competencia_03_lags_deltas_y_clase_ternaria` is not defined in the
# visible cells; presumably a parquet path provided by the notebooks executed
# with %run at the top — TODO confirm.
df_train = pd.read_parquet(dataset_competencia_03_lags_deltas_y_clase_ternaria)



In [6]:
# Collect the name of every column that contains the substring 'lag'.
columnas = df_train.columns
columnas_lags = list(filter(lambda nombre: 'lag' in nombre, columnas))

In [7]:
# Remove the columns listed in `columnas_lags`. Not idempotent: running this
# cell a second time raises because the columns are already gone.
df_train = df_train.drop(columns=columnas_lags)

In [8]:
# Sanity check: (rows, columns) of the frame at this point.
df_train.shape

(4901237, 320)

In [9]:
# Names of every column that is neither bool nor object.
# Only the column labels are needed, so the dtype filter runs on a zero-row
# slice (dtypes are preserved) instead of materialising, and keeping alive in
# `columnas_num`, a copy of all the numeric data.
columnas_num = df_train.head(0).select_dtypes(exclude=['bool', 'object'])
lista_columnas_num = columnas_num.columns

In [10]:
# New feature 'max': row-wise maximum over every column in `lista_columnas_num`.
# NOTE(review): that list holds all non-bool / non-object columns, so it may
# include identifier or period columns (e.g. foto_mes) — confirm this is intended.
df_train['max'] = df_train[lista_columnas_num].max(axis=1)

In [11]:
# Columns used as denominators of the power ratios
variables = [
    'cmobile_app_trx', 'Master_cconsumos', 'ctarjeta_debito_transacciones',
    'ctarjeta_visa_transacciones', 'ctarjeta_master_transacciones', 'cpayroll_trx',
    'cpayroll2_trx', 'ccuenta_debitos_automaticos', 'cpagodeservicios', 'cforex',
    'cforex_buy', 'mforex_sell', 'cextraccion_autoservicio', 'ccallcenter_transacciones',
    'chomebanking_transacciones'
]

# 'ctrx_quarter' is the numerator of every ratio, so it has to be present
if 'ctrx_quarter' not in df_train.columns:
    raise KeyError("La variable 'ctrx_quarter' no existe en el dataset.")

# Split the requested names into present / missing using one snapshot of the
# columns (the columns added below are all prefixed 'power_ratio_', so they
# cannot change either list)
columnas_disponibles = set(df_train.columns)
variables_presentes = [var for var in variables if var in columnas_disponibles]
variables_faltantes = [var for var in variables if var not in columnas_disponibles]

if not variables_presentes:
    print("No se encontraron variables para calcular la Power Ratio.")

for var in variables_presentes:
    # Zeros in the denominator are replaced by 1e-6 before squaring to avoid
    # dividing by zero
    denominator = df_train[var].replace(0, 1e-6).pow(2)
    # One new column per variable: power_ratio_<var> = ctrx_quarter / var**2
    df_train[f'power_ratio_{var}'] = df_train['ctrx_quarter'].div(denominator)

# Report which of the requested variables were not found
if variables_faltantes:
    print(f"Las siguientes variables no existen en el dataset y no se calcularon: {variables_faltantes}")
else:
    print("Todas las variables fueron verificadas correctamente.")


Todas las variables fueron verificadas correctamente.


In [12]:
# df_train.drop(columns=dicc_psi['dataset_lags_deltas_y_clase_ternaria'], inplace= True)
# df_train.drop(columns=dicc_psi['lista_light_gbm_feature_importance'], inplace = True)


# Light GBM

## Formateo pre modelo

In [13]:
# Cast the two T_*_normal columns to bool, skipping any that is absent.
for columna_flag in ('T_Visa_normal', 'T_Master_normal'):
    if columna_flag in df_train.columns:
        df_train[columna_flag] = df_train[columna_flag].astype(bool)


In [14]:
# 4 - Model parameters
# Gain of a correct hit and cost of each stimulus. Neither is referenced in the
# visible cells; presumably they are consumed by the custom metric
# `lgb_gan_eval` defined elsewhere — TODO confirm.
ganancia_acierto = 273000
costo_estimulo = 7000

# Training months: all of 2019, then 202010 through 202105
# (202001-202009 are left out).
mes_train = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
       201909, 201910, 201911, 201912, 202010, 202011, 202012,
       202101, 202102, 202103, 202104, 202105]
# NOTE(review): `mes_test` is not used by the split cell further down, which
# filters foto_mes == 202107 directly — confirm which month is the intended one.
mes_test = 202108


In [15]:
# Alias, not a copy: `data` and `df_train` are the same DataFrame object, so
# in-place column edits made through `data` are also visible through `df_train`.
data = df_train

In [16]:
# For both mobile-app columns: coerce to numeric (non-numeric values become
# NaN), fill the NaNs with 0 and cast to bool.
for columna_mobile in ('tmobile_app', 'cmobile_app_trx'):
    como_numero = pd.to_numeric(data[columna_mobile], errors='coerce')
    data[columna_mobile] = como_numero
    data[columna_mobile] = data[columna_mobile].fillna(0).astype('bool')

In [18]:

# Sample weights: 1.0 by default, slightly higher for the two BAJA classes.
data['clase_peso'] = 1.0
for etiqueta, peso in (('BAJA+2', 1.00002), ('BAJA+1', 1.00001)):
    data.loc[data['clase_ternaria'] == etiqueta, 'clase_peso'] = peso

# Binary target: 1 only for 'BAJA+2', 0 for everything else.
es_baja_2 = data['clase_ternaria'] == 'BAJA+2'
data['clase_binaria'] = np.where(es_baja_2, 1, 0)

# Row selection by period.
# NOTE(review): the test month is hard-coded to 202107 here while the
# parameters cell defines mes_test = 202108 — confirm which one is intended.
df_test = data.loc[data['foto_mes'] == 202107]
df_train = data.loc[data['foto_mes'].isin(mes_train)]

# Columns that must not reach the model as features.
columnas_no_predictoras = ['clase_ternaria', 'clase_binaria', 'clase_peso']

clase_peso = df_train['clase_peso']

X_train = df_train.drop(columnas_no_predictoras, axis=1)
Y_train = df_train['clase_binaria']

X_test = df_test.drop(columnas_no_predictoras, axis=1)
Y_test = df_test['clase_binaria']

# Weights looked up by the labels of the training rows.
w_train = df_train.loc[X_train.index, 'clase_peso']


### Optimización

#### Optimización original

In [19]:
import optuna

# Optuna log level INFO. Note that the next cell sets it to DEBUG again.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [20]:
import optuna
import logging
import sys
import lightgbm as lgb

# Logging setup: detach every handler currently attached to the root logger,
# then log DEBUG and above to stdout and raise Optuna's verbosity to DEBUG.
for handler_previo in list(logging.root.handlers):
    logging.root.removeHandler(handler_previo)
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
optuna.logging.set_verbosity(optuna.logging.DEBUG)


def objective(trial):
    """Optuna objective: stratified LightGBM cross-validation scored by `lgb_gan_eval`.

    Samples num_leaves, learning_rate, min_data_in_leaf, feature_fraction and
    bagging_fraction from the trial, runs `lgb.cv` for 300 rounds on the
    notebook-level X_train / Y_train / w_train, and records the round with the
    highest mean validation gain in the trial's "best_iter" user attribute.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The highest mean validation gain multiplied by the number of folds.
    """
    # Number of CV folds. The same value scales the returned gain, so the two
    # can no longer drift apart (the original multiplied by a literal 5).
    # Presumably the scaling extrapolates the per-fold gain to the whole
    # training set — TODO confirm.
    n_folds = 5
    metric_key = 'valid gan_eval-mean'

    num_leaves = trial.suggest_int('num_leaves', 8, 100)
    # The lower the learning rate, the more iterations are needed
    learning_rate = trial.suggest_float('learning_rate', 0.00005, 0.3)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 2000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)

    # NOTE(review): no 'bagging_freq' is set. LightGBM documents that bagging
    # is only enabled when bagging_freq is non-zero — confirm whether
    # 'bagging_fraction' has any effect with these parameters.
    params = {
        'objective': 'binary',
        'metric': 'custom',  # the score comes from `feval` below
        'boosting_type': 'gbdt',

        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    train_data = lgb.Dataset(X_train,
                             label=Y_train,
                             weight=w_train)

    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=300,
        feval=lgb_gan_eval,
        stratified=True,
        nfold=n_folds,
        seed=semillas[0]
    )

    # Best mean validation gain and the (1-based) round where it first occurs
    gan_por_ronda = cv_results[metric_key]
    max_gan = max(gan_por_ronda)
    best_iter = gan_por_ronda.index(max_gan) + 1

    # Store the best iteration so it can be read back from the study
    trial.set_user_attr("best_iter", best_iter)

    return max_gan * n_folds

# Optuna verbosity is already configured above
# optuna.logging.set_verbosity(optuna.logging.INFO)

# SQLite storage URL built from the study database path
storage_name = rf"sqlite:///{estudio_optuna}"

study_name = nombre_estudio

# Create the study, or load it if a study with this name already exists in
# the storage (load_if_exists=True)
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Run up to 1000 trials. No callback is passed here, so nothing stops the
# search early.
study.optimize(objective, n_trials=1000, show_progress_bar=True)


Best trial: 0. Best value: 1.26326e+09:   0%|          | 1/1000 [07:21<122:23:13, 441.03s/it]

[I 2024-12-03 05:45:59,304] Trial 0 finished with value: 1263262000.0 and parameters: {'num_leaves': 94, 'learning_rate': 0.2514941694500914, 'min_data_in_leaf': 1440, 'feature_fraction': 0.5757122440847728, 'bagging_fraction': 0.18795564568827733}. Best is trial 0 with value: 1263262000.0.


Best trial: 1. Best value: 1.34128e+09:   0%|          | 2/1000 [13:02<105:57:46, 382.23s/it]

[I 2024-12-03 05:51:40,374] Trial 1 finished with value: 1341284000.0 and parameters: {'num_leaves': 37, 'learning_rate': 0.2044811182306093, 'min_data_in_leaf': 1704, 'feature_fraction': 0.22051024290136415, 'bagging_fraction': 0.8277652429340113}. Best is trial 1 with value: 1341284000.0.


Best trial: 2. Best value: 1.35772e+09:   0%|          | 3/1000 [19:14<104:40:32, 377.97s/it]

[I 2024-12-03 05:57:53,265] Trial 2 finished with value: 1357720000.0 and parameters: {'num_leaves': 27, 'learning_rate': 0.09455722012150179, 'min_data_in_leaf': 110, 'feature_fraction': 0.3320627460201131, 'bagging_fraction': 0.7186269501843217}. Best is trial 2 with value: 1357720000.0.


Best trial: 3. Best value: 1.43373e+09:   0%|          | 4/1000 [23:08<88:44:33, 320.76s/it] 

[I 2024-12-03 06:01:46,320] Trial 3 finished with value: 1433726000.0 and parameters: {'num_leaves': 57, 'learning_rate': 0.09897333211336987, 'min_data_in_leaf': 1469, 'feature_fraction': 0.7730273447469209, 'bagging_fraction': 0.7678752400890588}. Best is trial 3 with value: 1433726000.0.


Best trial: 3. Best value: 1.43373e+09:   0%|          | 4/1000 [24:45<102:44:27, 371.35s/it]


[W 2024-12-03 06:03:23,662] Trial 4 failed with parameters: {'num_leaves': 99, 'learning_rate': 0.11872350676416783, 'min_data_in_leaf': 1972, 'feature_fraction': 0.32655250974128536, 'bagging_fraction': 0.4054970438940536} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/n_alba_dscience/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_48888/2707597616.py", line 42, in objective
    cv_results = lgb.cv(
                 ^^^^^^^
  File "/home/n_alba_dscience/.venv/lib/python3.12/site-packages/lightgbm/engine.py", line 826, in cv
    cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/n_alba_dscience/.venv/lib/python3.12/site-packages/lightgbm/engine.py", line 402, in handler_function
    ret.append(getattr(booster, name)(*args, **kwargs))
               ^^^^^^^^^^^

KeyboardInterrupt: 

In [97]:
# Hyper-parameters of the best trial currently stored in the study.
best_params = study.best_params
best_params

{'num_leaves': 94,
 'learning_rate': 0.050129899443463674,
 'min_data_in_leaf': 1004,
 'feature_fraction': 0.2530957925821187,
 'bagging_fraction': 0.28656832468240556}

In [100]:
# NOTE: despite its name, `best_trial` holds the "best_iter" user attribute
# stored on the study's best trial, not a trial object.
best_trial = study.best_trial.user_attrs["best_iter"]
best_trial

300

#### Optimización para series temporales con TimeSeriesSplit  

#### Optimización para series temporales con TimeSeriesSplit y undersampling


In [None]:
best_iter =  study.best_trial.user_attrs["best_iter"]
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	cv_agg's valid gan_eval: 3.32697e+08 + 1.02725e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[89]	cv_agg's valid gan_eval: 3.72505e+08 + 2.92651e+06
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	cv_agg's valid gan_eval: 3.43602e+08 + 8.13274e+06
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	cv_agg's valid gan_eval: 3.27047e+08 + 1.04825e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	cv_agg's valid gan_eval: 3.58134e+08 + 4.73243e+06
Training until validation scores don't improve for 50 rounds
best_iter

In [None]:
# Hyper-parameters of the best trial currently stored in the study.
best_params = study.best_params
best_params

In [None]:
# 6b - Optimisation for time series with TimeSeriesSplit and undersampling
# NOTE(review): no undersampling step is visible in this cell or in the
# objective below — confirm whether it happens elsewhere.

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
# Folds are computed once from X_train as it is ordered right now.
# NOTE(review): `TimeSeriesSplit` is not imported in the visible cells;
# presumably it comes from the notebooks executed with %run — TODO confirm.
folds = list(tscv.split(X_train))

def objective(trial):
    """Optuna objective: LightGBM CV over the precomputed `folds` with early stopping.

    Samples num_leaves, learning_rate, min_data_in_leaf, feature_fraction and
    bagging_fraction, runs `lgb.cv` on the notebook-level X_train / Y_train /
    w_train (sorted by index) using the notebook-level `folds`, and stores the
    round with the highest mean validation gain in the trial's "best_iter"
    user attribute.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The highest mean validation gain multiplied by 5.
    """
    num_leaves = trial.suggest_int('num_leaves', 200, 10000)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 8000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.3, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.3, 1.0)

    params = {
        'objective': 'binary',
        'metric': 'custom',  # the score comes from `feval` below
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    # Sort features, labels and weights by index label.
    # NOTE(review): the author's intent is chronological order; that only
    # holds if the index order follows foto_mes — confirm.
    X_train_sorted = X_train.sort_index()
    Y_train_sorted = Y_train.loc[X_train_sorted.index]
    w_train_sorted = w_train.loc[X_train_sorted.index]

    train_data = lgb.Dataset(
        X_train_sorted,
        label=Y_train_sorted,
        weight=w_train_sorted
    )

    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=5000,   # deliberately high upper bound; early stopping cuts it
        feval=lgb_gan_eval,
        folds=folds,
        stratified=False,
        seed=semillas[0],
        # Stop when the metric has not improved for 50 consecutive rounds.
        # Passed as a callback because `lgb.cv` does not accept an
        # `early_stopping_rounds` keyword in LightGBM 4.x.
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    max_gan = max(cv_results['valid gan_eval-mean'])
    best_iter = cv_results['valid gan_eval-mean'].index(max_gan) + 1

    # Store the best iteration so it can be read back from the study
    trial.set_user_attr("best_iter", best_iter)

    return max_gan * 5

# Optuna storage and study configuration
# NOTE(review): this reuses `nombre_estudio` and the same storage as the
# earlier optimisation cell; with load_if_exists=True, trials produced by both
# objectives (which use different search ranges) end up in one study — confirm
# this is intended.
storage_name = rf"sqlite:///{estudio_optuna}"
study_name = nombre_estudio

# Create the study (or load it if it already exists)
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Callback that stops the study after too many trials without improvement
class EarlyStoppingByImprovement:
    """Optuna callback: stop the study after `patience` trials in a row
    that do not raise the study's best value.

    Written for a study that maximizes its objective (improvement means a
    strictly greater best value).
    """

    def __init__(self, patience: int):
        """
        Args:
            patience: number of consecutive non-improving trials tolerated
                before `study.stop()` is called.
        """
        self.patience = patience
        # Best value seen by this callback; None until a completed trial exists
        self.best_value = None
        # Consecutive trials observed without an improvement
        self.no_improvement_trials = 0

    def __call__(self, study, trial):
        """Update the no-improvement counter and stop the study when it reaches `patience`.

        Args:
            study: the running study; only `best_trial.value` and `stop()` are used.
            trial: the trial that just finished (unused).
        """
        try:
            current_best_value = study.best_trial.value
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet;
            # such a trial counts as "no improvement" instead of crashing.
            current_best_value = None

        improved = current_best_value is not None and (
            self.best_value is None or current_best_value > self.best_value
        )

        if improved:
            self.best_value = current_best_value
            self.no_improvement_trials = 0
        else:
            self.no_improvement_trials += 1

        if self.no_improvement_trials >= self.patience:
            print(f"Early stopping: No hay mejora en {self.patience} pruebas consecutivas.")
            study.stop()

# Create the callback with a patience of 100 trials
early_stopping_callback = EarlyStoppingByImprovement(patience=100)

# Set Optuna's log level (optional)
optuna.logging.set_verbosity(optuna.logging.INFO)

# Run the study with the custom callback; n_trials is only an upper bound,
# the callback stops the search once its patience is exhausted
study.optimize(
    objective,
    n_trials=30000,
    callbacks=[early_stopping_callback]
)


## Voting GD

In [None]:
# Hard-coded hyper-parameters.
# NOTE: the next cell assigns `best_params` again, so these values are
# overwritten when the notebook is run top to bottom.
best_params = {'num_leaves': 4259,
 'learning_rate': 0.03459780778027798,
 'min_data_in_leaf': 2863,
 'feature_fraction': 0.7029328587219807,
 'bagging_fraction': 0.7914082839316137}

In [None]:
# 10-month model: hard-coded hyper-parameters used by the voting cell below.
# best_iter = 5
best_params = {'num_leaves': 3545,
 'learning_rate': 0.07959540527561224,
 'min_data_in_leaf': 933,
 'feature_fraction': 0.6829932939284065,
 'bagging_fraction': 0.6251908881121132}

In [None]:
# 7 - Voting: train the same model with several seeds and average the
#     predicted probabilities to reduce variance.

n_modelos = 15
# Boosting rounds per model (hard-coded by the author).
# NOTE(review): confirm this against the "best_iter" found by the study.
best_iter = 5

# Draw the per-model seeds from a generator seeded with the project seed, so
# re-running the cell reproduces the same ensemble.
rng = np.random.default_rng(semillas[0])
numeros_random = rng.integers(0, 100000, n_modelos)

train_data = lgb.Dataset(X_train,
                         label=Y_train,
                         weight=w_train)

# Parameters shared by every model; only the seed changes between models.
params_base = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    'num_leaves': best_params['num_leaves'],
    'learning_rate': best_params['learning_rate'],
    'min_data_in_leaf': best_params['min_data_in_leaf'],
    'feature_fraction': best_params['feature_fraction'],
    'bagging_fraction': best_params['bagging_fraction'],
    'verbose': 0
}

predicciones_por_semilla = {}

for semilla in numeros_random:
    params = {**params_base, 'seed': int(semilla)}

    model = lgb.train(params,
                      train_data,
                      num_boost_round=best_iter)

    y_pred = model.predict(X_test)
    predicciones_por_semilla[f'pred_{semilla}'] = y_pred

# Build the frame once and give it X_test's index, so each prediction stays
# attached to its test row. With a default 0..n-1 index, assigning a column of
# this frame back to X_test would align on labels and misplace the values.
df_voting = pd.DataFrame(predicciones_por_semilla, index=X_test.index)

# Ensemble prediction: mean of the per-seed probabilities.
df_voting['pred_mean'] = df_voting.mean(axis=1)
    

In [None]:
# Display the per-seed predictions and their mean.
df_voting

## Predicción

In [None]:
# Averaged probability produced by the voting ensemble.
y_pred = df_voting['pred_mean']

# Use the raw values so the assignment below is positional. Assigning the
# Series itself aligns on index labels; if df_voting has a different index
# than X_test (e.g. a default 0..n-1 index) the probabilities would land on
# the wrong rows or become NaN.
predicciones = y_pred.to_numpy()
assert len(predicciones) == len(X_test), "df_voting y X_test tienen distinta cantidad de filas"

# NOTE(review): this adds a column to X_test in place; predicting with X_test
# again afterwards would feed the model one extra column.
X_test['Probabilidad'] = predicciones

# Rows ordered from the highest to the lowest probability.
tb_entrega = X_test.sort_values(by='Probabilidad', ascending=False)

In [None]:

import os

# Number of top-ranked rows (tb_entrega is already sorted by probability)
# that are flagged with Predicted = 1; every other row stays at 0.
envios = 9730

tb_entrega['Predicted'] = 0
tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1

resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)

print("Cantidad de clientes {}".format(envios))
num_subida_kaggle = 18
# The path is built with os.path.join instead of a leading backslash in the
# file name: '\e' is an invalid escape sequence, and on a non-Windows host the
# backslash would become part of the file name rather than a separator.
nombre_archivo = 'entrega_0{}.csv'.format(num_subida_kaggle)
entrega_final = os.path.join(entregas_l, nombre_archivo)
resultados.to_csv(entrega_final, index=False)

In [None]:
# Customer identifiers of the test rows, in X_test's order and index.
numero_de_cliente = X_test['numero_de_cliente']

In [None]:

# Feature names of the last trained model (not used again in this cell).
nombres = model.feature_name()
# NOTE(review): `y` is not defined in any visible cell, so this raises
# NameError on a fresh kernel unless one of the %run notebooks defines it —
# confirm which prediction vector is meant here.
df_resultado = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente,
    'Predicted': y
}, index=X_test.index)

## Entrega Kaggle

In [None]:
import os

# Name parts of the file to submit; it is written inside `entregas_l`.
entrega = 'entrega_kaggle'
formato = '.csv'
numero = f"{entrega}_002{formato}"

# Build the path with os.path.join so the separator is right on every OS
# (a literal backslash is part of the file name outside Windows).
entrega_final_path = os.path.join(entregas_l, numero)
df_resultado.to_csv(entrega_final_path, index=False)

# `to_csv` returns None when given a path, so `entrega_final` is set to the
# written path instead of to that return value.
entrega_final = entrega_final_path


In [None]:
# Column count is read from the training frame instead of a hard-coded number
# that goes stale whenever the feature set changes (same expression as the
# cut-points cell uses).
cantidad_columnas = df_train.shape[1]
# Free-text description attached to the Kaggle submission.
message = f"{entrega}, cantidad de columnas en el train: {cantidad_columnas}, modelo: LGBM, mejores parametros: {best_params}, mejor iteracion: {best_iter}"

<!-- ### Kaggle -->

In [None]:

# Kaggle competition that receives the submission.
# NOTE(review): the cut-points cell below submits to 'dm-ey-f-2024-primera'
# and the study name mentions competencia_03 — confirm the right competition.
competition= 'DMEyF-2024-Segunda'

In [None]:
# Upload the file at `entrega_final_path` with the description in `message`.
# `kaggle` is not imported in the visible cells; presumably it comes from the
# notebooks executed with %run — TODO confirm.
kaggle.api.competition_submit(competition=competition, file_name= entrega_final_path, message=message, quiet=False)

### Puntos de corte

In [None]:

# One submission per cut point: flag the top-N rows by probability, write the
# CSV and upload it.
# NOTE(review): `y_pred_lgm`, `path` and `api` are not defined in the visible
# cells; they must already exist in the kernel — confirm.
predicciones = y_pred_lgm

X_test['Probabilidad'] = predicciones

tb_entrega = X_test.sort_values(by='Probabilidad', ascending=False)

cortes = range(9000,14000,100)

# Values that do not change between iterations
competencia = 'dm-ey-f-2024-primera'
cantidad_columnas = df_train.shape[1]

num_subida_kaggle = 65
for envios in cortes:

    # Top `envios` rows get Predicted = 1, the rest 0
    tb_entrega['Predicted'] = 0
    tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1
    resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)

    print("Cantidad de clientes {}".format(envios))

    nombre_archivo = 'entrega_0{}.csv'.format(num_subida_kaggle)
    entrega_final = os.path.join(path, nombre_archivo)
    resultados.to_csv(entrega_final, index=False)

    message = f"{entrega}, cantidad de columnas en el train: {cantidad_columnas}, modelo: LGBM, mejores parametros: {best_params}, mejor iteracion: {best_iter}, archivo: {entrega_final}, punto de corte: {envios}, optimizado con optuna: {study_name}"

    num_subida_kaggle += 1

    try:
        api.competition_submit(file_name=entrega_final,message=message,competition=competencia)
    except Exception as error:
        # `Exception` rather than a bare except, so Ctrl-C can still interrupt
        # the loop; the real error is printed because the failure is not
        # necessarily the submission limit.
        print(f"Numero máximo de envios, último envio ={num_subida_kaggle}")
        print(f"Error al enviar {entrega_final}: {error!r}")

In [None]:
# df_train.shape

In [None]:
# nombre_modelo = 'lgbm_e_en_abril_p_en_junio_451_features.txt'
# model.save_model(rf"C:\Users\Admin\Documents\1_Notebook\1_Estudio\1_UBA_Maestria_DS\1_Especializacion\1_Segundo_Semestre\DMEyF\modelos_lgbm\{nombre_modelo}")