## Librerías

In [1]:
%%capture
%run "../../recurrentes.ipynb"

In [2]:
%%capture
%run "../../funciones.ipynb"

In [3]:
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
%sql duckdb:///:memory:


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [4]:
# Load the full competition dataset from parquet into memory.
# NOTE(review): the path variable is not defined in this notebook; presumably it is
# provided by the setup notebooks executed with %run above — confirm.
df_train = pd.read_parquet(dataset_competencia_03_lags_deltas_y_clase_ternaria_l)

In [5]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df_train.shape

(4901237, 474)

In [6]:
# Remove, in place, every column whose name contains the substring 'lag'.
# NOTE(review): this is a plain substring match, so a non-lag column that merely
# contains 'lag' in its name would be dropped too — confirm none exist.
lista_columnas = df_train.columns
listas_drop = list(filter(lambda nombre: 'lag' in nombre, lista_columnas))
df_train.drop(columns=listas_drop, inplace=True)

In [7]:
# Sanity check: column count after the 'lag' columns were dropped.
df_train.shape

(4901237, 320)

In [8]:
# Restrict the frame to a fixed set of `foto_mes` values.
# Despite its name, `lista_drop_meses` is used here as the list of months to KEEP:
# rows whose `foto_mes` is in the list survive, all others are discarded.
lista_drop_meses = [
    201901, 201902, 201903, 201905, 201907, 201911,
    202011, 202012,
    202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109,
]
filas_a_conservar = df_train['foto_mes'].isin(lista_drop_meses)
df_train = df_train.loc[filas_a_conservar]
df_train.shape

(2574489, 320)

In [9]:
# List the distinct `foto_mes` values that remain after the month filter.
df_train['foto_mes'].unique()

array([201901, 201902, 201903, 201905, 201907, 201911, 202011, 202012,
       202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108,
       202109], dtype=int64)

In [10]:
# Add a 'max' feature: the row-wise maximum across every column whose dtype is
# neither bool nor object.
#
# The column names are resolved on a zero-row slice. `head(0)` preserves dtypes, so
# the selection is the same, but no copy of the numeric data is materialised and
# kept alive in `columnas_num` for the rest of the session.
columnas_num = df_train.head(0).select_dtypes(exclude=['bool', 'object'])
lista_columnas_num = columnas_num.columns

# NOTE(review): the selection covers every non-bool, non-object column present at this
# point, which presumably includes identifier-like columns such as `numero_de_cliente`
# and `foto_mes`; if so, they may dominate this maximum — confirm that is intended.
df_train['max'] = df_train[lista_columnas_num].max(axis=1)

In [11]:
# Build one 'power_ratio_<col>' feature per listed column:
#     ctrx_quarter / (col ** 2)
# NOTE(review): 'mforex_sell' sits next to 'cforex' and 'cforex_buy'; presumably
# 'cforex_sell' was intended — confirm against the dataset schema.
variables = [
    'cmobile_app_trx',
    'Master_cconsumos',
    'ctarjeta_debito_transacciones',
    'ctarjeta_visa_transacciones',
    'ctarjeta_master_transacciones',
    'cpayroll_trx',
    'cpayroll2_trx',
    'ccuenta_debitos_automaticos',
    'cpagodeservicios',
    'cforex',
    'cforex_buy',
    'mforex_sell',
    'cextraccion_autoservicio',
    'ccallcenter_transacciones',
    'chomebanking_transacciones',
]

# 'ctrx_quarter' is the numerator of every ratio, so stop right away if it is missing.
if 'ctrx_quarter' not in df_train.columns:
    raise KeyError("La variable 'ctrx_quarter' no existe en el dataset.")

variables_presentes = [var for var in variables if var in df_train.columns]

if not variables_presentes:
    print("Ninguna de las variables especificadas existe en el dataset.")
else:
    for var in variables_presentes:
        # Zeros are replaced by 1e-6 before squaring so the division never hits an
        # exact zero; NaN values are not touched by the replacement.
        denominator = df_train[var].replace(0, 1e-6).pow(2)
        df_train['power_ratio_' + var] = df_train['ctrx_quarter'].div(denominator)

    # Report the requested columns that were skipped because they do not exist.
    variables_faltantes = [var for var in variables if var not in df_train.columns]
    if variables_faltantes:
        print(f"Las siguientes variables no existen en el dataset y no se calcularon: {variables_faltantes}")

In [12]:
# Sanity check: column count after the 'max' and 'power_ratio_*' features were added.
df_train.shape

(2574489, 336)

## Preparación del dataset para el Voting GD

In [13]:
# Cast the two T_*_normal columns to bool, skipping any that is absent.
# NOTE(review): on float columns astype(bool) turns NaN into True — confirm these
# columns contain no missing values, or that True is the desired outcome.
for columna_normal in ('T_Visa_normal', 'T_Master_normal'):
    if columna_normal in df_train.columns:
        df_train[columna_normal] = df_train[columna_normal].astype(bool)



In [14]:
# 4 - model parameters

# Business constants. They are not read by any other cell shown in this notebook;
# presumably they are consumed by helpers from the setup notebooks — confirm.
ganancia_acierto = 273_000
costo_estimulo = 7_000

# Months used to fit the models, and the single month that gets scored.
mes_train = [
    201901, 201902, 201903, 201905, 201907, 201911,
    202011, 202012,
    202101, 202102, 202103, 202104, 202105, 202106, 202107,
]
mes_test = 202109

# LightGBM hyper-parameters and boosting rounds, hard-coded here.
# NOTE(review): presumably the result of an earlier tuning run — record its origin.
best_params = {
    'num_leaves': 8869,
    'learning_rate': 0.00720758269649222,
    'min_data_in_leaf': 3916,
    'feature_fraction': 0.361029829884323,
    'bagging_fraction': 0.6481560136078847,
}
best_iter = 1160

In [15]:
# `data` is a second name for the same DataFrame object, not a copy: every in-place
# change made through `data` is also visible through `df_train` until one of the two
# names is rebound.
data = df_train

In [16]:
# Turn both mobile-app columns into booleans: parse as numbers, treat anything
# unparseable or missing as 0, then cast (0 -> False, any other value -> True).
# Note that 'power_ratio_cmobile_app_trx' was derived in an earlier cell from the
# values 'cmobile_app_trx' held before this cast.
for columna_app in ('tmobile_app', 'cmobile_app_trx'):
    valores_numericos = pd.to_numeric(data[columna_app], errors='coerce')  # non-numeric -> NaN
    data[columna_app] = valores_numericos.fillna(0).astype('bool')         # NaN -> 0 -> False

In [17]:

# Per-row weight: 1.0 by default, with a slightly different value for each BAJA class.
# NOTE(review): the weights differ from 1 only in the fifth decimal, so they presumably
# act as class markers rather than as real re-weighting — confirm.
data['clase_peso'] = 1.0
for etiqueta, peso in {'BAJA+2': 1.00002, 'BAJA+1': 1.00001}.items():
    data.loc[data['clase_ternaria'] == etiqueta, 'clase_peso'] = peso

# Binary target: 1 only for 'BAJA+2', 0 for every other class.
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)

# Split by month. Rows whose `foto_mes` is neither `mes_test` nor in `mes_train`
# end up in neither split.
df_test = data[data['foto_mes'] == mes_test]
df_train = data[data['foto_mes'].isin(mes_train)]

# Target, ternary label and weight must not be fed to the model as features.
columnas_no_features = ['clase_ternaria', 'clase_binaria', 'clase_peso']

X_train = df_train.drop(columns=columnas_no_features)
Y_train = df_train['clase_binaria']
clase_peso = df_train['clase_peso']
w_train = df_train.loc[X_train.index, 'clase_peso']

X_test = df_test.drop(columns=columnas_no_features)
Y_test = df_test['clase_binaria']



In [18]:
# Remove the `data` name. The underlying frame is only released once no other
# reference to it remains.
del data 

## Semillero

In [19]:
# Seed ensemble: fit one LightGBM model per seed on the same training set, score the
# test features with each, and average the per-seed predictions.
train_data = lgb.Dataset(X_train, label=Y_train, weight=w_train)

# Settings shared by every run; only 'seed' changes from one model to the next.
# NOTE(review): 'bagging_fraction' is set but 'bagging_freq' is not. Per the LightGBM
# parameter docs, bagging only runs when bagging_freq is non-zero — confirm this
# matches the configuration under which `best_params` was obtained.
params_comunes = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    'verbose': 0,
}
params_comunes.update({
    clave: best_params[clave]
    for clave in ('num_leaves', 'learning_rate', 'min_data_in_leaf',
                  'feature_fraction', 'bagging_fraction')
})

predicciones_por_semilla = {}
for x in semillas:
    # Same configuration as every other run, with the seed of this iteration.
    params = {**params_comunes, 'seed': x}

    # Train with the updated parameters for a fixed number of boosting rounds.
    model = lgb.train(params, train_data, num_boost_round=best_iter)

    y_pred = model.predict(X_test)
    predicciones_por_semilla[f'prediccion_seed_{x}'] = y_pred

# One column per seed, plus their row-wise mean as the ensemble prediction.
df_voting = pd.DataFrame(predicciones_por_semilla)
df_voting['prediccion'] = df_voting.mean(axis=1)

# Re-label the rows with X_test's index so they can be joined back to the customers.
df_voting.index = X_test.index
    



In [20]:
# Row-wise maximum over all current columns of df_voting, i.e. the per-seed
# predictions and their mean 'prediccion'.
df_voting['max'] = df_voting.max(axis=1)

## Predicción

In [22]:
# Pair each test customer id with the ensemble (mean) prediction. Both sources are
# indexed by X_test's index, so the values are aligned on it.
prediccion = X_test[['numero_de_cliente']].assign(probabilidad=df_voting['prediccion'])
prediccion.shape

(165644, 2)

In [23]:
# Rank the test customers by predicted probability, highest first.
#
# The probability column is attached with `assign`, which returns a new frame, so
# `X_test` itself is left untouched. Writing the column into `X_test` in place would
# leave it with an extra non-feature column, and re-running the seed-ensemble cell
# afterwards would pass that column to `model.predict`.
y_pred = prediccion['probabilidad']
predicciones = y_pred

tb_entrega = (
    X_test
    .assign(Probabilidad=predicciones)
    .sort_values(by='Probabilidad', ascending=False)
)

In [36]:

import os

# Flag the first `envios` rows of `tb_entrega` with Predicted = 1 and the rest with 0.
# `tb_entrega` was sorted by 'Probabilidad' in descending order in the previous cell,
# so these are the highest-probability customers.
envios = 11000
tb_entrega['Predicted'] = 0
tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1

# Keep only the two columns that are written to the CSV.
resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)

print("Cantidad de clientes {}".format(envios))
num_subida_kaggle = 9

# The file name used to start with a literal backslash ('\c...'), which is an invalid
# escape sequence and a Windows-only separator. The path is now built with
# os.path.join, which yields the same location on Windows and a valid one elsewhere.
# NOTE(review): 'comptencia' looks like a typo for 'competencia'; it is kept so the
# output file name does not change — rename deliberately if nothing depends on it.
nombre_archivo = 'comptencia_03_entrega_0{}.csv'.format(num_subida_kaggle)
entrega_final = os.path.join(entregas_l, nombre_archivo)
resultados.to_csv(entrega_final, index=False)

Cantidad de clientes 11000
