In [11]:
import pandas as pd
from pycaret.classification import (
    setup, compare_models, tune_model, calibrate_model,
    finalize_model, save_model, pull, get_config, get_leaderboard, optimize_threshold
)

In [2]:
# Load the training table from the project-relative parquet file.
datos = pd.read_parquet(path="data/entrenamiento.parquet")

In [3]:
# Initialise the PyCaret classification experiment on `datos`.
# Keyword arguments are grouped by concern; every value is unchanged.
configuracion = setup(
    # --- data and target ---
    data=datos,
    target="is_attributed",

    # --- reproducibility, hardware, logging ---
    session_id=42,
    use_gpu=False,
    verbose=True,  # show the setup summary / logs

    # --- train/test split and cross-validation ---
    train_size=0.8,
    data_split_stratify=True,
    fold_strategy="stratifiedkfold",
    fold=3,

    # --- preprocessing ---
    preprocess=True,
    fix_imbalance=True,
    rare_to_value=0.01,  # merges categories below 1% frequency
    rare_value="rare",
    low_variance_threshold=0.0,  # 0 drops constant columns
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,

    # --- feature selection ---
    # NOTE(review): the setup summary reports only 3 transformed columns,
    # so very few features survive this step -- confirm 0.2 is intended.
    feature_selection=True,
    feature_selection_method="classic",
    feature_selection_estimator="lightgbm",
    n_features_to_select=0.2,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,is_attributed
2,Target type,Binary
3,Original data shape,"(100000, 15)"
4,Transformed data shape,"(179636, 3)"
5,Transformed train set shape,"(159636, 3)"
6,Transformed test set shape,"(20000, 3)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


In [5]:
# Candidate estimators to benchmark against each other.
modelos_a_probar = ["lr", "rf", "xgboost"]
# Not passed to compare_models below; kept for the alternative mode where
# every estimator is compared except these (exclude=modelos_a_excluir).
modelos_a_excluir = ["lightgbm"]

# Cross-validate the candidates and keep the one ranked first by AUC.
modelo_base = compare_models(
    include=modelos_a_probar,
    fold=3,
    sort="AUC",
    turbo=True,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.801,0.7653,0.6653,0.0081,0.016,0.0115,0.0575,37.25
xgboost,Extreme Gradient Boosting,0.9721,0.697,0.238,0.0215,0.0394,0.0353,0.0645,39.59
rf,Random Forest Classifier,0.9861,0.6748,0.1328,0.0315,0.0506,0.047,0.0594,41.9967


In [7]:
# Display the estimator returned by compare_models (the top row when sorted by AUC).
modelo_base

In [8]:
# Hyperparameter search over the selected base model, scored on F1.
# NOTE(review): the base model was selected by AUC but is tuned for F1 --
# confirm that the metric mismatch is intentional.
# NOTE(review): the fold means printed by this cell match the untuned LR
# row from compare_models, which suggests choose_better=True kept the
# original estimator -- confirm.
modelo_ajustado = tune_model(
    estimator=modelo_base,
    search_library="optuna",
    n_iter=5,
    optimize="F1",
    choose_better=True,
)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7814,0.744,0.6393,0.0067,0.0132,0.0087,0.0486
1,0.7647,0.6975,0.6066,0.0059,0.0117,0.0072,0.0418
2,0.857,0.8546,0.75,0.0117,0.0231,0.0187,0.0819
Mean,0.801,0.7653,0.6653,0.0081,0.016,0.0115,0.0575
Std,0.0402,0.0659,0.0614,0.0026,0.0051,0.0051,0.0175


[LightGBM] [Info] Number of positive: 53212, number of negative: 53212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 106424, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 53212, number of negative: 53212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 106424, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

In [9]:
# Calibrate the tuned model's predicted probabilities using the isotonic method.
modelo_calibrado = calibrate_model(estimator=modelo_ajustado, method="isotonic")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8264,0.7331,0.5902,0.0078,0.0153,0.0108,0.0526
1,0.8248,0.6916,0.5574,0.0073,0.0143,0.0099,0.0481
2,0.7841,0.8535,0.8,0.0083,0.0164,0.012,0.0671
Mean,0.8118,0.7594,0.6492,0.0078,0.0154,0.0109,0.0559
Std,0.0196,0.0686,0.1075,0.0004,0.0008,0.0009,0.0081


In [12]:
# Search for the probability threshold that maximises F1 on the calibrated model.
# NOTE(review): in PyCaret 3.x this call appears to return a model wrapped
# with the chosen threshold rather than a float, so the variable name may
# be misleading -- confirm against the installed version.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt,
# so `umbral_optimo` may be undefined after a fresh Run All.
umbral_optimo = optimize_threshold(modelo_calibrado, optimize="F1")

Threshold: 0.375. F1: 0.0107
Threshold: 0.125. F1: 0.0061
Threshold: 0.5. F1: 0.0154
Threshold: 0.25. F1: 0.0064
Threshold: 0.625. F1: 0.0212
Threshold: 0.75. F1: 0.0298
Threshold: 0.875. F1: 0.0351
Threshold: 0.0. F1: 0.0045
Threshold: 0.875. F1: 0.0351
Threshold: 0.8750000149011612. F1: 0.0351
Threshold: 0.21875. F1: 0.0062
Threshold: 0.09375. F1: 0.006
Threshold: 0.3125. F1: 0.0066
Threshold: 0.1875. F1: 0.0061


KeyboardInterrupt: 

In [None]:
# Finalise the calibrated model and persist it under the name "modelo_final".
# NOTE(review): this finalises `modelo_calibrado`, not the result of
# optimize_threshold, so any tuned threshold is not part of the saved
# artefact -- confirm this is intended.
modelo_final = finalize_model(estimator=modelo_calibrado)
save_model(model=modelo_final, model_name="modelo_final")

In [None]:
# Report the columns that actually reach the estimator.
#
# In PyCaret 3.x, get_config("X_train") is the training frame *before* the
# preprocessing pipeline, so its columns are the raw input features. The
# columns that survive multicollinearity removal and feature selection are
# the ones in "X_train_transformed", which is what "Cols finales" is meant
# to show.
X_entrenamiento = get_config("X_train")  # raw training features, kept for reference
X_entrenamiento_transformado = get_config("X_train_transformed")
cols_usadas = list(X_entrenamiento_transformado.columns)
print("Cols finales:", cols_usadas)