In [1]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML

# Input feature table, read from the working directory.
# (Presumably baseline features plus lagged columns, going by the file name — confirm.)
data_path = 'baseline_with_lags.parquet'
df = pd.read_parquet(data_path)

In [2]:
# Parse the 'foto_mes' period column (year+month, format %Y%m) into datetimes
# so the splits below can compare it against calendar dates.
foto_mes_parsed = pd.to_datetime(df['foto_mes'], format='%Y%m')
df['foto_mes'] = foto_mes_parsed

In [3]:
# Time-based split: the cutoff month is held out for validation and every
# earlier month is used for training. Rows after the cutoff are in neither set.
cutoff_month = '2021-07-01'
df_val = df.loc[df['foto_mes'].eq(cutoff_month)]
df_train = df.loc[df['foto_mes'].lt(cutoff_month)]

In [4]:
# Sanity check: (rows, columns) of the validation split, then of the training split.
(df_val.shape, df_train.shape)

((164682, 374), (1460661, 374))

In [5]:
# Separate the label column from the feature columns for each split.
target_col = 'target'
X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_val, y_val = df_val.drop(columns=[target_col]), df_val[target_col]

In [6]:
%%time

automl = AutoML(
    total_time_limit=3600*6,
    mode='Compete',
    stack_models=False,
    random_state=158151,
    train_ensemble=False,
    eval_metric='f1',
)
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: AutoML_3
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.994884 trained in 371.06 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 2 models
2_DecisionTree f1 0.994925 trained in 368.59 seconds
3_DecisionTree f1 0.994925 trained in 363.56 seconds
* Step default_algorithms will try to check up to 6 models
4_Default_LightGBM f1 0.994947 trained in 271.88 seconds




5_Default_Xgboost f1 0.994942 trained in 263.75 seconds
6_Default_CatBoost f1 0.99503 trained in 255.22 seconds
7_Default_NeuralNetwork f1 0.994884 trained in 2044.66 seconds
8_Default_RandomForest f1 0.994908 trained in 777.8 seconds
9_Default_ExtraTrees f1 0.994925 trained in 789.25 seconds
* Step not_so_random will try to check up to 54 models
19_LightGBM f1 0.994954 trained in 250.05 seconds




10_Xgboost f1 0.994986 trained in 285.06 seconds
28_CatBoost f1 0.995006 trained in 264.48 seconds
37_RandomForest f1 0.994908 trained in 791.13 seconds
46_ExtraTrees f1 0.994949 trained in 625.92 seconds
55_NeuralNetwork f1 0.994884 trained in 1927.27 seconds
20_LightGBM f1 0.99495 trained in 265.58 seconds




11_Xgboost f1 0.995016 trained in 291.83 seconds
29_CatBoost f1 0.99506 trained in 284.24 seconds
38_RandomForest f1 0.994952 trained in 1604.35 seconds
47_ExtraTrees f1 0.994962 trained in 701.89 seconds
56_NeuralNetwork f1 0.994884 trained in 2735.06 seconds
Skip mix_encoding because of the time limit.
Skip golden_features because of the time limit.
Not enough time to perform features selection. Skip
Time needed for features selection ~ 5352.0 seconds
Please increase total_time_limit to at least (53581 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 35 models
57_CatBoost f1 0.99505 trained in 277.64 seconds
58_CatBoost f1 0.995006 trained in 249.33 seconds
59_CatBoost f1 0.995016 trained in 253.37 seconds




60_Xgboost f1 0.99502 trained in 272.91 seconds
61_CatBoost f1 0.994962 trained in 255.85 seconds
62_CatBoost f1 0.995013 trained in 264.94 seconds




63_Xgboost f1 0.994959 trained in 267.13 seconds




64_Xgboost f1 0.994972 trained in 275.1 seconds
65_ExtraTrees f1 0.994955 trained in 763.04 seconds
66_ExtraTrees f1 0.994976 trained in 716.63 seconds
67_LightGBM f1 0.99494 trained in 259.65 seconds
68_LightGBM f1 0.994912 trained in 260.76 seconds
* Step hill_climbing_2 will try to check up to 31 models
69_CatBoost f1 0.99504 trained in 286.94 seconds
70_CatBoost f1 0.995071 trained in 269.01 seconds
71_CatBoost f1 0.995057 trained in 308.35 seconds
72_CatBoost f1 0.99505 trained in 257.96 seconds
73_CatBoost f1 0.995037 trained in 275.11 seconds
74_CatBoost f1 0.995043 trained in 247.48 seconds
84_LightGBM f1 0.99495 trained in 256.29 seconds
AutoML fit time: 21707.2 seconds
AutoML best model: 70_CatBoost
CPU times: user 3d 6h 24min 49s, sys: 2d 7h 55min 47s, total: 5d 14h 20min 36s
Wall time: 6h 2min 1s


In [7]:
# Score the validation split and keep the first column of the 2-D probability array.
# NOTE(review): column 0 is assumed to be the probability of the class encoded
# as 1 below ('B2') — confirm against the class order used by the fitted model.
preds = automl.predict_proba(X_val)[:, 0]

# Binary ground truth: 1 where the original label is 'B2', 0 otherwise.
# Built from df_val['target'] instead of overwriting y_val from itself, so that
# re-running this cell does not compare an already-encoded 0/1 array with 'B2'
# (which would silently turn every label into 0).
y_val = np.where(df_val['target'] == 'B2', 1, 0)

In [8]:
def ganancia_integral(probs: np.ndarray,
                      y_true: np.ndarray,
                      gain_pos: float = 273,
                      cost_neg: float = -7,
                      half_window: int = 1000) -> float:
    """Average cumulative gain in a window around the best cut-off position.

    Rows are ranked by descending ``probs``. Each row contributes ``gain_pos``
    when its label equals 1 and ``cost_neg`` otherwise; the running sum of
    these contributions is the gain obtained by selecting the top-k rows.
    The function finds the first position where that running sum peaks and
    returns the mean of the running sum over the positions within
    ``half_window`` rows on either side of the peak (inclusive).

    Parameters
    ----------
    probs : array-like
        Scores used to rank the rows (higher is ranked first).
    y_true : array-like
        Labels aligned with ``probs``; a value of 1 marks a positive.
    gain_pos : float, default 273
        Contribution of a row whose label is 1.
    cost_neg : float, default -7
        Contribution of every other row.
    half_window : int, default 1000
        Number of positions taken on each side of the peak.

    Returns
    -------
    float
        Mean cumulative gain over the window around the peak.

    Raises
    ------
    ValueError
        If the inputs are empty.

    Notes
    -----
    The peak position is printed as a side effect.
    """
    ganancia_df = pd.DataFrame({'prob': probs, 'truth': y_true})
    if ganancia_df.empty:
        raise ValueError("ganancia_integral requires at least one observation")

    ganancia_df = ganancia_df.sort_values(by='prob', ascending=False).reset_index(drop=True)
    ganancia_df['ganancia_ind'] = np.where(ganancia_df['truth'] == 1, gain_pos, cost_neg)
    ganancia_roll = ganancia_df['ganancia_ind'].cumsum()

    # After reset_index the labels equal the positions; idxmax returns the
    # first position at which the running gain reaches its maximum.
    ind = int(ganancia_roll.idxmax())
    print(ind)

    # Clamp the lower bound: a negative start would be interpreted by iloc as
    # an offset from the end, producing an empty (NaN mean) or wrong window
    # whenever the peak lies within the first `half_window` rows.
    start = max(ind - half_window, 0)
    stop = ind + half_window + 1
    return float(np.mean(ganancia_roll.iloc[start:stop]))

In [9]:
# Gain on the validation split for the model trained on the engineered target.
# The printed integer is the rank position at which the cumulative gain peaks.

ganancia_integral(probs=preds, y_true=y_val)

19098


377248.5092453773

In [None]:
# Gain labelled as the "true value".
# NOTE(review): this call is identical to the previous cell's, so it returns the
# same number unless preds or y_val are rebuilt in between — confirm the intent.
ganancia_integral(probs=preds, y_true=y_val)