In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import seaborn as sns

from sklearn.preprocessing import Normalizer, Binarizer, MinMaxScaler, \
    QuantileTransformer, StandardScaler, KernelCenterer, RobustScaler
from sklearn.model_selection import train_test_split

import autogluon.core as ag
from autogluon import TabularPrediction as task

# Data Preparation

## Acquire Data

In [27]:
# We only get the train data set
current_path = os.getcwd()+"\\"
file = os.path.sep.join(["..\\data\\train.csv"])
file_test= os.path.sep.join(["..\\data\\test.csv"])

df = pd.read_csv(file, index_col=0)
df_eval = pd.read_csv(file_test, index_col=0)

df.columns = df.columns.str.strip()
df_eval.columns = df_eval.columns.str.strip()

del df['BUTTER']
del df_eval['BUTTER']

print(df.columns)

Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV', 'piminus_P', 'gamma_PT', 'piminus_ETA',
       'Kplus_ETA', 'signal'],
      dtype='object')


In [40]:
def preprocess_data(df):
    df2 = df.copy()
    
#     Not using EXP / LOG for now
#     for col in df.columns[:-1]:
#         df2['EXP_'+col] = np.exp(df[col])
#         df2['LOG_'+col] = np.log(df[col])

    # TODO: idea add a new variable that is the P(signal) for that particular line
    # I.e. always P(signal=1)
    
#     for col in df.columns:
#         if col!="signal":
#             df[f"{col}__SQR"] = df[col] ** 2
    
    done = set()
    for col1 in df.columns:
        if col1!="signal":
            for col2 in df.columns:
                if col2!="signal":
                    colname = '__X__'.join(sorted((col1,col2)))
                    if col1 != col2 and colname not in done:
                        df2[colname] = df[col1] * df[col2]
                        done.add(colname)
    
    return df2

In [29]:
df2=preprocess_data(df)
df2.columns

Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV',
       ...
       'piminus_IP_OWNPV__X__piminus_P', 'gamma_PT__X__piminus_IP_OWNPV',
       'piminus_ETA__X__piminus_IP_OWNPV', 'Kplus_ETA__X__piminus_IP_OWNPV',
       'gamma_PT__X__piminus_P', 'piminus_ETA__X__piminus_P',
       'Kplus_ETA__X__piminus_P', 'gamma_PT__X__piminus_ETA',
       'Kplus_ETA__X__gamma_PT', 'Kplus_ETA__X__piminus_ETA'],
      dtype='object', length=106)

# AutoGluon Inital

In [52]:
train_data = task.Dataset(train)
subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
train_data = train_data.sample(n=subsample_size, random_state=0)

In [60]:
label_column = 'signal'
print("Summary of class variable: \n", train_data[label_column].describe())

Summary of class variable: 
 count    210568.000000
mean          0.334001
std           0.471641
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: signal, dtype: float64


In [30]:
dir = current_path+"\\trained_models"
predictor = task.fit(train_data=train_data, label=label_column, output_directory=dir)

Beginning AutoGluon training ...
AutoGluon will save models to C:\Users\garciagr\OneDrive - HP Inc\DSUB\MachineLearning\Boson\2020.ml.kaggle\Marc\\trained_models\
AutoGluon Version:  0.0.14
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    39187.19 MB
	Train Data (Original)  Memory Usage: 0.06 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.


In [32]:
test_data = task.Dataset(test)
y_test = test_data[label_column]  # values to predict
test_data_nolab = test_data.drop(labels=[label_column],axis=1)  # delete label column to prove we're not cheating

In [39]:
predictor = task.load(dir)  # unnecessary, just demonstrates how to load previously-trained predictor from file

y_pred = predictor.predict(test_data_nolab)
print("Predictions:  ", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.7050730063646574
Evaluations on test data:
{
    "accuracy": 0.7050730063646574,
    "accuracy_score": 0.7050730063646574,
    "balanced_accuracy_score": 0.5768322523659386,
    "matthews_corrcoef": 0.2510302944295476,
    "f1_score": 0.7050730063646574
}
Detailed (per-class) classification report:
{
    "0.0": {
        "precision": 0.7050793323717288,
        "recall": 0.9595485137408861,
        "f1-score": 0.812863760541632,
        "support": 14264
    },
    "1.0": {
        "precision": 0.7050102249488752,
        "recall": 0.194115990990991,
        "f1-score": 0.30441501103752766,
        "support": 7104
    },
    "accuracy": 0.7050730063646574,
    "macro avg": {
        "precision": 0.705044778660302,
        "recall": 0.5768322523659386,
        "f1-score": 0.5586393857895798,
        "support": 21368
    },
    "weighted avg": {
        "precision": 0.7050563569350033,
        "recall": 0.7050730063646574,
        "f1-score": 0.6438250

Predictions:   [0 1 1 ... 0 0 0]


In [35]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesClassifierEntr,0.739985,0.71,0.224589,0.105948,0.302738,0.224589,0.105948,0.302738,0,True,4
1,ExtraTreesClassifierGini,0.739798,0.71,0.226053,0.105911,0.345051,0.226053,0.105911,0.345051,0,True,3
2,LightGBMClassifierXT,0.739657,0.74,0.049918,0.003709,0.274369,0.049918,0.003709,0.274369,0,True,8
3,RandomForestClassifierGini,0.738628,0.7,0.114872,0.105667,0.337162,0.114872,0.105667,0.337162,0,True,1
4,RandomForestClassifierEntr,0.737973,0.69,0.123669,0.105312,0.327921,0.123669,0.105312,0.327921,0,True,2
5,CatboostClassifier,0.737364,0.72,0.008056,0.003978,0.9158,0.008056,0.003978,0.9158,0,True,9
6,NeuralNetClassifier,0.718925,0.72,0.524147,0.012079,5.260593,0.524147,0.012079,5.260593,0,True,10
7,LightGBMClassifierCustom,0.70994,0.73,0.015163,0.002543,0.3542,0.015163,0.002543,0.3542,0,True,11
8,weighted_ensemble_k0_l1,0.705073,0.77,0.024959,0.006476,0.883696,0.003812,0.0,0.272763,1,True,12
9,LightGBMClassifier,0.704558,0.76,0.005984,0.003932,0.256732,0.005984,0.003932,0.256732,0,True,7


# AutoGluon Maxime accuracy

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df2, df2.signal, test_size=0.10, random_state=0)

train_data = task.Dataset(X_train)

test_data = task.Dataset(X_test)
label_column="signal"
output_directory=current_path+ "\\trained_models"

In [33]:
#time_limits = 36000 # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)
metric = 'roc_auc' # specify your evaluation metric here
#presets="medium_quality_faster_train" # produces less accurate models but facilitates faster prototyping
presets='best_quality'
predictor = task.fit(train_data=train_data, label=label_column,
                     eval_metric=metric, presets=presets,output_directory=output_directory,auto_stack =True)

Beginning AutoGluon training ...
AutoGluon will save models to D:\Uni\2020.ml.kaggle\Marc\\trained_models\
AutoGluon Version:  0.0.14
Train Data Rows:    191395
Train Data Columns: 105
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    28104.05 MB
	Train Data (Original)  Memory Usage: 167.55 MB (0.6% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenera

In [34]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetClassifier_STACKER_l0,0.92894,0.924003,10.261096,9.282063,11066.334578,10.261096,9.282063,11066.334578,0,True,10
1,weighted_ensemble_k0_l1,0.92859,0.924169,12.4096,10.880788,11636.095376,0.012044,0.065799,73.334394,1,True,12
2,CatboostClassifier_STACKER_l1,0.928463,0.924272,172.541487,61.862612,16066.165506,0.158302,0.100269,48.869944,1,True,21
3,weighted_ensemble_k0_l2,0.928434,0.924573,201.437364,76.315503,21410.422547,0.005983,0.059843,68.420963,2,True,24
4,LightGBMClassifierXT_STACKER_l1,0.92824,0.924155,173.2748,62.354315,16064.480061,0.891615,0.591972,47.184499,1,True,20
5,LightGBMClassifier_STACKER_l1,0.928217,0.924303,173.087303,62.180227,16055.356868,0.704118,0.417884,38.061306,1,True,19
6,NeuralNetClassifier_STACKER_l1,0.928121,0.922694,182.788095,69.657535,19988.34092,10.40491,7.895192,3971.045358,1,True,22
7,LightGBMClassifierCustom_STACKER_l1,0.927901,0.923987,173.834361,62.624161,16119.800463,1.451175,0.861819,102.504901,1,True,23
8,RandomForestClassifierEntr_STACKER_l1,0.921368,0.91839,186.106546,66.221388,17539.010718,13.723361,4.459045,1521.715156,1,True,14
9,ExtraTreesClassifierEntr_STACKER_l1,0.9193,0.91762,206.683177,68.113938,16643.588454,34.299991,6.351596,626.292892,1,True,16


## Add aditional ensembles

In [18]:
additional_ensembles = predictor.fit_weighted_ensemble(expand_pareto_frontier=True)
print("Alternative ensembles you can use for prediction:", additional_ensembles)

predictor.leaderboard(only_pareto_frontier=True, silent=True)

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\utils\data\X_train.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\utils\data\y_train.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierUnif_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierDist_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l

	0.915	 = Validation roc_auc score
	66.93s	 = Training runtime
	0.06s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Fitting model: weighted_ensemble_custom_pareto10_k0_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto10_k0_l1\utils\model_template.pkl
Ensemble size: 57
Ensemble weights: 
[0.96491228 0.03508772 0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto10_k0_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto10_k0_l1\model.pkl
	0.915	 = Validation roc_auc score
	73.39s	 = Training runtime
	0.06s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Fitting model: weighted_ensemble_custom_pareto11_k0_l2 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_e

	0.9153	 = Validation roc_auc score
	134.27s	 = Training runtime
	0.07s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Fitting model: weighted_ensemble_custom_k0_l2 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_k0_l2\utils\model_template.pkl
Ensemble size: 73
Ensemble weights: 
[0.23287671 0.15068493 0.1369863  0.17808219 0.06849315 0.12328767
 0.08219178 0.02739726 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_k0_l2\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_k0_l2\model.pkl
	0.9153	 = Validation roc_auc score
	139.98s	 = Training runtime
	0.06s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl


Alternative ensembles you can use for prediction: ['weighted_ensemble_custom_pareto1_k0_l1', 'weighted_ensemble_custom_pareto2_k0_l1', 'weighted_ensemble_custom_pareto3_k0_l1', 'weighted_ensemble_custom_pareto4_k0_l1', 'weighted_ensemble_custom_pareto5_k0_l1', 'weighted_ensemble_custom_pareto6_k0_l1', 'weighted_ensemble_custom_pareto7_k0_l1', 'weighted_ensemble_custom_pareto8_k0_l1', 'weighted_ensemble_custom_pareto9_k0_l1', 'weighted_ensemble_custom_pareto10_k0_l1', 'weighted_ensemble_custom_pareto11_k0_l2', 'weighted_ensemble_custom_pareto12_k0_l2', 'weighted_ensemble_custom_pareto13_k0_l2', 'weighted_ensemble_custom_pareto14_k0_l2', 'weighted_ensemble_custom_pareto15_k0_l2', 'weighted_ensemble_custom_pareto16_k0_l2', 'weighted_ensemble_custom_pareto17_k0_l2', 'weighted_ensemble_custom_pareto18_k0_l2', 'weighted_ensemble_custom_pareto19_k0_l2', 'weighted_ensemble_custom_pareto20_k0_l2', 'weighted_ensemble_custom_k0_l2']


Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,weighted_ensemble_custom_k0_l2,0.915336,60.588694,9204.402478,0.063801,139.978314,2,True,45
1,weighted_ensemble_custom_pareto20_k0_l2,0.915336,60.473552,9195.161349,0.065026,134.269739,2,True,44
2,weighted_ensemble_custom_pareto19_k0_l2,0.915335,59.350921,9461.867956,0.062866,126.87881,2,True,43
3,weighted_ensemble_custom_pareto18_k0_l2,0.915313,54.540925,6816.983227,0.063772,120.053896,2,True,42
4,weighted_ensemble_custom_pareto17_k0_l2,0.915313,50.323075,6235.013777,0.06478,113.291757,2,True,41
5,weighted_ensemble_custom_pareto14_k0_l2,0.915284,46.237116,5456.276426,0.063834,93.461995,2,True,38
6,weighted_ensemble_custom_pareto13_k0_l2,0.915276,45.549471,5426.519485,0.064145,93.572446,2,True,37
7,weighted_ensemble_custom_pareto12_k0_l2,0.915237,45.159443,5404.75932,0.062866,87.062913,2,True,36
8,weighted_ensemble_custom_pareto11_k0_l2,0.915179,44.945945,5386.567959,0.065399,79.931511,2,True,35
9,CatboostClassifier_STACKER_l1,0.91509,44.880546,5306.636448,0.069749,38.222966,1,True,21


## Collapsing bagged ensembles via refit_full

In [19]:
refit_model_map = predictor.refit_full()
print("Name of each refit-full model corresponding to a previous bagged ensemble:")
print(refit_model_map)
predictor.leaderboard(test_data, silent=True)

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\utils\data\X_train.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\utils\data\y_train.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\utils\model_template.pkl
Fitting model: RandomForestClassifierGini_FULL_STACKER_l0 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_FULL_STACKER_l0\utils\model_template.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_FULL_STACKER_l0\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_FULL_STACKER_l0\model.pkl
	36.51s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierEntr_STACKER_l0\model.pkl
Loadin

[50]	train_set's binary_logloss: 0.456131
[100]	train_set's binary_logloss: 0.433076
[150]	train_set's binary_logloss: 0.419268
[200]	train_set's binary_logloss: 0.409607
[250]	train_set's binary_logloss: 0.401856
[300]	train_set's binary_logloss: 0.395259
[350]	train_set's binary_logloss: 0.389263
[400]	train_set's binary_logloss: 0.383695
[450]	train_set's binary_logloss: 0.378566
[500]	train_set's binary_logloss: 0.373619


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_FULL_STACKER_l0\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_FULL_STACKER_l0\model.pkl
	4.28s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l0\utils\model_template.pkl
Fitting model: LightGBMClassifierXT_FULL_STACKER_l0 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_FULL_STACKER_l0\utils\model_template.pkl
Training Gradient Boosting Model for 911 rounds...
with the following hyperparameter settings:
{'num_threads': -1, 'objective': 'binary', 'verbose': -1, 'boosting_type': 'gbdt', 'two_round': True, 'extra_trees': True}


[50]	train_set's binary_logloss: 0.467648
[100]	train_set's binary_logloss: 0.448989
[150]	train_set's binary_logloss: 0.438557
[200]	train_set's binary_logloss: 0.431028
[250]	train_set's binary_logloss: 0.425206
[300]	train_set's binary_logloss: 0.420399
[350]	train_set's binary_logloss: 0.416184
[400]	train_set's binary_logloss: 0.412485
[450]	train_set's binary_logloss: 0.409174
[500]	train_set's binary_logloss: 0.406185
[550]	train_set's binary_logloss: 0.403424
[600]	train_set's binary_logloss: 0.400716
[650]	train_set's binary_logloss: 0.398153
[700]	train_set's binary_logloss: 0.395703
[750]	train_set's binary_logloss: 0.39332
[800]	train_set's binary_logloss: 0.391082
[850]	train_set's binary_logloss: 0.388817
[900]	train_set's binary_logloss: 0.386674


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_FULL_STACKER_l0\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_FULL_STACKER_l0\model.pkl
	7.1s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\model_template.pkl
Fitting model: CatboostClassifier_FULL_STACKER_l0 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_FULL_STACKER_l0\utils\model_template.pkl
	Catboost model hyperparameters: {'iterations': 1129, 'learning_rate': 0.1, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'Logloss'}


0:	learn: 0.6579376	total: 33.7ms	remaining: 38s
20:	learn: 0.4955505	total: 661ms	remaining: 34.9s
40:	learn: 0.4769051	total: 1.28s	remaining: 34s
60:	learn: 0.4667674	total: 1.87s	remaining: 32.8s
80:	learn: 0.4598811	total: 2.53s	remaining: 32.8s
100:	learn: 0.4552625	total: 3.1s	remaining: 31.5s
120:	learn: 0.4510204	total: 3.66s	remaining: 30.5s
140:	learn: 0.4473008	total: 4.23s	remaining: 29.6s
160:	learn: 0.4435186	total: 4.82s	remaining: 29s
180:	learn: 0.4403087	total: 5.4s	remaining: 28.3s
200:	learn: 0.4373333	total: 5.97s	remaining: 27.6s
220:	learn: 0.4345257	total: 6.55s	remaining: 26.9s
240:	learn: 0.4320705	total: 7.15s	remaining: 26.3s
260:	learn: 0.4296779	total: 7.76s	remaining: 25.8s
280:	learn: 0.4275728	total: 8.34s	remaining: 25.2s
300:	learn: 0.4255422	total: 8.96s	remaining: 24.6s
320:	learn: 0.4236210	total: 9.58s	remaining: 24.1s
340:	learn: 0.4218121	total: 10.2s	remaining: 23.6s
360:	learn: 0.4200025	total: 10.8s	remaining: 22.9s
380:	learn: 0.4184038	tot

Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_FULL_STACKER_l0\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_FULL_STACKER_l0\model.pkl
	34.85s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\model.pkl


1128:	learn: 0.3791943	total: 34.6s	remaining: 0us


Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\utils\model_template.pkl
Fitting model: NeuralNetClassifier_FULL_STACKER_l0 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_FULL_STACKER_l0\utils\model_template.pkl
AutoGluon Neural Network infers features are of the following types:
{
    "continuous": [
        "                   B_OWNPV_CHI2 ",
        "                 B_IPCHI2_OWNPV ",
        "            Kst_892_0_cosThetaH ",
        "                    piminus_ETA ",
        "                      Kplus_ETA "
    ],
    "skewed": [
        "                 B_FDCHI2_OWNPV ",
        "                   B_DIRA_OWNPV ",
        "                           B_PT ",
        "             Kst_892_0_IP_OWNPV ",
        "                 Kplus_IP_OWNPV ",
        "                        Kplus_P ",
        "               piminus_IP_OWNPV ",
        "                      piminus_P ",
        "                  

[50]	train_set's binary_logloss: 0.485876
[100]	train_set's binary_logloss: 0.448525
[150]	train_set's binary_logloss: 0.428463
[200]	train_set's binary_logloss: 0.413934
[250]	train_set's binary_logloss: 0.401898
[300]	train_set's binary_logloss: 0.391681
[350]	train_set's binary_logloss: 0.382791
[400]	train_set's binary_logloss: 0.374797
[450]	train_set's binary_logloss: 0.367446
[500]	train_set's binary_logloss: 0.360713
[550]	train_set's binary_logloss: 0.354344
[600]	train_set's binary_logloss: 0.348322
[650]	train_set's binary_logloss: 0.342602
[700]	train_set's binary_logloss: 0.337087
[750]	train_set's binary_logloss: 0.331782


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_FULL_STACKER_l0\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_FULL_STACKER_l0\model.pkl
	8.84s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_k0_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_k0_l1\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\oof.pkl
Fitting model: weighted_ensemble_FULL_k0_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\utils\model_template.pkl
Ensemble size: 57
Ensemble weights: 
[0.96491228 0.03508772]
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_STACKER_l0\utils\oof.pkl
Fitting model: ExtraTreesClassifierEntr_FULL_STACKER_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_FULL_STACKER_l1\utils\model_template.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_FULL_STACKER_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_FULL_STACKER_l1\model.pkl
	34.35s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifier

[50]	train_set's binary_logloss: 0.339365


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_FULL_STACKER_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_FULL_STACKER_l1\model.pkl
	1.19s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l1\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\tr

[50]	train_set's binary_logloss: 0.345166
[100]	train_set's binary_logloss: 0.339563


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_FULL_STACKER_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_FULL_STACKER_l1\model.pkl
	1.62s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l1\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\tr

0:	learn: 0.6044329	total: 26.1ms	remaining: 2.58s
20:	learn: 0.3505375	total: 686ms	remaining: 2.58s
40:	learn: 0.3447128	total: 1.29s	remaining: 1.86s
60:	learn: 0.3434010	total: 1.85s	remaining: 1.18s
80:	learn: 0.3425795	total: 2.42s	remaining: 567ms


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_FULL_STACKER_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_FULL_STACKER_l1\model.pkl
	3.21s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l1\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\RandomForestClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trai

99:	learn: 0.3418427	total: 3.01s	remaining: 0us


Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_STACKER_l0\utils\oof.pkl
Fitting model: NeuralNetClassifier_FULL_STACKER_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_FULL_STACKER_l1\utils\model_template.pkl
AutoGluon Neural Network infers features are of the following types:
{
    "continuous": [
        "RandomForestClassifierGini_STACKER_l0",
        "RandomForestClassifierEntr_STACKER_l0",
        "ExtraTreesClassifierGini_STACKER_l0",
        "ExtraTreesClassifierEntr_STACKER_l0",
        "KNeighborsClassifierUnif_STACKER_l0",
        "KNeighborsClassifierDist_STACKER_l0",
        "LightGBMClassifier_S

[50]	train_set's binary_logloss: 0.36971
[100]	train_set's binary_logloss: 0.333349
[150]	train_set's binary_logloss: 0.321181


Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_FULL_STACKER_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_FULL_STACKER_l1\model.pkl
	3.55s	 = Training runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto1_k0_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto1_k0_l1\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_STACKER_l0\utils\oof.pkl
Fitting model: weighted_ensemble_FULL_k0_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\utils\model_template.pkl
Ensemble size: 61
Ensemble weights: 
[0.83606557 0.16393443]
Saving D:\Uni\2020.ml.kaggl

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l0\utils\oof.pkl
Fitting model: weighted_ensemble_FULL_k0_l1 ...
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\utils\model_template.pkl
Ensemble size: 57
Ensemble weights: 
[0.96491228 0.03508772]
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\model.pkl
	0.915	 = Validation roc_auc score
	8.04s	 = Training runtime
	0.07s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l1\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto8_k0_l1\model.pkl
Loading: D:\Uni\202

Ensemble weights: 
[0.48837209 0.3255814  0.18604651]
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\utils\oof.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\model.pkl
	0.9152	 = Validation roc_auc score
	18.32s	 = Training runtime
	0.07s	 = Validation runtime
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\trainer.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto13_k0_l2\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto13_k0_l2\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l1\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_FULL_k0_l2\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto18_k0_l2\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\weighted_ensemble_custom_pareto18_k0_l2\utils\model_template.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\CatboostClassifier_STACKER_l1\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_STACKER_l1\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\NeuralNetClassifier_STACKER_l0\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierXT_STACKER_l1\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifierCustom_STACKER_l1\utils\oof.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_mode

Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_FULL_STACKER_l0\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierGini_FULL_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_FULL_STACKER_l0\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\ExtraTreesClassifierEntr_FULL_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierUnif_FULL_STACKER_l0\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierUnif_FULL_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierDist_FULL_STACKER_l0\model.pkl
Saving D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\KNeighborsClassifierDist_FULL_STACKER_l0\model.pkl
Loading: D:\Uni\2020.ml.kaggle\Marc\\trained_models\models\LightGBMClassifier_FULL_STACKER_l0\model.pkl
Saving D:\Uni\2020.ml.ka

KeyError: 'NeuralNetClassifier_FULL_STACKER_l0'

## Create csv

In [55]:
# We only get the train data set
current_path = os.getcwd()+"\\"
file = os.path.sep.join(["..\\data\\test.csv"])
data = pd.read_csv(current_path + file)
data=data.set_index("Id")

# signal is the target variable
df_test=data.drop(["BUTTER"], axis=1) # Droping BUTTER variable as is IRRELEVANT

In [56]:
df_test.columns = df_test.columns.str.strip()
df_test2=preprocess_data(df_test)
df_test2.columns

Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV',
       ...
       'piminus_IP_OWNPV__X__piminus_P', 'gamma_PT__X__piminus_IP_OWNPV',
       'piminus_ETA__X__piminus_IP_OWNPV', 'Kplus_ETA__X__piminus_IP_OWNPV',
       'gamma_PT__X__piminus_P', 'piminus_ETA__X__piminus_P',
       'Kplus_ETA__X__piminus_P', 'gamma_PT__X__piminus_ETA',
       'Kplus_ETA__X__gamma_PT', 'Kplus_ETA__X__piminus_ETA'],
      dtype='object', length=105)

In [57]:
y_pred = predictor.predict_proba(test_data,model="NeuralNetClassifier_STACKER_l0")
y_pred

array([0.07074718, 0.5916555 , 0.99070597, ..., 0.8971243 , 0.06988458,
       0.26118276], dtype=float32)

In [58]:
results=pd.DataFrame(columns=["Predicted"],data=y_pred)
results.to_csv("AutoGluon_testSize010_randomState0.csv")

## Accuracy

In [23]:
y_test= test["signal"]
y_pred = predictor.predict(test)

In [24]:
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: roc_auc on test data: 0.8671892954929519
Evaluations on test data:
{
    "roc_auc": 0.8671892954929519,
    "accuracy_score": 0.8968253968253969,
    "balanced_accuracy_score": 0.8671892954929519,
    "matthews_corrcoef": 0.7656287191185643,
    "f1_score": 0.8968253968253969
}
Detailed (per-class) classification report:
{
    "0.0": {
        "precision": 0.8936925098554533,
        "recall": 0.9582922361561224,
        "f1-score": 0.9248657102060244,
        "support": 7097
    },
    "1.0": {
        "precision": 0.9045161290322581,
        "recall": 0.7760863548297814,
        "f1-score": 0.8353940116192463,
        "support": 3613
    },
    "accuracy": 0.8968253968253969,
    "macro avg": {
        "precision": 0.8991043194438557,
        "recall": 0.8671892954929519,
        "f1-score": 0.8801298609126353,
        "support": 10710
    },
    "weighted avg": {
        "precision": 0.8973438390884875,
        "recall": 0.8968253968253969,
        "f1-score": 0.89468258

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8968253968253969