In [1]:
'''Main'''
import numpy as np
import pandas as pd
import os

'''Data Viz'''
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import matplotlib as mpl

%matplotlib inline

'''Data Prep'''
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.compose import ColumnTransformer 
from scipy.stats import pearsonr 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import precision_recall_curve, average_precision_score,precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report 

'''Algos'''
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import autogluon.core as ag
from autogluon import TabularPrediction as task
#import xgboost as xgb
#import lightgbm as lgb

# Data Preparation

## Acquire Data

In [2]:
# Load the training set from ../data, resolved against the current working directory.
# `current_path` keeps a trailing separator because later cells concatenate onto it.
current_path = os.getcwd() + os.sep
# Build the relative path with os.path.join so it is not tied to Windows "\\" separators.
file = os.path.join("..", "data", "train.csv")
data = pd.read_csv(current_path + file)
data = data.set_index("Id")

# "signal" is the target variable and stays in the frame; only BUTTER is removed.
df = data.drop(columns=["BUTTER"])  # BUTTER is dropped because the author judged it irrelevant

In [3]:
# Random row-wise hold-out: each row goes to `train` with probability 0.99.
# A dedicated, seeded generator makes the split identical on every
# "Restart & Run All", so scores from different runs are comparable.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df)) < 0.99

train = df[msk]

# Rows not selected by the mask form the hold-out set.
test = df[~msk]

# AutoGluon Initial

In [52]:
# Wrap the training frame in AutoGluon's dataset type.
train_data = task.Dataset(train)
subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
# Clamp the request to the rows actually available: sampling without replacement
# raises when n exceeds the frame length.
n_rows = min(subsample_size, len(train_data))
train_data = train_data.sample(n=n_rows, random_state=0)

In [60]:
# Name of the target column used by every fit/evaluate call below.
label_column = 'signal'
# Descriptive statistics of the target in the current training frame.
label_summary = train_data[label_column].describe()
print("Summary of class variable: \n", label_summary)

Summary of class variable: 
 count    210568.000000
mean          0.334001
std           0.471641
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: signal, dtype: float64


In [30]:
# Directory where AutoGluon saves the fitted models.
# os.path.join avoids the doubled separator that string concatenation produced
# (current_path already ends with one) and is not tied to Windows "\\".
# NOTE: `dir` shadows the built-in dir(); the name is kept because the later
# loading cell reads it.
dir = os.path.join(current_path, "trained_models")
# Fit with AutoGluon defaults on the (subsampled) training data.
predictor = task.fit(train_data=train_data, label=label_column, output_directory=dir)

Beginning AutoGluon training ...
AutoGluon will save models to C:\Users\garciagr\OneDrive - HP Inc\DSUB\MachineLearning\Boson\2020.ml.kaggle\Marc\\trained_models\
AutoGluon Version:  0.0.14
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    39187.19 MB
	Train Data (Original)  Memory Usage: 0.06 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.


In [32]:
# Wrap the hold-out rows in AutoGluon's dataset type.
test_data = task.Dataset(test)
# Ground-truth values the predictions will be compared against.
y_test = test_data[label_column]
# Feature-only copy: the label column is removed to prove we're not cheating.
test_data_nolab = test_data.drop(columns=[label_column])

In [39]:
# Reloading is not required here; it only demonstrates how a previously
# trained predictor is restored from its output directory.
predictor = task.load(dir)

# Predict on the label-free hold-out frame and show the result.
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  ", y_pred)

# Compare predictions with the held-out labels, including auxiliary metrics.
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

Evaluation: accuracy on test data: 0.7050730063646574
Evaluations on test data:
{
    "accuracy": 0.7050730063646574,
    "accuracy_score": 0.7050730063646574,
    "balanced_accuracy_score": 0.5768322523659386,
    "matthews_corrcoef": 0.2510302944295476,
    "f1_score": 0.7050730063646574
}
Detailed (per-class) classification report:
{
    "0.0": {
        "precision": 0.7050793323717288,
        "recall": 0.9595485137408861,
        "f1-score": 0.812863760541632,
        "support": 14264
    },
    "1.0": {
        "precision": 0.7050102249488752,
        "recall": 0.194115990990991,
        "f1-score": 0.30441501103752766,
        "support": 7104
    },
    "accuracy": 0.7050730063646574,
    "macro avg": {
        "precision": 0.705044778660302,
        "recall": 0.5768322523659386,
        "f1-score": 0.5586393857895798,
        "support": 21368
    },
    "weighted avg": {
        "precision": 0.7050563569350033,
        "recall": 0.7050730063646574,
        "f1-score": 0.6438250

Predictions:   [0 1 1 ... 0 0 0]


In [35]:
# Per-model comparison table evaluated on the hold-out frame (labels included);
# silent=True suppresses the printed copy so only the returned table is displayed.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesClassifierEntr,0.739985,0.71,0.224589,0.105948,0.302738,0.224589,0.105948,0.302738,0,True,4
1,ExtraTreesClassifierGini,0.739798,0.71,0.226053,0.105911,0.345051,0.226053,0.105911,0.345051,0,True,3
2,LightGBMClassifierXT,0.739657,0.74,0.049918,0.003709,0.274369,0.049918,0.003709,0.274369,0,True,8
3,RandomForestClassifierGini,0.738628,0.7,0.114872,0.105667,0.337162,0.114872,0.105667,0.337162,0,True,1
4,RandomForestClassifierEntr,0.737973,0.69,0.123669,0.105312,0.327921,0.123669,0.105312,0.327921,0,True,2
5,CatboostClassifier,0.737364,0.72,0.008056,0.003978,0.9158,0.008056,0.003978,0.9158,0,True,9
6,NeuralNetClassifier,0.718925,0.72,0.524147,0.012079,5.260593,0.524147,0.012079,5.260593,0,True,10
7,LightGBMClassifierCustom,0.70994,0.73,0.015163,0.002543,0.3542,0.015163,0.002543,0.3542,0,True,11
8,weighted_ensemble_k0_l1,0.705073,0.77,0.024959,0.006476,0.883696,0.003812,0.0,0.272763,1,True,12
9,LightGBMClassifier,0.704558,0.76,0.005984,0.003932,0.256732,0.005984,0.003932,0.256732,0,True,7


# AutoGluon Maximize Accuracy

In [15]:
# Rebuild the datasets from the full split (no subsampling this time).
train_data = task.Dataset(train)
test_data = task.Dataset(test)
label_column = "signal"
# os.path.join avoids the doubled separator that string concatenation produced
# (current_path already ends with one) and is not tied to Windows "\\".
# NOTE(review): this is the same directory the initial run wrote to — confirm
# that reusing it for this run is intended.
output_directory = os.path.join(current_path, "trained_models")

In [16]:
# Training budget in seconds; for a quick demonstration only — set it to the
# longest time you are willing to wait.
time_limits = 1800
# Evaluation metric AutoGluon should optimise.
metric = 'roc_auc'
# Alternative preset: "medium_quality_faster_train" produces less accurate
# models but facilitates faster prototyping.
presets = 'best_quality'

# Collect the tunable options in one place, then launch the fit.
fit_options = dict(
    time_limits=time_limits,
    eval_metric=metric,
    presets=presets,
    output_directory=output_directory,
)
predictor = task.fit(train_data=train_data, label=label_column, **fit_options)

Beginning AutoGluon training ... Time limit = 4500s
AutoGluon will save models to C:\Users\garciagr\OneDrive - HP Inc\DSUB\MachineLearning\Boson\2020.ml.kaggle\Marc\\trained_models\
AutoGluon Version:  0.0.14
Train Data Rows:    210568
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    35718.37 MB
	Train Data (Original)  Memory Usage: 25.27 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special d

[Errno 28] No space left on device
Fitting model: LightGBMClassifier_STACKER_l0 ... Training model for up to 639.08s of the 2888.77s of remaining time.
	0.8679	 = Validation roc_auc score
	46.83s	 = Training runtime
	1.78s	 = Validation runtime
Fitting model: LightGBMClassifierXT_STACKER_l0 ... Training model for up to 589.57s of the 2839.26s of remaining time.


[1000]	train_set's binary_logloss: 0.379839	valid_set's binary_logloss: 0.416024
[1000]	train_set's binary_logloss: 0.379057	valid_set's binary_logloss: 0.423532
[1000]	train_set's binary_logloss: 0.379528	valid_set's binary_logloss: 0.417542


	0.8704	 = Validation roc_auc score
	79.53s	 = Training runtime
	3.33s	 = Validation runtime
Fitting model: CatboostClassifier_STACKER_l0 ... Training model for up to 505.37s of the 2755.06s of remaining time.
	0.8702	 = Validation roc_auc score
	293.18s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: NeuralNetClassifier_STACKER_l0 ... Training model for up to 211.81s of the 2461.5s of remaining time.
	Ran out of time, stopping training early.
	Time limit exceeded... Skipping NeuralNetClassifier_STACKER_l0.
Fitting model: LightGBMClassifierCustom_STACKER_l0 ... Training model for up to 188.82s of the 2438.51s of remaining time.
	0.8685	 = Validation roc_auc score
	107.87s	 = Training runtime
	4.27s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: weighted_ensemble_k0_l1 ... Training model for up to 360.0s of the 2322.9s of remaining time.
	0.8719	 = Validation roc_auc score
	55.87s	 = Training runtime
	0.07s	 = Validation runtime
Fitting mo

[Errno 28] No space left on device
Fitting model: KNeighborsClassifierDist_STACKER_l1 ... Training model for up to 526.61s of the 526.56s of remaining time.
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 856, in _train_and_save
    model = self._train_single(X_train, y_train, model, X_val, y_val, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 825, in _train_single
    model.fit(X=X_train, y=y_train, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\abstract\abstract_model.py", line 233, in fit
    self._fit(**kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\ensemble\stacker_ensemble_model.py", line 129, in _fit
    super()._fit(X=X, y=y, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_

[Errno 28] No space left on device
Fitting model: LightGBMClassifierXT_STACKER_l1 ... Training model for up to 522.36s of the 522.31s of remaining time.
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 856, in _train_and_save
    model = self._train_single(X_train, y_train, model, X_val, y_val, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 825, in _train_single
    model.fit(X=X_train, y=y_train, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\abstract\abstract_model.py", line 233, in fit
    self._fit(**kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\ensemble\stacker_ensemble_model.py", line 129, in _fit
    super()._fit(X=X, y=y, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repe

[Errno 28] No space left on device
Fitting model: LightGBMClassifierCustom_STACKER_l1 ... Training model for up to 519.71s of the 519.66s of remaining time.
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 856, in _train_and_save
    model = self._train_single(X_train, y_train, model, X_val, y_val, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\trainer\abstract_trainer.py", line 825, in _train_single
    model.fit(X=X_train, y=y_train, **model_fit_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\abstract\abstract_model.py", line 233, in fit
    self._fit(**kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\autogluon\utils\tabular\ml\models\ensemble\stacker_ensemble_model.py", line 129, in _fit
    super()._fit(X=X, y=y, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_

OSError: [Errno 28] No space left on device

In [None]:
# Per-model comparison table on the hold-out frame for the run above.
# NOTE(review): the recorded output of the preceding fit ends with
# "OSError: [Errno 28] No space left on device", so `predictor` may not hold
# the best-quality models — confirm the fit completed before trusting this table.
predictor.leaderboard(test_data, silent=True)

## Create CSV

In [10]:
# Load the test set (../data/test.csv) that the submission is generated for.
# `current_path` keeps a trailing separator, matching how it is used elsewhere.
current_path = os.getcwd() + os.sep
# Build the relative path with os.path.join so it is not tied to Windows "\\" separators.
file = os.path.join("..", "data", "test.csv")
data = pd.read_csv(current_path + file)
data = data.set_index("Id")

# Apply the same feature selection as for training: drop BUTTER.
# NOTE(review): presumably test.csv has no "signal" column — confirm.
df_test = data.drop(columns=["BUTTER"])  # BUTTER is dropped because the author judged it irrelevant

In [13]:
# Predicted probabilities for the submission rows. This rebinds `y_pred`,
# which earlier held the class predictions for the hold-out split.
y_pred = predictor.predict_proba(df_test)
# Bare last expression so the notebook displays the result.
y_pred

array([0.09712674, 0.01428145, 0.10178402, ..., 0.8368763 , 0.66847175,
       0.04810874], dtype=float32)

In [14]:
# Build the submission table keyed by the test frame's "Id" index, so each
# probability stays attached to the row it was predicted for. Without an
# explicit index the CSV would carry a positional 0..n-1 index instead.
# np.asarray drops any index the predictions might carry so values are
# assigned by position, in the same order as df_test.
results = pd.DataFrame({"Predicted": np.asarray(y_pred)}, index=df_test.index)
results.to_csv("WinResults2.csv")