In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import seaborn as sns

from sklearn.preprocessing import Normalizer, Binarizer, MinMaxScaler, \
    QuantileTransformer, StandardScaler, KernelCenterer, RobustScaler
from sklearn.model_selection import train_test_split

import autogluon.core as ag
from autogluon import TabularPrediction as task

# Data Preparation

## Acquire Data

**Loading the data and remove butter**

In [53]:
# We only get the train data set
current_path = os.getcwd()+"\\"
file = os.path.sep.join(["..\\data\\train.csv"])
file_test= os.path.sep.join(["..\\data\\test.csv"])

df = pd.read_csv(file, index_col=0)
df_eval = pd.read_csv(file_test, index_col=0)

df.columns = df.columns.str.strip()
df_eval.columns = df_eval.columns.str.strip()

del df['BUTTER'] # WTF BUTTER!
del df_eval['BUTTER'] # WTF BUTTER!

print(df.columns)

Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV', 'piminus_P', 'gamma_PT', 'piminus_ETA',
       'Kplus_ETA', 'signal'],
      dtype='object')


**Feature extraction**. We create new variable by multplying the existing ones, doing the log and the exp (when possible)

In [54]:
def preprocess_data(df):
    df2 = df.copy()
    
# #     Not using EXP / LOG for now
    for col in df.columns:
        exp=np.exp(df[col])
        log=np.log(df[col])
        if col!="signal":
            if not (np.nan in exp) and not(np.abs(exp.max())>=1e12):
                df2['EXP_'+col] = exp
            if not (np.nan in log) and not(np.abs(log.max())>=1e12):
                df2['LOG_'+col] = log

    # TODO: idea add a new variable that is the P(signal) for that particular line
    # I.e. always P(signal=1)
    
#     for col in df.columns:
#         if col!="signal":
#             df[f"{col}__SQR"] = df[col] ** 2
    
    done = set()
    for col1 in df.columns:
        if col1!="signal":
            for col2 in df.columns:
                if col2!="signal":
                    colname = '__X__'.join(sorted((col1,col2)))
                    if col1 != col2 and colname not in done:
                        df2[colname] = df[col1] * df[col2]
                        done.add(colname)
    
    return df2

# AutoGluon Maxime accuracy

In [56]:
df2=preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(df2, df2.signal, test_size=0.10, random_state=0)

train_data = task.Dataset(X_train)

test_data = task.Dataset(X_test)
label_column="signal"
output_directory=current_path+ "\\trained_models"

In [57]:
#time_limits = 36000 # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)
metric = 'roc_auc' # specify your evaluation metric here
#presets="medium_quality_faster_train" # produces less accurate models but facilitates faster prototyping
presets='best_quality'
predictor = task.fit(train_data=train_data, label=label_column,
                     eval_metric=metric, presets=presets,output_directory=output_directory,auto_stack=True)

Beginning AutoGluon training ...
AutoGluon will save models to D:\Uni\2020.ml.kaggle\Marc\\trained_models\
AutoGluon Version:  0.0.14
Train Data Rows:    191395
Train Data Columns: 127
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    31376.15 MB
	Train Data (Original)  Memory Usage: 195.99 MB (0.6% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenera

In [58]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetClassifier_STACKER_l0,0.92946,0.924122,9.591411,8.798091,10977.823747,9.591411,8.798091,10977.823747,0,True,10
1,weighted_ensemble_k0_l1,0.929071,0.924413,15.977232,13.882829,11996.301516,0.02167,0.059899,66.380295,1,True,12
2,weighted_ensemble_k0_l2,0.928924,0.924944,175.690434,70.985791,20776.351445,0.005024,0.060833,66.636158,2,True,24
3,CatboostClassifier_STACKER_l1,0.928867,0.924571,163.289506,60.172182,16412.37856,0.154584,0.106234,56.191855,1,True,21
4,LightGBMClassifierCustom_STACKER_l1,0.928742,0.924486,164.462438,61.009518,16476.738297,1.327515,0.94357,120.551592,1,True,23
5,LightGBMClassifierXT_STACKER_l1,0.928631,0.924481,163.945767,60.61497,16407.847849,0.810845,0.549022,51.661144,1,True,20
6,LightGBMClassifier_STACKER_l1,0.928584,0.92465,163.7867,60.533722,16400.69338,0.651778,0.467774,44.506675,1,True,19
7,NeuralNetClassifier_STACKER_l1,0.928572,0.922878,172.740687,68.85836,20436.804021,9.605765,8.792411,4080.617316,1,True,22
8,RandomForestClassifierEntr_STACKER_l1,0.921197,0.918306,175.596123,64.570962,18014.051199,12.461201,4.505014,1657.864494,1,True,14
9,RandomForestClassifierGini_STACKER_l1,0.918481,0.916197,177.803139,65.194052,17556.567126,14.668217,5.128104,1200.380421,1,True,13


## Create csv

In [59]:
# We only get the train data set
current_path = os.getcwd()+"\\"
file = os.path.sep.join(["..\\data\\test.csv"])
data = pd.read_csv(current_path + file)
data=data.set_index("Id")

# signal is the target variable
df_test=data.drop(["BUTTER"], axis=1) # Droping BUTTER variable as is IRRELEVANT

In [65]:
df_test.columns = df_test.columns.str.strip()
df_test2=preprocess_data(df_test)
df_test2.columns

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV',
       ...
       'piminus_IP_OWNPV__X__piminus_P', 'gamma_PT__X__piminus_IP_OWNPV',
       'piminus_ETA__X__piminus_IP_OWNPV', 'Kplus_ETA__X__piminus_IP_OWNPV',
       'gamma_PT__X__piminus_P', 'piminus_ETA__X__piminus_P',
       'Kplus_ETA__X__piminus_P', 'gamma_PT__X__piminus_ETA',
       'Kplus_ETA__X__gamma_PT', 'Kplus_ETA__X__piminus_ETA'],
      dtype='object', length=127)

In [66]:
y_pred = predictor.predict_proba(test_data,model="NeuralNetClassifier_STACKER_l0")
y_pred

array([0.01223109, 0.5490074 , 0.99228776, ..., 0.88809615, 0.0685072 ,
       0.25073302], dtype=float32)

In [67]:
results=pd.DataFrame(columns=["Predicted"],data=y_pred)
results.to_csv("AutoGluon_testSize010_randomState0_v2.csv")