# 04 - Model Building

In [83]:
# Fraction (0-1, not a percentage) of the labelled data used for training;
# the remaining share is held out as the test split.
train_dataset_size = 0.7
# If False, previously pickled models are loaded from disk instead of being trained.
should_build_model = False
# If True, each freshly trained model is pickled to disk
# (only consulted when should_build_model is True).
should_save_model = True

## Setup

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml
import warnings

# Show every column and use a wide console so wide frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

# NOTE(review): DEBUG is not referenced in the visible part of this notebook.
DEBUG = True
# Shared random_state for the train/test split and the seeded estimators.
SEED = 666

In [85]:
# File names of the pickled frames read from ROOT + "data/".
DATASET = "df_processed.pkl"
SCORE_DATASET = "df_score_processed.pkl"

import os, sys
# True only inside Google Colab, where the `google.colab` module is already loaded.
COLAB = 'google.colab' in sys.modules
# Base directory for every path in this notebook. It must end with a separator
# because paths are built by plain string concatenation (ROOT + "data/...").
ROOT = "./"

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # os.makedirs creates the intermediate `datasets` folder too. The previous
  # explicit creation step concatenated "./" with an absolute path and therefore
  # created a stray relative `./content/gdrive/...` tree instead.
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory `ROOT + d` (and any missing parents) if it does not exist.

  Args:
    d: Directory path relative to the module-level `ROOT` prefix.

  Raises:
    FileExistsError: If `ROOT + d` exists but is not a directory.
  """
  # One call covers both the Colab and the local case: 0o777 is already the
  # default mode, and exist_ok=True replaces the racy isdir-then-create check.
  os.makedirs(ROOT + d, mode=0o777, exist_ok=True)

# Make sure every working directory used by this notebook exists under ROOT.
for d in ('orig', 'data', 'output', 'submissions'):
  makedirs(d)

## Import Data & Features

In [86]:
# Load the two pickled frames named by DATASET and SCORE_DATASET from ROOT + "data/".
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
df = pd.read_pickle(ROOT+"data/"+DATASET)
df_score = pd.read_pickle(ROOT+"data/"+SCORE_DATASET)

# NOTE(review): `dfs` is not referenced again in the visible part of this notebook.
dfs = [df, df_score]

print(df.shape)
df.head()

(209672, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
0,0.577321,0,-1.528407,0.574591,-0.371609,0,0.769142,1,0,-0.564334,-1.670166,1,0.30643,0.430979,0.331989,0.560323,48,1.064115,0,2.123352,0.331989,3,0.822184,-1.670166,-1.636594,0.342279,-1.105716,0.925453,0.790756,-3.02696,33,-1.315302,0.696806,-0.371609,9,14,-1.105716,-0.986619,0.94646,0.925453,4,0.296706,0,0.553993,3,2.884178,5,0,2.123352,1.721216,-1.509301,-0.050747,-2.072471,1,0.812313,4.148009,0.948791,1.0,0.37167,1.064115,0.646152,0.907293,4,0,-1.038661,0.368816,-2.072471,2.422425,1,0.37167,0,234,0.0,19,0,5,-0.036157,0.815055,0.184478,3,0.548773,0.674836,0,0.315166,-0.842786,-2.302796,-0.099039,0.68244,3,1.32449,2.017993,1.32449,1,0.553993,-0.254312,-4.850562,0.07747,4,-0.242058,0.616705,0.577321,1,-3.975136,0.187446,0.368816,0.961387,9,6,2.024803,1,62,0,0.790756,0.436108,-2.839741,1.752665,0.8825,0.749183,-0.289017,0.187446,2.024803,0.0,0.738336,0,42,2,1,0.696806,1.721216,23,0.57797,-2.237272,39,1,0.0,2.64866,-1.322781,1,1,0.856375,0.674836,0.646152,-3.975136,-1.685895,0.856375,2.884178,-0.842786,-0.050747,-0.099039,-4.850562,1,-1.038661,69,0.749183,0.342279,-1.509301,2.422425,3,33,0.702792,0.548773,0.701321,-0.444765,10,Class_2
1,0.413242,0,-0.072692,0.567419,0.206837,0,-0.164559,0,0,-0.956555,4.030491,1,0.279233,0.512899,0.771122,0.183065,48,1.282373,0,-2.110657,0.771122,3,0.281779,4.030491,4.146307,0.39322,4.386999,0.528352,0.255444,2.83781,33,-7.434869,0.703253,0.206837,12,10,4.386999,-4.222262,0.347965,0.528352,5,-0.98557,0,0.354723,3,2.755122,27,8,-2.110657,4.238453,-0.097172,7.098692,0.803323,1,0.47527,-5.056917,0.328198,1.0,0.423728,1.282373,0.255066,0.493516,9,0,-5.380026,0.636534,0.803323,2.666922,1,0.423728,0,14,0.0,2,0,5,3.308104,0.359323,0.555923,2,1.914999,0.275613,0,0.639203,-0.820514,-3.699545,1.796063,0.795767,3,0.325121,-1.564933,0.325121,1,0.354723,-0.752444,-3.079449,1.073057,2,-1.637676,0.547283,0.413242,0,0.04194,0.495975,0.636534,0.283092,10,12,-1.353885,0,57,0,0.255444,0.367446,-0.256603,-0.603172,0.317201,0.908436,0.347056,0.495975,-1.353885,0.0,0.250319,10,141,2,0,0.703253,4.238453,45,0.365215,2.986686,25,1,0.0,-3.908599,-0.592003,1,1,-0.575467,0.275613,0.255066,0.04194,-3.751033,-0.575467,2.755122,-0.820514,7.098692,1.796063,-3.079449,1,-5.380026,54,0.908436,0.39322,-0.097172,2.666922,42,33,0.661232,1.914999,0.934367,0.524791,5,Class_2
2,-0.992723,0,5.089105,0.344148,1.052436,0,-1.066716,0,0,-0.852081,-5.396598,0,0.349402,0.541646,1.252255,1.696027,48,2.988087,3,-3.721994,1.252255,1,0.298688,-5.396598,0.240343,0.701801,-3.333315,0.331816,0.372463,1.00268,33,-7.104154,0.607222,1.052436,10,14,-3.333315,-4.765734,0.369349,0.331816,7,-2.53848,0,0.680543,3,-0.756906,25,0,-3.721994,1.923408,-1.279555,-1.710899,0.534373,0,0.455945,-2.741101,0.462819,0.0,0.419382,2.988087,0.618782,0.471865,9,5,5.444011,0.304979,0.534373,-2.901062,1,0.419382,0,150,0.0,2,0,5,0.002195,0.736656,0.572382,1,0.277789,0.379108,0,0.468445,-0.690706,-0.935914,-0.295098,0.683867,3,2.455901,1.120227,2.455901,1,0.680543,1.870748,0.884827,0.514067,9,-1.663241,0.478182,-0.992723,0,1.42342,0.821683,0.304979,0.76215,2,6,-0.507181,1,57,0,0.372463,0.332684,-2.889505,3.301401,0.630255,0.573692,-0.919568,0.821683,-0.507181,0.0,0.466936,11,18,1,1,0.607222,1.923408,12,0.320719,0.106202,14,1,1.0,-3.103558,0.369791,1,0,-1.320967,0.379108,0.618782,1.42342,-2.608896,-1.320967,-0.756906,-0.690706,-1.710899,-0.295098,0.884827,1,5.444011,147,0.573692,0.701801,-1.279555,-2.901062,47,33,0.795673,0.277789,0.479354,0.829004,10,Class_1
3,0.37342,0,-4.808461,0.25438,-0.435051,0,1.314414,0,0,-0.450406,-6.630857,0,0.437356,0.803078,0.189292,-0.136253,48,0.887608,0,-0.374439,0.189292,1,0.51168,-6.630857,0.129764,0.946983,0.564304,0.329405,0.383862,-0.636269,33,-3.25519,0.596949,-0.435051,11,14,0.564304,-2.792848,0.221831,0.329405,3,-6.003766,0,0.122344,3,3.161273,5,9,-0.374439,4.803832,0.577315,-5.38227,-0.996981,1,0.202926,-2.469229,0.259328,0.0,0.355687,0.887608,0.634592,0.66319,9,2,1.69319,0.484986,-0.996981,-0.787452,1,0.355687,0,194,0.0,14,0,4,-3.770526,0.149882,0.936328,1,0.146524,0.435222,0,0.366162,-0.383025,-2.509186,-0.480305,0.202493,3,-0.89824,0.267738,-0.89824,1,0.122344,-1.532212,-5.342407,-0.637505,4,2.004994,0.486493,0.37342,0,-0.944536,0.261193,0.484986,0.285201,10,12,0.611588,1,62,0,0.383862,0.401932,-0.083454,2.130718,0.231694,0.299336,1.314217,0.261193,0.611588,0.0,0.301979,8,213,2,0,0.596949,4.803832,49,0.307328,-0.497761,44,2,0.0,0.305652,0.995479,1,0,-1.362534,0.435222,0.634592,-0.944536,0.004034,-1.362534,3.161273,-0.383025,-5.38227,-0.480305,-5.342407,1,1.69319,147,0.299336,0.946983,0.577315,-0.787452,33,33,0.712934,0.146524,0.473989,0.96382,10,Class_2
4,-0.69428,0,-3.944867,0.774119,-1.376659,0,-1.013648,1,0,0.970933,-5.991751,1,0.68871,0.516681,-0.378682,-0.144624,6,1.146451,0,-0.330101,-0.378682,1,0.591218,-5.991751,-1.675724,0.509486,3.870937,0.54053,0.430815,-2.246914,33,-8.806784,0.380222,-1.376659,14,14,3.870937,-2.12218,0.604214,0.54053,4,-0.261959,0,0.736003,3,5.771128,7,0,-0.330101,-2.526599,-3.136469,-3.095688,-0.892518,0,0.522699,-0.713184,0.446525,0.0,0.31392,1.146451,0.71696,0.480881,9,0,-0.740941,0.352894,-0.892518,-0.933163,1,0.31392,8,89,1.0,14,0,4,4.079695,0.359671,0.778319,0,0.595355,0.803919,0,0.891468,2.152445,-9.905907,2.497357,0.619482,1,-1.857226,1.317192,-1.857226,1,0.736003,-1.056155,-0.98809,0.662834,2,2.974653,0.493714,-0.69428,0,-1.135468,0.951759,0.352894,0.891473,9,6,0.028371,0,57,0,0.430815,0.68606,-0.369468,-2.051631,0.678317,0.60645,2.960185,0.951759,0.028371,0.0,0.830762,0,22,2,1,0.380222,-2.526599,12,0.731859,1.19854,11,1,0.0,-3.121829,-1.034903,1,1,-0.346098,0.803919,0.71696,-1.135468,-1.176002,-0.346098,5.771128,2.152445,-3.095688,2.497357,-0.98809,1,-0.740941,163,0.60645,0.509486,-3.136469,-0.933163,57,2,0.722723,0.595355,0.468482,0.998291,5,Class_2


In [87]:
# Read the feature/target metadata from ROOT + "data/features.yaml".
# NOTE(review): yaml.load with FullLoader can construct more than plain data types;
# if this file only holds strings and lists, yaml.safe_load would be the safer choice — confirm.
with open(ROOT+"data/features.yaml") as file:
    yml_obj = yaml.load(file, Loader=yaml.FullLoader)

target = yml_obj["target"]
features = yml_obj["features"]
numerical_features = yml_obj["numerical_features"]
categorical_features = yml_obj["categorical_features"]

# Distinct target values in sorted order.
target_labels = sorted(df[target].unique())

print(f"Target: {target}")
print(f"Features: {features}")
print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")
print(f"Target Labels: {target_labels}")
# The check only prints True/False; it does not stop execution when the counts disagree.
print(f"Passed the sanity check: {len(numerical_features) + len(categorical_features) == len(features)}")

Target: Target
Features: ['x000', 'x001', 'x002', 'x003', 'x004', 'x005', 'x006', 'x007', 'x008', 'x009', 'x010', 'x011', 'x012', 'x013', 'x014', 'x015', 'x016', 'x017', 'x018', 'x019', 'x020', 'x021', 'x022', 'x023', 'x024', 'x025', 'x026', 'x027', 'x028', 'x029', 'x030', 'x031', 'x032', 'x033', 'x034', 'x035', 'x036', 'x037', 'x038', 'x039', 'x040', 'x041', 'x042', 'x043', 'x044', 'x045', 'x046', 'x047', 'x048', 'x049', 'x050', 'x051', 'x052', 'x053', 'x054', 'x055', 'x056', 'x057', 'x058', 'x059', 'x060', 'x061', 'x062', 'x063', 'x064', 'x065', 'x066', 'x067', 'x068', 'x069', 'x070', 'x071', 'x072', 'x073', 'x074', 'x075', 'x076', 'x077', 'x078', 'x079', 'x080', 'x081', 'x082', 'x083', 'x084', 'x085', 'x086', 'x087', 'x088', 'x089', 'x090', 'x091', 'x092', 'x093', 'x094', 'x095', 'x096', 'x097', 'x098', 'x099', 'x100', 'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x107', 'x108', 'x109', 'x110', 'x111', 'x112', 'x113', 'x114', 'x115', 'x116', 'x117', 'x118', 'x119', 'x120', 'x121'

## Preparation

### Imports

In [88]:
# Imports

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

### Methods

In [89]:
def submission(y_pred_prob, name=None):
    """Write class-probability predictions to `ROOT + 'output/<name>.csv'`.

    NOTE(review): this notebook defines `submission` again in the
    "Generate Submission File" section, where the file is written under
    `submissions/` instead; that later definition replaces this one.

    Args:
        y_pred_prob: 2-D array-like of probabilities with one column per entry
            of the module-level `target_labels`, in that order.
        name: File name without the `.csv` extension. When omitted, a name of the
            form `submission-<UTC timestamp>` is generated at call time.
    """
    if name is None:
        # Build the timestamp per call. A default written in the signature is
        # evaluated only once, when the function is defined, so every
        # default-named call would otherwise reuse (and overwrite) one file.
        name = f"submission-{pd.to_datetime('now', utc=True).strftime('%Y%m%d%H%M%S')}"
    df_sub = pd.DataFrame(y_pred_prob, columns=target_labels)
    df_sub.index.name = 'id'
    df_sub.to_csv(ROOT+f'output/{name}.csv', index=True)
    print(f"Saved ({df_sub.shape[0]} rows) to: {ROOT}output/{name}.csv")

### Data Splitting

Data is being split into training and testing sets. The training set will be used to train the model, while the testing set will be used to evaluate the model.

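The following dataframes are created:
- `df_train`: Rows (features and target) of the training set
- `df_test`: Rows (features and target) of the testing set

The feature matrices (`x_train`, `x_test`) and the encoded targets (`y_train`, `y_test`) are derived from these two dataframes in the Scaling, Encoding and Merge steps below.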

Stratified sampling is used to ensure that the target distribution is the same in both the training and testing sets. (The EDA notebook showed a small imbalance in the target distribution.)

In [90]:
# Stratified hold-out split; the test share is the complement of train_dataset_size.
df_train, df_test = train_test_split(
    df,
    test_size=1-train_dataset_size,
    random_state=SEED,
    stratify=df[target],
)

# Put each partition back into ascending index order.
df_train = df_train.sort_index()
df_test = df_test.sort_index()

print(df_train.shape, df_test.shape, df_score.shape)

(146770, 165) (62902, 165) (90000, 164)


### Scaling

The features are scaled using the `StandardScaler` from `sklearn.preprocessing`

In [91]:
# Fit the scaler on the training rows only, then apply the same transform
# to the train, test and score frames.
ss = StandardScaler().fit(df_train[numerical_features])

x_train_num, x_test_num, x_score_num = (
    ss.transform(frame[numerical_features])
    for frame in (df_train, df_test, df_score)
)

### Encoding

- The target feature is encoded using the `LabelEncoder` from `sklearn.preprocessing`
- The categorical features are encoded using the `OneHotEncoder` from `sklearn.preprocessing`

In [92]:
# The label encoder is fitted on the target column of the full labelled frame,
# then used to encode the target of each partition.
le = LabelEncoder().fit(df[target])

y_train, y_test = (le.transform(frame[target]) for frame in (df_train, df_test))

In [93]:
# Learn the category vocabulary from the training rows, then encode all three frames.
# NOTE(review): with default settings the encoder presumably raises on a category
# that was not seen during fit; consider handle_unknown="ignore" if that can occur — confirm.
ohe = OneHotEncoder().fit(df_train[categorical_features])

x_train_cat, x_test_cat, x_score_cat = (
    ohe.transform(frame[categorical_features])
    for frame in (df_train, df_test, df_score)
)

### Merge Encoded and Scaled Features

In [94]:
# Column layout of the merged matrices: scaled numerical columns first,
# followed by the one-hot columns in the encoder's own order.
ohe_feature_names = ohe.get_feature_names_out(categorical_features)
column_names = [*numerical_features, *ohe_feature_names]

# The encoded categorical blocks are densified with .toarray() before being
# concatenated column-wise with the scaled numerical blocks.
x_train = pd.DataFrame(
    np.concatenate([x_train_num, x_train_cat.toarray()], axis=1),
    columns=column_names,
)
x_test = pd.DataFrame(
    np.concatenate([x_test_num, x_test_cat.toarray()], axis=1),
    columns=column_names,
)
x_score = pd.DataFrame(
    np.concatenate([x_score_num, x_score_cat.toarray()], axis=1),
    columns=column_names,
)

In [95]:
# Quick check that all three matrices ended up with the same number of columns.
print(x_train.shape, x_test.shape, x_score.shape)

x_train.head()

(146770, 210) (62902, 210) (90000, 210)


Unnamed: 0,x000,x001,x002,x003,x004,x006,x009,x010,x012,x013,x014,x015,x016,x017,x019,x020,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x054,x055,x056,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x069,x070,x071,x073,x075,x076,x077,x078,x079,x080,x081,x083,x084,x085,x086,x087,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x102,x103,x104,x105,x106,x107,x108,x110,x112,x113,x114,x115,x116,x117,x118,x119,x120,x122,x123,x124,x127,x128,x129,x130,x131,x132,x135,x136,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,x005_0,x005_1,x005_2,x005_3,x007_0,x007_1,x008_0,x008_1,x011_0,x011_1,x018_0,x018_1,x018_2,x018_3,x021_0,x021_1,x021_2,x021_3,x053_0,x053_1,x057_0.0,x057_1.0,x068_0,x068_1,x068_2,x068_3,x072_0.0,x072_1.0,x072_2.0,x072_4.0,x074_0,x074_1,x082_0,x082_1,x082_2,x082_3,x088_0,x088_1,x088_2,x088_3,x101_0,x101_1,x109_0,x109_1,x111_0,x111_1,x111_2,x111_3,x121_0.0,x121_1.0,x125_0,x125_1,x125_2,x125_3,x126_0,x126_1,x133_0,x133_1,x133_2,x133_3,x134_0.0,x134_1.0,x137_0,x137_1,x138_0,x138_1,x150_0,x150_1,x150_2,x150_3
0,0.042872,-0.58312,-0.400982,0.536947,-0.372261,0.426217,-0.559965,-0.103054,-0.979858,-0.321422,0.331593,0.563381,0.44615,0.036221,0.937988,0.331102,1.374495,-0.103101,-0.572559,-0.722095,-0.488337,2.035537,1.419426,-1.607213,0.259882,-0.134652,0.843148,-0.371682,0.366085,0.643533,-0.488337,-0.056624,2.008644,2.038969,-0.531546,0.144128,-0.520944,0.212668,-0.305255,0.717429,-1.034122,-0.580772,0.93995,0.557853,-0.647653,0.198733,-2.062576,1.444731,1.253923,2.542888,-0.708314,0.036221,0.793591,1.946372,-2.546074,-0.634663,-0.064664,-0.665854,-2.062576,1.067703,-0.708314,-0.635177,0.780903,1.834379,0.130982,0.201459,1.38908,-1.434027,2.039174,0.548299,0.823022,-0.917457,-0.842968,-0.282526,-0.046349,0.75222,0.613345,1.027975,0.613345,-0.579478,0.212668,-0.256407,-1.226382,0.07745,-0.59931,0.146891,0.536241,0.042683,-1.763617,-1.346999,-0.665854,2.085832,-0.072773,-0.981914,1.004838,0.604023,1.419426,-0.273532,-1.254501,0.774298,1.429934,1.136642,0.10974,-1.346999,1.004838,1.025109,-2.643433,-0.902023,0.842518,0.557853,0.072587,0.598322,-0.960255,0.898148,1.175029,-1.324281,0.278,0.821001,0.791583,-1.763617,-0.226341,0.278,0.717429,-0.842968,0.198733,-0.046349,-1.226382,-0.064664,-0.986646,1.136642,-0.722095,-0.646503,1.067703,-1.931625,0.261596,0.959378,0.548195,0.796744,-0.45008,-0.012428,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.065169,-0.58312,-1.101321,-1.105959,-0.435965,0.727085,-0.446196,-1.221691,-0.370583,1.366207,0.188622,-0.133686,0.44615,-0.062519,-0.149203,0.188281,-0.063061,-1.219623,-0.076806,2.019204,0.249086,-0.846156,-0.577369,-0.51629,0.259882,-0.627974,0.403718,-0.435241,0.897157,0.643533,0.249086,-0.547203,-1.302648,-0.847326,-1.189861,-2.59956,-0.520944,-1.576156,-0.305255,0.804848,-1.034122,2.685306,-0.149654,1.966958,0.264832,-1.25084,-0.987444,-1.575091,-0.425117,-1.261059,-0.781587,-0.062519,0.740141,0.765875,-0.211246,-0.150928,0.630232,-0.01715,-0.987444,-0.348051,-0.781587,-0.635177,0.220645,1.022919,-0.528986,-0.813536,-1.522772,2.120343,0.332128,0.145982,-0.359218,-0.671444,-0.383819,-0.341453,-0.223384,-1.238674,-0.41826,0.13481,-0.41826,-0.579478,-1.576156,-1.537012,-1.364101,-0.636506,-0.59931,0.75843,-0.101351,-0.065191,-0.224309,-1.012074,-0.01715,-0.985331,0.323151,0.714794,0.303956,0.604023,-0.577369,-0.434107,-0.0369,0.941395,-1.403035,-1.106446,0.926247,-1.012074,0.303956,-0.892922,-0.040085,1.072851,0.403438,1.966958,1.640211,-0.77504,-0.203463,1.211307,0.138178,0.996478,-0.455436,-0.358327,0.738265,-0.224309,0.20201,-0.455436,0.804848,-0.383819,-1.25084,-0.223384,-1.364101,0.630232,0.076355,-1.106446,2.019204,0.263968,-0.348051,-0.004281,0.261596,1.008055,0.145945,-0.178163,0.957182,-0.012428,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.033614,-0.58312,2.008214,1.473937,-1.294541,-1.061123,-0.058814,1.600597,1.556501,-0.382715,-0.653107,-0.61584,-2.165779,1.277859,3.565019,-0.652561,-0.907291,1.59733,0.0439,0.806569,-0.175187,-0.237384,0.504114,-0.3182,0.259882,-1.467981,-0.697375,-1.291857,0.366085,0.643533,-0.175187,-2.344898,1.992227,-0.237581,-0.531546,1.561101,0.324888,0.486277,-0.305255,-0.979294,0.290944,-0.580772,3.572809,-0.195599,2.743529,0.308917,0.998429,1.472889,0.787516,2.814059,0.169869,1.277859,-0.741102,0.724119,1.18965,-0.634663,0.522799,-0.113369,0.998429,-1.297051,0.169869,2.994388,-0.479677,-0.924586,-0.528986,0.534521,0.534711,0.4005,0.332128,0.107898,-0.346665,-0.470543,1.194427,0.207023,-0.092209,0.747637,-0.384053,0.562697,-0.384053,-0.579478,0.486277,0.316674,2.095691,0.004084,-0.925853,-1.220343,-0.144329,-1.032135,-0.120642,0.481876,-0.113369,0.511383,-0.072773,-2.113053,2.063921,0.110264,0.504114,0.197447,-0.547282,0.958533,-0.436916,-1.111843,-1.59947,0.481876,2.063921,1.188798,-0.040085,0.322168,-0.696779,-0.195599,0.976985,0.646404,-0.181107,0.271831,-1.68717,-0.693255,-0.063874,-0.345805,-0.739346,-0.120642,-0.323355,-0.063874,-0.979294,1.194427,0.308917,-0.092209,2.095691,0.522799,1.561832,-1.111843,0.806569,2.737191,-1.297051,0.573923,1.209096,-1.326276,0.107868,-1.255242,1.196645,1.956801,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.053663,1.224024,-0.822626,0.455373,0.187317,0.14205,-1.084681,0.454397,-0.76569,-0.29674,0.226821,-0.454854,0.44615,-1.487311,0.057717,0.226441,-0.054797,0.453297,-1.625378,-0.409617,-0.015742,0.382928,0.141088,-0.389492,-2.166109,-1.299034,1.286472,0.186619,0.366085,-0.899562,-0.015742,-0.191281,-0.681421,0.383721,-0.531546,0.123114,-0.520944,-1.542543,-0.305255,0.008828,-0.907925,-0.580772,0.057725,-0.677275,-0.010538,-0.450455,-0.688827,-0.366446,1.341751,-0.330358,-0.155355,-1.487311,0.656737,-2.212636,-0.211246,-0.634663,-0.838622,2.700989,-0.688827,0.602368,-0.155355,-0.635177,-0.059484,0.211459,0.79095,-0.613437,0.532802,1.229285,0.332128,1.953594,-0.582846,-0.233469,1.196013,0.129293,0.492053,0.228239,0.680399,1.410524,0.680399,-0.579478,-1.542543,0.519336,1.013693,-0.696397,1.359948,0.755131,-0.943353,1.051907,-1.103734,-0.641936,2.700989,0.619963,0.323151,-0.981914,-0.01067,0.110264,0.141088,-0.864336,0.027706,0.643945,0.394783,-1.44737,0.292117,-0.641936,-0.01067,1.049162,-0.040085,-1.063709,1.285489,-0.677275,0.976985,-0.856793,-0.027318,-0.792908,0.395927,0.58903,1.291882,-0.581404,0.655065,-1.103734,-1.464136,1.291882,0.008828,1.196013,-0.450455,0.492053,1.013693,-0.838622,-0.904877,-1.44737,-0.409617,-0.010794,0.602368,-0.389749,1.209096,1.613551,1.953255,-1.375025,-0.571495,2.23812,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-1.879136,0.139738,-0.269457,1.069359,-1.285929,-1.126823,-0.517606,-1.407561,0.208281,-0.0494,1.099299,-1.147677,0.44615,1.240863,1.399248,1.098,-0.123309,-1.405143,1.2026,-0.8951,-1.07563,0.586522,1.194513,3.014972,0.259882,-0.934336,1.384726,-1.283265,-0.430522,-0.899562,-1.07563,1.115236,2.10054,0.58764,-0.531546,2.221295,-0.520944,1.144574,-0.963791,-0.052966,-1.034122,-0.580772,1.402233,-0.227146,-0.453953,0.301874,1.334477,1.656496,-2.81901,1.948528,2.1084,1.240863,0.382142,-0.025572,-0.211246,-0.634663,-0.705901,-0.677334,1.334477,0.691622,2.1084,-0.635177,-0.003458,1.022919,0.130982,-0.276758,1.03749,0.448718,0.332128,0.244567,-0.835137,-1.732239,0.602342,0.556271,-1.17965,-1.253498,1.111167,1.150086,1.111167,0.36159,1.144574,0.67225,-0.081719,0.378582,-0.925853,1.258981,1.008149,-1.876346,1.040337,-0.174109,-0.677334,0.229202,0.323151,0.714794,-1.096966,0.110264,1.194513,0.247575,-0.230833,-0.288231,1.694718,0.915651,0.848407,-0.174109,-1.096966,1.144301,-0.040085,0.934264,1.383665,-0.227146,1.700504,0.626884,1.634788,-0.229223,-1.156728,-1.168673,0.817502,-0.833073,0.381143,1.040337,0.183306,0.817502,-0.052966,0.602342,0.301874,-1.17965,-0.081719,-0.705901,2.106961,0.915651,-0.8951,-0.45323,0.691622,-1.931625,-2.186115,-0.407522,0.244514,1.322522,-0.337859,-0.012428,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


## Build Models

### Imports

This section requires XGBoost. If it is not installed yet, install it before running the import cell below.
I used the following command to install XGBoost:
```bash
conda install xgboost
```

In [96]:
# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# pickle
import pickle

# metrics
from sklearn.metrics import log_loss, confusion_matrix, classification_report


In [97]:
# Registry of models keyed by display name; the key is also used as the pickle file name.
classifiers = {}

In [98]:
def setup_model(name, classifier, x_train, y_train):
    """Return a ready-to-use model called `name`, either trained now or unpickled.

    When the module-level `should_build_model` flag is set, `classifier` is fitted
    on `(x_train, y_train)`, the elapsed time is printed and, if `should_save_model`
    is also set, the fitted object is pickled to `ROOT + 'output/<name>.pkl'`.
    Otherwise the object stored in that file is loaded and returned, and the
    `classifier` argument is ignored.

    Args:
        name: Model name; also the stem of the pickle file.
        classifier: Object exposing `fit(x_train, y_train)`.
        x_train: Training features passed to `fit`.
        y_train: Training targets passed to `fit`.

    Returns:
        The fitted `classifier`, or the object loaded from the pickle file.
    """
    if not should_build_model:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(ROOT+f"output/{name}.pkl", 'rb') as handle:
            restored = pickle.load(handle)
            print(f"Loaded model from: `{ROOT}output/{name}.pkl`")
        return restored

    print(f"Training {name}...")
    started_at = pd.Timestamp.now()
    classifier.fit(x_train, y_train)
    elapsed = pd.Timestamp.now() - started_at
    print(f"Training {name} took: {elapsed}")

    if should_save_model:
        with open(ROOT+f"output/{name}.pkl", 'wb') as handle:
            pickle.dump(classifier, handle)
            print(f"Saved model to: {ROOT}output/{name}.pkl")

    return classifier

In [99]:
def evaluate_model(name, classifier, x_test, y_test):
    """Print and return the log loss of `classifier` on the given evaluation data.

    NOTE(review): `evaluate_model` is defined again in the "Evaluate Models"
    section of this notebook; that later definition replaces this one.

    Args:
        name: Label used in the printed message.
        classifier: Object exposing `predict_proba(x_test)`.
        x_test: Features passed to `predict_proba`.
        y_test: True labels passed to `log_loss`.

    Returns:
        The value returned by `log_loss(y_test, predicted_probabilities)`.
    """
    y_pred_prob = classifier.predict_proba(x_test)
    log_loss_score = log_loss(y_test, y_pred_prob)
    print(f"Log Loss for {name}: {log_loss_score}")
    # Returning the score (instead of None) lets callers compare models.
    return log_loss_score

In [100]:
# Register the untrained logistic-regression estimator, then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "Logistic_Regression"
classifiers[classifier_name] = LogisticRegression(random_state=SEED, max_iter=1000, n_jobs=-1)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/Logistic_Regression.pkl`


In [101]:
# Register the untrained depth-limited random forest, then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "Random_Forest_(max_depth=10)"
classifiers[classifier_name] = RandomForestClassifier(random_state=SEED, n_jobs=-1, max_depth=10)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/Random_Forest_(max_depth=10).pkl`


In [102]:
# Disabled: training took too long (14+ minutes). Kept commented out for reference.

# classifier_name = "Gradient Boosting"
# classifiers.update({
#     classifier_name : GradientBoostingClassifier(random_state=SEED),
# })

# classifiers[classifier_name] = setup_model(classifier_name, classifiers[classifier_name], x_train, y_train)

In [103]:
# Disabled: training took too long (18+ minutes). Kept commented out for reference.

# classifier_name = "Support Vector Machine"
# classifiers.update({
#     classifier_name : SVC(probability=True, random_state=SEED),
# })

# classifiers[classifier_name] = setup_model(classifier_name, classifiers[classifier_name], x_train, y_train)

In [104]:
# Register the untrained multi-layer perceptron, then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "Neural_Network"
classifiers[classifier_name] = MLPClassifier(random_state=SEED)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/Neural_Network.pkl`


In [105]:
# Register the untrained Gaussian naive Bayes estimator, then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "Naive_Bayes"
classifiers[classifier_name] = GaussianNB()
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/Naive_Bayes.pkl`


In [106]:
# Register the untrained k-NN estimator (k=3), then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "K-Nearest_Neighbors_(3)"
classifiers[classifier_name] = KNeighborsClassifier(n_jobs=-1, n_neighbors=3)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/K-Nearest_Neighbors_(3).pkl`


In [107]:
# Register the untrained k-NN estimator (k=5), then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "K-Nearest_Neighbors_(5)"
classifiers[classifier_name] = KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/K-Nearest_Neighbors_(5).pkl`


In [108]:
# Register the untrained k-NN estimator (k=7), then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "K-Nearest_Neighbors_(7)"
classifiers[classifier_name] = KNeighborsClassifier(n_jobs=-1, n_neighbors=7)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/K-Nearest_Neighbors_(7).pkl`


In [109]:
# Register the untrained XGBoost estimator, then replace it with
# the trained (or unpickled) model that setup_model returns.
classifier_name = "XGBoost"
classifiers[classifier_name] = XGBClassifier(random_state=SEED)
classifiers[classifier_name] = setup_model(
    classifier_name,
    classifiers[classifier_name],
    x_train,
    y_train,
)

Loaded model from: `./output/XGBoost.pkl`


## Evaluate Models

In [110]:
def evaluate_model(name, classifier, x_test, y_test):
    """Score `classifier` on the evaluation data, print the log loss and return it.

    Args:
        name: Label used in the printed message.
        classifier: Object exposing `predict_proba(x_test)`.
        x_test: Features passed to `predict_proba`.
        y_test: True labels passed to `log_loss`.

    Returns:
        The value returned by `log_loss` for the predicted probabilities.
    """
    loss = log_loss(y_test, classifier.predict_proba(x_test))
    print(f"Log Loss for {name}: {loss}")
    return loss

In [111]:
# Keep the name and score of the classifier with the lowest log loss seen so far;
# on a tie the classifier registered first is kept.
best_performing_model = None
best_performing_model_score = np.inf

for classifier_name, classifier in classifiers.items():
    score = evaluate_model(classifier_name, classifier, x_test, y_test)
    if score < best_performing_model_score:
        best_performing_model = classifier_name
        best_performing_model_score = score

Log Loss for Logistic_Regression: 1.4304415042849623
Log Loss for Random_Forest_(max_depth=10): 1.3827452278686074
Log Loss for Neural_Network: 1.2325728655506123


Log Loss for Naive_Bayes: 3.3207248885613687
Log Loss for K-Nearest_Neighbors_(3): 15.938307833213829
Log Loss for K-Nearest_Neighbors_(5): 10.126555002534053
Log Loss for K-Nearest_Neighbors_(7): 6.848397781180832
Log Loss for XGBoost: 1.2005250587967873


In [112]:
# Report the winner chosen by the selection loop (the lowest log loss on the test split).
print(f"Best performing model: {best_performing_model} with a log loss of {best_performing_model_score}")

Best performing model: XGBoost with a log loss of 1.2005250587967873


## Generate Submission File

In [113]:
# Column labels for the submission file: the distinct target values in sorted order.
# NOTE(review): the columns of predict_proba follow the classifier's own class order;
# this presumably lines up because the label encoder also sorts its classes — confirm.
target_labels = sorted(df[target].unique())
target_labels

['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5']

In [114]:
def submission(y_pred_prob, name=None):
    """Write class-probability predictions to `ROOT + 'submissions/<name>.csv'`.

    Args:
        y_pred_prob: 2-D array-like of probabilities with one column per entry
            of the module-level `target_labels`, in that order.
        name: File name without the `.csv` extension. When omitted, a name of the
            form `submission-<UTC timestamp>` is generated at call time.
    """
    if name is None:
        # Build the timestamp per call. A default written in the signature is
        # evaluated only once, when the function is defined, so every
        # default-named call would otherwise reuse (and overwrite) one file.
        name = f"submission-{pd.to_datetime('now', utc=True).strftime('%Y%m%d%H%M%S')}"
    df_sub = pd.DataFrame(y_pred_prob, columns=target_labels)
    # The row index is written as the `id` column of the CSV.
    df_sub.index.name = 'id'
    df_sub.to_csv(ROOT+f'submissions/{name}.csv', index=True)
    print(f"Saved ({df_sub.shape[0]} rows) to: {ROOT}submissions/{name}.csv")

In [115]:
# Swap the winning model's name for the model object itself. The lookup is done
# only while the variable still holds the registry key, so re-running this cell
# no longer fails by using an estimator object as a dictionary key.
if isinstance(best_performing_model, str):
    best_performing_model = classifiers[best_performing_model]
# Predict class probabilities for the scoring set and write the submission file.
y_score = best_performing_model.predict_proba(x_score)
submission(y_score)

Saved (90000 rows) to: ./submissions/submission-20240323213943.csv


# Notes