# 04 - Model Building

In [3]:
# Notebook-wide switches read by the data-splitting and model-setup cells below.
train_dataset_size = 0.4 # Fraction (0-1, not a percentage) of the data used for training; kept small to speed up the search for the best model
should_build_model = False # If True, each model is trained; if False, it is loaded from a previously saved file instead.
should_save_model = True # If True, a freshly trained model is saved to a file (only consulted when should_build_model is True).

## Setup

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml
import warnings

# Do not truncate columns, and allow wide lines, when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

# Plot theme used by seaborn.
sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

# NOTE(review): DEBUG is not read anywhere in the visible part of this notebook.
DEBUG = True
# Passed as random_state to the train/test split and to the stochastic models.
SEED = 666

ModuleNotFoundError: No module named 'seaborn'

In [None]:
DATASET = "df_processed.pkl"
SCORE_DATASET = "df_score_processed.pkl"

import os, sys
# True when the google.colab module has already been loaded into this interpreter.
COLAB = 'google.colab' in sys.modules
# Base directory that every path in this notebook is built from.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive once; skip when the mount point already exists.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  # On Colab everything lives in a per-dataset folder on Drive. The folder path is
  # absolute, so it must not be prefixed with the relative ROOT ("./"): doing so
  # checked/created a stray "./content/gdrive/..." tree in the working directory
  # instead of the folder on Drive.
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # makedirs creates the missing parents (including "datasets") as well, and
  # exist_ok=True makes re-running this cell harmless.
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory `d` underneath ROOT unless it already exists.

  Parameters
  ----------
  d : str
      Directory name relative to ROOT (ROOT is expected to end with "/").
  """
  # The Colab and local branches were equivalent: os.makedirs already defaults to
  # mode=0o777. exist_ok=True replaces the separate isdir() pre-check, which could
  # race with another process creating the directory in between.
  os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

# Working folders used by this project.
for d in ['orig','data','output']: makedirs(d)

## Import Data & Features

In [None]:
# Load the processed training data and the processed scoring data from ROOT/data.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted earlier step of this project.
df = pd.read_pickle(f"{ROOT}data/{DATASET}")
df_score = pd.read_pickle(f"{ROOT}data/{SCORE_DATASET}")

dfs = [df, df_score]

print(df.shape)
df.head()

NameError: name 'pd' is not defined

In [None]:
# Read the feature specification (target name and feature lists) from features.yaml.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if this
# file only holds strings and lists, yaml.safe_load would be the safer choice.
with open(f"{ROOT}data/features.yaml") as file:
    yml_obj = yaml.load(file, Loader=yaml.FullLoader)

target = yml_obj["target"]
features = yml_obj["features"]
numerical_features = yml_obj["numerical_features"]
categorical_features = yml_obj["categorical_features"]

# Distinct target values, in sorted order.
target_labels = sorted(df[target].unique())

# One line per item; the last line reports whether the numerical and categorical
# lists together have as many entries as `features`.
print(
    f"Target: {target}",
    f"Features: {features}",
    f"Numerical Features: {numerical_features}",
    f"Categorical Features: {categorical_features}",
    f"Target Labels: {target_labels}",
    f"Passed the sanity check: {len(numerical_features) + len(categorical_features) == len(features)}",
    sep="\n",
)

## Preparation

### Imports

In [None]:
# Imports

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

### Methods

In [None]:
def submission(y_pred_prob, name=None):
    """Write predicted class probabilities to a CSV file under `output/`.

    Parameters
    ----------
    y_pred_prob : array-like
        Predicted probabilities with one column per entry of `target_labels`,
        which are used as the CSV column headers.
    name : str, optional
        File name without the `.csv` extension. Defaults to
        `submission-<UTC timestamp>`, computed at call time.
    """
    # Build the default name here rather than in the signature: a default
    # argument is evaluated once, when the function is defined, so every call
    # would otherwise reuse the same timestamp and overwrite the same file.
    if name is None:
        name = f"submission-{pd.to_datetime('now', utc=True).strftime('%Y%m%d%H%M%S')}"

    df_sub = pd.DataFrame(y_pred_prob, columns=target_labels)
    # The row index is written as the first CSV column, named 'id'.
    df_sub.index.name = 'id'
    df_sub.to_csv(ROOT+f'output/{name}.csv', index=True)
    print(f"Saved ({df_sub.shape[0]} rows) to: {ROOT}output/{name}.csv")

### Data Splitting

Data is being split into training and testing sets. The training set will be used to train the model, while the testing set will be used to evaluate the model.

The split itself produces `df_train` and `df_test`; the preparation steps below derive the following from them:
- `x_train`: Features of the training set
- `x_test`: Features of the testing set
- `y_train`: Target of the training set
- `y_test`: Target of the testing set

Stratification is used to ensure that the target distribution is the same in both the training and testing sets (the EDA notebook showed a small imbalance in the target distribution).

In [None]:
# Stratified split on the target: the share of rows given by train_dataset_size
# goes to training, the remainder to testing.
df_train, df_test = train_test_split(
    df,
    test_size=1-train_dataset_size,
    random_state=SEED,
    stratify=df[target],
)

# The split shuffles the rows; order each part by its index again.
df_train = df_train.sort_index()
df_test = df_test.sort_index()

print(df_train.shape, df_test.shape, df_score.shape)

### Scaling

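The numerical features are scaled using the `StandardScaler` from `sklearn.preprocessing`. The scaler is fitted on the training set only and then applied to the training, testing and scoring sets.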

In [None]:
# Fit the scaler on the training rows only, then apply that same scaling to the
# training, testing and scoring sets.
ss = StandardScaler().fit(df_train[numerical_features])

x_train_num, x_test_num, x_score_num = (
    ss.transform(frame[numerical_features])
    for frame in (df_train, df_test, df_score)
)

### Encoding

- The target feature is encoded using the `LabelEncoder` from `sklearn.preprocessing`
- The categorical features are encoded using the `OneHotEncoder` from `sklearn.preprocessing`

In [None]:
# Learn the label -> integer mapping from the target column of the full dataset,
# then encode the target of the training and testing parts with it.
le = LabelEncoder().fit(df[target])

y_train, y_test = (le.transform(part[target]) for part in (df_train, df_test))

In [None]:
# The encoder only learns the categories present in the training split, so the
# testing and scoring sets may contain categories it has never seen. With the
# default handle_unknown='error' that makes transform() raise; 'ignore' encodes
# such a value as all zeros for that feature and leaves the output columns
# unchanged.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(df_train[categorical_features])

x_train_cat = ohe.transform(df_train[categorical_features])
x_test_cat = ohe.transform(df_test[categorical_features])
x_score_cat = ohe.transform(df_score[categorical_features])

### Merge Encoded and Scaled Features

In [None]:
# Column order of the final matrices: scaled numerical features first, then the
# one-hot columns named by the encoder.
ohe_feature_names = ohe.get_feature_names_out(categorical_features)
column_names = numerical_features + list(ohe_feature_names)

def _merge_features(num_part, cat_part):
    """Join the scaled numerical block and the densified one-hot block column-wise."""
    merged = np.concatenate([num_part, cat_part.toarray()], axis=1)
    return pd.DataFrame(merged, columns=column_names)

x_train = _merge_features(x_train_num, x_train_cat)
x_test = _merge_features(x_test_num, x_test_cat)
x_score = _merge_features(x_score_num, x_score_cat)

In [None]:
# Quick check: all three matrices are built with the same column list, so only
# their row counts should differ.
print(x_train.shape, x_test.shape, x_score.shape)

x_train.head()

## Build Models

### Imports

XGBoost must be installed to run this section.
I used the following command to install it:
```bash
conda install xgboost
```

In [None]:
# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# pickle
import pickle

# metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


In [None]:
# Registry of models keyed by name; filled by the cells below and iterated over
# in the "Evaluate Models" section.
classifiers = {}

In [None]:
def setup_model(name, classifier, x_train, y_train):
    """Return a fitted classifier, either trained now or loaded from disk.

    The behavior is driven by two notebook-level switches:

    - `should_build_model` is True: `classifier` is fitted on the training data
      and, when `should_save_model` is also True, pickled to `output/<name>.pkl`.
    - `should_build_model` is False: the passed `classifier` is ignored and the
      model unpickled from `output/<name>.pkl` is returned instead.

    Parameters
    ----------
    name : str
        Model name; used in the log messages and as the pickle file name.
    classifier : estimator
        Estimator exposing `fit(x, y)`.
    x_train, y_train
        Training features and target passed to `fit`.

    Returns
    -------
    The fitted (or loaded) classifier.

    Raises
    ------
    FileNotFoundError
        If loading is requested but no saved model exists for `name`.
    """
    model_path = ROOT+f"output/{name}.pkl"

    if should_build_model:
        print(f"Training {name}...")
        start_time = pd.Timestamp.now()
        classifier.fit(x_train, y_train)
        end_time = pd.Timestamp.now()
        print(f"Training {name} took: {end_time - start_time}")

        if should_save_model:
            with open(model_path, 'wb') as file:
                pickle.dump(classifier, file)
            print(f"Saved model to: {model_path}")

        return classifier

    # Loading path: turn the bare "No such file" error into an actionable one,
    # keeping the same exception type.
    try:
        file = open(model_path, 'rb')
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"No saved model found at `{model_path}`. Set should_build_model = True "
            f"(with should_save_model = True) to train and save it first."
        ) from err

    # NOTE(review): unpickling can execute arbitrary code -- only load model
    # files that were written by this notebook.
    with file:
        classifier = pickle.load(file)
    print(f"Loaded model from: `{model_path}`")

    return classifier

In [None]:
def evaluate_model(name, classifier, x_test, y_test):
    """Print the test-set metrics of a fitted classifier and return them.

    Accuracy, weighted precision / recall / F1 and ROC AUC are computed from
    `classifier.predict` and `classifier.predict_proba` on `x_test`.

    Parameters
    ----------
    name : str
        Model name shown in the printed header.
    classifier : estimator
        Fitted estimator exposing `predict` and `predict_proba`.
    x_test, y_test
        Test features and the matching encoded target.

    Returns
    -------
    dict
        Metric name -> value, with the keys "Accuracy", "Precision", "Recall",
        "F1" and "ROC AUC" (the same labels that are printed).
    """
    y_pred = classifier.predict(x_test)
    y_pred_prob = classifier.predict_proba(x_test)

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1": f1_score(y_test, y_pred, average='weighted'),
    }

    # For a two-class target roc_auc_score expects a 1-D score (the probability
    # of the class with the greater label); the full probability matrix is only
    # accepted in the multi-class case.
    if y_pred_prob.shape[1] == 2:
        metrics["ROC AUC"] = roc_auc_score(y_test, y_pred_prob[:, 1])
    else:
        metrics["ROC AUC"] = roc_auc_score(y_test, y_pred_prob, multi_class='ovr', average='weighted')

    print(f"Evaluation for {name}:")
    for label, value in metrics.items():
        print(f"{label}: {value}")

    return metrics

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "Logistic_Regression"
classifiers[classifier_name] = setup_model(
    classifier_name,
    LogisticRegression(random_state=SEED, max_iter=1000, n_jobs=-1),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "Random_Forest_(max_depth=10)"
classifiers[classifier_name] = setup_model(
    classifier_name,
    RandomForestClassifier(random_state=SEED, n_jobs=-1, max_depth=10),
    x_train,
    y_train,
)

In [None]:
## TOOK TOO LONG TO TRAIN 14+ minutes

# classifier_name = "Gradient Boosting"
# classifiers.update({
#     classifier_name : GradientBoostingClassifier(random_state=SEED),
# })

# classifiers[classifier_name] = setup_model(classifier_name, classifiers[classifier_name], x_train, y_train)

In [None]:
## TOOK TOO LONG TO TRAIN 18+ minutes

# classifier_name = "Support Vector Machine"
# classifiers.update({
#     classifier_name : SVC(probability=True, random_state=SEED),
# })

# classifiers[classifier_name] = setup_model(classifier_name, classifiers[classifier_name], x_train, y_train)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "Neural_Network"
classifiers[classifier_name] = setup_model(
    classifier_name,
    MLPClassifier(random_state=SEED),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "Naive_Bayes"
classifiers[classifier_name] = setup_model(
    classifier_name,
    GaussianNB(),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "K-Nearest_Neighbors_(3)"
classifiers[classifier_name] = setup_model(
    classifier_name,
    KNeighborsClassifier(n_jobs=-1, n_neighbors=3),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "K-Nearest_Neighbors_(5)"
classifiers[classifier_name] = setup_model(
    classifier_name,
    KNeighborsClassifier(n_jobs=-1, n_neighbors=5),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "K-Nearest_Neighbors_(7)"
classifiers[classifier_name] = setup_model(
    classifier_name,
    KNeighborsClassifier(n_jobs=-1, n_neighbors=7),
    x_train,
    y_train,
)

In [None]:
# Register the model only after setup_model succeeded, so a failed load/fit
# cannot leave an unfitted estimator behind in `classifiers`.
classifier_name = "XGBoost"
classifiers[classifier_name] = setup_model(
    classifier_name,
    XGBClassifier(random_state=SEED),
    x_train,
    y_train,
)

## Evaluate Models

In [None]:
# Print the test-set metrics of every registered model, with a blank line
# between models.
for classifier_name, classifier in classifiers.items():
    evaluate_model(classifier_name, classifier, x_test, y_test)
    print()

## Generate Submission File

# Notes