# 2. Model Building

We have a dataset ready to be worked with. Using the pickle we created, we will now try with several different predicting models to see which ones gives us the best results.

In [63]:
import pandas as pd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import re
import math
import os
import pickle
import pprint
from IPython.display import display, Markdown, Latex
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import *
import warnings
from sklearn import preprocessing
from sklearn import utils


#models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# ensemble models
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [107]:
with open('data/train_test.pickle', 'rb') as file:
    dataset_list = pickle.load(file)
    
X_train = dataset_list[0]
X_test = dataset_list[1]
y_train = dataset_list[2]
y_test = dataset_list[3]

The models we will be trying are:
- Logistic Regression
- K-Neighbors Classifier
- Decision Tree Classifier
- XGBoost
- Random Forest Classifier

In [106]:
num_folds = 10
seed = 1
scoring = 'roc_auc'
models = []
# basic models
models.append(('Logistic Regression', LogisticRegression()))
models.append(('K-Neighbors Classifier', KNeighborsClassifier()))
models.append(('Decision Tree Classifier', DecisionTreeClassifier()))

# ensemble models
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))
models.append(('Random Forest', RandomForestClassifier()))

# KFolds for model selection:
table_results = []
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    for name, model in tqdm(models):
        kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        table_results.append([name,cv_results.mean(),cv_results.std()])

pd.DataFrame(table_results, columns=['Model', 'Mean', 'Std'])

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:57<00:00, 11.60s/it]


Unnamed: 0,Model,Mean,Std
0,Logistic Regression,0.951391,0.003934
1,K-Neighbors Classifier,0.941496,0.00422
2,Decision Tree Classifier,0.864777,0.006017
3,XGBoost,0.959365,0.003124
4,Random Forest,0.956486,0.003671


We see that we get really good results for most of the models, except for the decision tree classifier, that only gives around 86% accuracy (the next best one is already at 94% accuracy). For this reason we will not be further using the Decision Tree Classifier.

In [108]:
- Feature importance 
- K best features

SyntaxError: invalid syntax (<ipython-input-108-0dcc0c0eb351>, line 1)

In [109]:
table_results

[['Logistic Regression', 0.9513914698762322, 0.003934021878711469],
 ['K-Neighbors Classifier', 0.941495727288556, 0.0042195239071342976],
 ['Decision Tree Classifier', 0.864777195526312, 0.006017226894459743],
 ['XGBoost', 0.9593654013971765, 0.00312391116321101],
 ['Random Forest', 0.956486081313707, 0.00367120916573642]]

In [111]:
from tabulate import tabulate
print(tabulate(table_results, headers=['Model', 'Mean', 'Std'], tablefmt='orgtbl'))

| Model                    |     Mean |        Std |
|--------------------------+----------+------------|
| Logistic Regression      | 0.951391 | 0.00393402 |
| K-Neighbors Classifier   | 0.941496 | 0.00421952 |
| Decision Tree Classifier | 0.864777 | 0.00601723 |
| XGBoost                  | 0.959365 | 0.00312391 |
| Random Forest            | 0.956486 | 0.00367121 |


| Model                    |     Mean |        Std |
|--------------------------|----------|------------|
| Logistic Regression      | 0.951391 | 0.00393402 |
| K-Neighbors Classifier   | 0.941496 | 0.00421952 |
| Decision Tree Classifier | 0.864777 | 0.00601723 |
| XGBoost                  | 0.959365 | 0.00312391 |
| Random Forest            | 0.956486 | 0.00367121 |