In [1]:
import useful_functions
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import resample
import PandasSimpleImputer
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier

## Classifying the ICL dataset using feature generation and selection
The feature generation and selection for this were done in `feature-generation-and-selection-ICL-dataset.ipynb`. In this notebook I use this new dataset for training and testing the classifiers to see whether the feature generation and selection has enabled the classifiers to perform better. I do the following in this notebook:

1. Split the dataset into train and test
2. Convert the outputs in $[0, 100]$ to outputs in $\{0, 1\}$ (to make classification rather than regression possible)
3. Use the `median` strategy for data imputation
4. Scale the data using `StandardScaler` from sklearn
5. Initialize and then train all classifiers using optimised parameters previously found
6. Print the confusion matrix for each
7. Create, train, and test the voting classifier

In [2]:
X_train = pd.read_csv('generated-data/X-train-new2.csv')
X_test = pd.read_csv('generated-data/X-test-new2.csv')
y_train = pd.read_csv('generated-data/y-train.csv').drop(['Unnamed: 0'], axis=1)
y_test = pd.read_csv('generated-data/y-test.csv').drop(['Unnamed: 0'], axis=1)

In [3]:
X_train = useful_functions.clean_dataset(X_train, delete_missing_data=False).drop(['index', 'Unnamed: 0'], axis=1)
X_test = useful_functions.clean_dataset(X_test, delete_missing_data=False).drop(['index', 'Unnamed: 0'], axis=1)

In [4]:
def convert_y_to_class(y):
    class_case = [0 if i < 2.0 else 1 for i in y]
    return class_case

In [5]:
y_train_class = convert_y_to_class(y_train.to_numpy())
y_test_class = convert_y_to_class(y_test.to_numpy())

In [6]:
imp = PandasSimpleImputer.PandasSimpleImputer(strategy='median', missing_values=np.nan)
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
def train_and_run(clf):
    clf.fit(X_train, y_train_class)
    pred_clf = clf.predict(X_test)
    print(confusion_matrix(y_test_class, pred_clf))
    print(classification_report(y_test_class, pred_clf))

In [8]:
RF = RandomForestClassifier(n_estimators=100)
GBC = GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, 
                                 subsample=1, max_features='sqrt', random_state=10, learning_rate=0.15,
                                 n_estimators=500)
Ada = AdaBoostClassifier(learning_rate=0.5, n_estimators=500)
XGBC = XGBClassifier(max_depth=7, min_child_weight=1, gamma=0.1, colsample_bytree=0.8, 
                                 subsample=0.6, reg_alpha=0, n_estimators=5000, learning_rate=0.01)
LGB = LGBMClassifier(subsample_freq=20, n_estimators=400, num_leaves=100, max_depth=20,
                                colsample_bytree=0.7, min_split_gain=0.3, reg_alpha=1.3, reg_lambda=1.3,
                                subsample=0.8)
MLP = MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes=(100,), learning_rate='adaptive',
                                solver='adam')

In [9]:
train_and_run(RF)

[[19084   270]
 [  329  1412]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     19354
           1       0.84      0.81      0.83      1741

    accuracy                           0.97     21095
   macro avg       0.91      0.90      0.90     21095
weighted avg       0.97      0.97      0.97     21095



In [10]:
train_and_run(GBC)

[[19073   281]
 [  221  1520]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19354
           1       0.84      0.87      0.86      1741

    accuracy                           0.98     21095
   macro avg       0.92      0.93      0.92     21095
weighted avg       0.98      0.98      0.98     21095



In [11]:
train_and_run(Ada)

[[19030   324]
 [  355  1386]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     19354
           1       0.81      0.80      0.80      1741

    accuracy                           0.97     21095
   macro avg       0.90      0.89      0.89     21095
weighted avg       0.97      0.97      0.97     21095



In [12]:
train_and_run(XGBC)





[[19084   270]
 [  198  1543]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19354
           1       0.85      0.89      0.87      1741

    accuracy                           0.98     21095
   macro avg       0.92      0.94      0.93     21095
weighted avg       0.98      0.98      0.98     21095



In [13]:
train_and_run(LGB)

[[19073   281]
 [  201  1540]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19354
           1       0.85      0.88      0.86      1741

    accuracy                           0.98     21095
   macro avg       0.92      0.94      0.93     21095
weighted avg       0.98      0.98      0.98     21095



In [14]:
train_and_run(MLP)

[[19031   323]
 [  219  1522]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     19354
           1       0.82      0.87      0.85      1741

    accuracy                           0.97     21095
   macro avg       0.91      0.93      0.92     21095
weighted avg       0.98      0.97      0.97     21095



In [24]:
def calc_votes(ys):
    outcomes = []
    for i in range(len(ys[0])):
        votes_positive = sum([y[i] for y in ys])
        results = 1 if votes_positive > len(ys) / 2 else 0
        outcomes.append(results)
    return outcomes
    
def get_voting_prediction(X):
    RF_pred = RF.predict(X)
    GBC_pred = GBC.predict(X)
    Ada_pred = Ada.predict(X)
    XGBC_pred = XGBC.predict(X)
    LGB_pred = LGB.predict(X)
    MLP_pred = MLP.predict(X)
    return calc_votes([RF_pred, GBC_pred, Ada_pred, XGBC_pred, LGB_pred, MLP_pred])

In [27]:
pred_voting = get_voting_prediction(X_test)
print(confusion_matrix(y_test_class, pred_voting))
print(classification_report(y_test_class, pred_voting))

[[19095   259]
 [  244  1497]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19354
           1       0.85      0.86      0.86      1741

    accuracy                           0.98     21095
   macro avg       0.92      0.92      0.92     21095
weighted avg       0.98      0.98      0.98     21095

