In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from sklearn.ensemble import VotingClassifier

import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the training features and the training labels from the featured_data CSV files.
# Both paths are relative, so they resolve against the notebook's working directory.
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [3]:
# Convert features and labels to NumPy arrays.
# np.asarray accepts both DataFrames and ndarrays, so this cell can be re-run safely;
# calling .to_numpy() a second time on the already-converted array raised AttributeError.
X_train_origin = np.asarray(X_train_origin)
# Flatten the label column into the 1-D (n_samples,) shape the classifiers are fitted on.
y_train_origin = np.asarray(y_train_origin).ravel()

In [4]:
# Hold out 25% of the rows as a validation set; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_origin,
    y_train_origin,
    test_size=0.25,
    random_state=1,
)

In [5]:
# Display the shapes of the full data and of each split as a single tuple.
(
    X_train_origin.shape,
    y_train_origin.shape,
    X_train.shape,
    X_val.shape,
    y_train.shape,
    y_val.shape,
)

((12096, 52), (12096,), (9072, 52), (3024, 52), (9072,), (3024,))

In [6]:
def modeling(model_name, **model_params):
    """Fit a classifier on the training split and print validation metrics.

    Parameters
    ----------
    model_name : callable
        Estimator class (or any zero-argument-compatible factory) that returns
        an unfitted classifier exposing ``fit`` and ``predict``, e.g. ``KNN``.
    **model_params
        Optional keyword arguments forwarded to ``model_name`` (for example
        ``random_state=1``). With none given, the estimator is built with its
        defaults, exactly as before.

    Returns
    -------
    None
        Results are printed: the confusion matrix, the per-class
        classification report and the overall accuracy.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace, so the split cell must have been executed first.
    """
    model = model_name(**model_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(confusion_matrix(y_val, y_pred))
    # zero_division=0 keeps the reported value (0.0) for classes that are never
    # predicted but avoids the UndefinedMetricWarning that the default emits.
    print(classification_report(y_val, y_pred, zero_division=0))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    
# Some algorithms (such as Random Forest and naive Bayes classifiers, or Logistic Regression
# fitted with a multinomial/softmax loss) are capable of handling multiple classes natively.
# Others (such as SGD or Support Vector Machine classifiers) are binary classifiers at heart;
# scikit-learn extends them to multiple classes with a one-vs-rest or one-vs-one scheme.

### Baseline

In [7]:
# Baseline: k-nearest neighbours classifier; modeling() instantiates it without arguments,
# so the library's default hyper-parameters are used.
modeling(KNN)
# Verdict: accept

Confusion_Matrix:
[[253 105   1   0  18   3  40]
 [103 227  14   0  56  22  11]
 [  0   2 281  46  20  90   1]
 [  0   0  12 404   0   9   0]
 [ 14  26   7   0 369   8   0]
 [  3   5  60  21  15 334   0]
 [ 31   6   0   0   1   1 405]]
              precision    recall  f1-score   support

           1       0.63      0.60      0.61       420
           2       0.61      0.52      0.56       433
           3       0.75      0.64      0.69       440
           4       0.86      0.95      0.90       425
           5       0.77      0.87      0.82       424
           6       0.72      0.76      0.74       438
           7       0.89      0.91      0.90       444

    accuracy                           0.75      3024
   macro avg       0.75      0.75      0.75      3024
weighted avg       0.75      0.75      0.75      3024

Accuracy:0.7516534391534392


In [8]:
# Multinomial (softmax) logistic regression: a single model is fitted jointly over all
# classes with a cross-entropy loss. It is NOT a one-vs-one ensemble, so no pairwise
# C(7,2) classifiers are trained and no voting takes place.
# NOTE(review): the multi_class argument is deprecated in recent scikit-learn releases
# (1.5+), where multinomial is the default for this solver -- confirm before upgrading.
lr = LR(multi_class='multinomial', solver='newton-cg')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
print("Confusion_Matrix:")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# multinomial scores higher than the one-vs-rest variant (lr2) on this validation split
# reject

Confusion_Matrix:
[[274  83   0   0  20   0  43]
 [108 224   5   0  76  15   5]
 [  0   4 253  46  14 123   0]
 [  0   0  36 368   0  21   0]
 [  6  60   4   0 331  23   0]
 [  0   4 108  31  18 277   0]
 [ 60   0   0   0   1   0 383]]
              precision    recall  f1-score   support

           1       0.61      0.65      0.63       420
           2       0.60      0.52      0.55       433
           3       0.62      0.57      0.60       440
           4       0.83      0.87      0.85       425
           5       0.72      0.78      0.75       424
           6       0.60      0.63      0.62       438
           7       0.89      0.86      0.88       444

    accuracy                           0.70      3024
   macro avg       0.70      0.70      0.70      3024
weighted avg       0.70      0.70      0.70      3024

Accuracy:0.6977513227513228


In [9]:
# One-vs-rest logistic regression: one binary (sigmoid) model per class, i.e. 7 models
# for the 7 classes; the class with the highest score is predicted.
lr2 = LR(solver='newton-cg', multi_class='ovr')
y_pred = lr2.fit(X_train, y_train).predict(X_val)
print("Confusion_Matrix:")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred), sep="")
# Verdict: reject

Confusion_Matrix:
[[258  84   1   0  29   1  47]
 [106 203   5   0  96  19   4]
 [  0  10 247  43  15 125   0]
 [  0   0  27 379   0  19   0]
 [ 12  59  21   0 306  26   0]
 [  1   7 103  38  47 242   0]
 [ 67   5   1   0   0   0 371]]
              precision    recall  f1-score   support

           1       0.58      0.61      0.60       420
           2       0.55      0.47      0.51       433
           3       0.61      0.56      0.58       440
           4       0.82      0.89      0.86       425
           5       0.62      0.72      0.67       424
           6       0.56      0.55      0.56       438
           7       0.88      0.84      0.86       444

    accuracy                           0.66      3024
   macro avg       0.66      0.66      0.66      3024
weighted avg       0.66      0.66      0.66      3024

Accuracy:0.6633597883597884


In [10]:
# Support vector classifier exposing the pairwise (one-vs-one) decision function:
# with 7 classes, C(7,2) binary classifiers are trained and their votes are combined.
# ovo is preferable for SVM
svm = SVC(decision_function_shape='ovo')
y_pred = svm.fit(X_train, y_train).predict(X_val)
print("Confusion_Matrix:")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred), sep="")
# Verdict: reject

Confusion_Matrix:
[[277  94   0   0  14   1  34]
 [ 97 257   9   0  50  15   5]
 [  0   2 289  40   9 100   0]
 [  0   0   9 406   0  10   0]
 [  6  31   6   0 368  13   0]
 [  0   6  66  17   5 344   0]
 [ 46   0   0   0   0   0 398]]
              precision    recall  f1-score   support

           1       0.65      0.66      0.65       420
           2       0.66      0.59      0.62       433
           3       0.76      0.66      0.71       440
           4       0.88      0.96      0.91       425
           5       0.83      0.87      0.85       424
           6       0.71      0.79      0.75       438
           7       0.91      0.90      0.90       444

    accuracy                           0.77      3024
   macro avg       0.77      0.77      0.77      3024
weighted avg       0.77      0.77      0.77      3024

Accuracy:0.7734788359788359


In [11]:
# SVC always trains one-vs-one classifiers internally (C(7,2) pairwise models here);
# decision_function_shape='ovr' only changes the shape of decision_function()'s output
# to (n_samples, n_classes). It does NOT train 7 one-vs-rest models, which is why the
# predictions and metrics are identical to the 'ovo' run.
svm2 = SVC(decision_function_shape='ovr')
svm2.fit(X_train, y_train)
y_pred = svm2.predict(X_val)
print("Confusion_Matrix:")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# same result as ovo
# reject

Confusion_Matrix:
[[277  94   0   0  14   1  34]
 [ 97 257   9   0  50  15   5]
 [  0   2 289  40   9 100   0]
 [  0   0   9 406   0  10   0]
 [  6  31   6   0 368  13   0]
 [  0   6  66  17   5 344   0]
 [ 46   0   0   0   0   0 398]]
              precision    recall  f1-score   support

           1       0.65      0.66      0.65       420
           2       0.66      0.59      0.62       433
           3       0.76      0.66      0.71       440
           4       0.88      0.96      0.91       425
           5       0.83      0.87      0.85       424
           6       0.71      0.79      0.75       438
           7       0.91      0.90      0.90       444

    accuracy                           0.77      3024
   macro avg       0.77      0.77      0.77      3024
weighted avg       0.77      0.77      0.77      3024

Accuracy:0.7734788359788359


In [12]:
# Gaussian naive Bayes baseline, built with the library's default settings.
modeling(NB) # reject

Confusion_Matrix:
[[ 61   1   2   0 125   7 224]
 [ 62   2  28   4 200  11 126]
 [  0   0 181 258   1   0   0]
 [  0   0   0 425   0   0   0]
 [  0   0  92   0 313   8  11]
 [  0   0 128 256  23  28   3]
 [  4   1   1   0  10   0 428]]
              precision    recall  f1-score   support

           1       0.48      0.15      0.22       420
           2       0.50      0.00      0.01       433
           3       0.42      0.41      0.42       440
           4       0.45      1.00      0.62       425
           5       0.47      0.74      0.57       424
           6       0.52      0.06      0.11       438
           7       0.54      0.96      0.69       444

    accuracy                           0.48      3024
   macro avg       0.48      0.48      0.38      3024
weighted avg       0.48      0.48      0.38      3024

Accuracy:0.4755291005291005


In [13]:
# Random forest baseline. The forest is randomised, so an unseeded run cannot reproduce
# the printed metrics; pin the seed (same value as the train/validation split).
# modeling() calls its argument with no parameters, hence the zero-argument factory.
modeling(lambda: RF(random_state=1)) # accept # worse than 1st

Confusion_Matrix:
[[319  66   0   0  10   0  25]
 [ 85 284   6   0  44  11   3]
 [  0   3 336  24   6  71   0]
 [  0   0   7 413   0   5   0]
 [  1  17   3   0 394   9   0]
 [  0   2  46  15   5 370   0]
 [ 24   0   0   0   0   0 420]]
              precision    recall  f1-score   support

           1       0.74      0.76      0.75       420
           2       0.76      0.66      0.71       433
           3       0.84      0.76      0.80       440
           4       0.91      0.97      0.94       425
           5       0.86      0.93      0.89       424
           6       0.79      0.84      0.82       438
           7       0.94      0.95      0.94       444

    accuracy                           0.84      3024
   macro avg       0.84      0.84      0.84      3024
weighted avg       0.84      0.84      0.84      3024

Accuracy:0.8386243386243386


In [14]:
# AdaBoost baseline with default settings. In the recorded run class 4 is never predicted,
# so its precision is undefined and classification_report emits a warning.
modeling(ABC) # reject

Confusion_Matrix:
[[ 89   6   0   0 200   0 125]
 [ 36  21   1   0 307  41  27]
 [  0   1  18   0  22 399   0]
 [  0   0   0   0   0 425   0]
 [  0   9   0   0 370  45   0]
 [  0   0  13   0  30 395   0]
 [ 34   1   0   0   0   0 409]]
              precision    recall  f1-score   support

           1       0.56      0.21      0.31       420
           2       0.55      0.05      0.09       433
           3       0.56      0.04      0.08       440
           4       0.00      0.00      0.00       425
           5       0.40      0.87      0.55       424
           6       0.30      0.90      0.45       438
           7       0.73      0.92      0.81       444

    accuracy                           0.43      3024
   macro avg       0.44      0.43      0.33      3024
weighted avg       0.45      0.43      0.33      3024

Accuracy:0.4305555555555556


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Gradient boosting (scikit-learn) baseline with default settings.
# NOTE(review): "1st" presumably refers to the first iteration of this experiment
# mentioned in the final cell -- TODO confirm.
modeling(GBC) # reject # worse than 1st

Confusion_Matrix:
[[293  68   0   0  16   0  43]
 [111 245   6   0  56  13   2]
 [  0   1 294  29  15 101   0]
 [  0   0  10 406   0   9   0]
 [  3  24   9   0 373  15   0]
 [  0   6  69  14  18 331   0]
 [ 29   0   0   0   0   0 415]]
              precision    recall  f1-score   support

           1       0.67      0.70      0.68       420
           2       0.71      0.57      0.63       433
           3       0.76      0.67      0.71       440
           4       0.90      0.96      0.93       425
           5       0.78      0.88      0.83       424
           6       0.71      0.76      0.73       438
           7       0.90      0.93      0.92       444

    accuracy                           0.78      3024
   macro avg       0.78      0.78      0.78      3024
weighted avg       0.78      0.78      0.78      3024

Accuracy:0.779431216931217


In [16]:
# XGBoost baseline with default settings.
# NOTE(review): the labels here are 1..7; newer xgboost releases appear to require
# 0-based class labels -- confirm against the installed version before re-running.
# NOTE(review): "1st" presumably refers to the first iteration of this experiment
# mentioned in the final cell -- TODO confirm.
modeling(XGBC) # reject # better than 1st

Confusion_Matrix:
[[279  56   0   0  27   0  58]
 [120 199   5   0  90  14   5]
 [  0   0 270  36  21 113   0]
 [  0   0   9 411   0   5   0]
 [  2  23   7   0 374  18   0]
 [  0   0  99  24  17 298   0]
 [ 35   0   0   0   0   0 409]]
              precision    recall  f1-score   support

           1       0.64      0.66      0.65       420
           2       0.72      0.46      0.56       433
           3       0.69      0.61      0.65       440
           4       0.87      0.97      0.92       425
           5       0.71      0.88      0.78       424
           6       0.67      0.68      0.67       438
           7       0.87      0.92      0.89       444

    accuracy                           0.74      3024
   macro avg       0.74      0.74      0.73      3024
weighted avg       0.74      0.74      0.73      3024

Accuracy:0.7407407407407407


In [17]:
# LightGBM baseline with default settings.
# NOTE(review): "1st" presumably refers to the first iteration of this experiment
# mentioned in the final cell -- TODO confirm.
modeling(LGBC) # reject # better than 1st

Confusion_Matrix:
[[302  74   0   0  12   0  32]
 [ 95 271   9   0  44  11   3]
 [  0   1 336  15   7  81   0]
 [  0   0   7 412   0   6   0]
 [  1  19   2   0 392  10   0]
 [  2   6  38  12   5 375   0]
 [ 16   1   0   0   0   0 427]]
              precision    recall  f1-score   support

           1       0.73      0.72      0.72       420
           2       0.73      0.63      0.67       433
           3       0.86      0.76      0.81       440
           4       0.94      0.97      0.95       425
           5       0.85      0.92      0.89       424
           6       0.78      0.86      0.81       438
           7       0.92      0.96      0.94       444

    accuracy                           0.83      3024
   macro avg       0.83      0.83      0.83      3024
weighted avg       0.83      0.83      0.83      3024

Accuracy:0.8316798941798942


In [None]:
# conclusion: use 1st iteration