In [1]:
import pandas as pd
import numpy as np
import sys, os

sys.path.append(os.getcwd() + "/../")

from src.utils import generate_lda

# Train test splitting and performance
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier

SEED = 100
train_df, lda = generate_lda(seed=SEED)
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=['target']), train_df['target'], stratify=train_df['target'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/heinrikchoong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/heinrikchoong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
lr = LogisticRegression(random_state=SEED)
lr.fit(X=X_train, y=y_train)
y_pred = lr.predict(X_test)
print(" ----- Logistic Regression -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- Logistic Regression -----
              precision    recall  f1-score   support

           0       0.63      0.87      0.73      1086
           1       0.65      0.32      0.43       818

    accuracy                           0.63      1904
   macro avg       0.64      0.59      0.58      1904
weighted avg       0.64      0.63      0.60      1904



In [3]:
lgbm = LGBMClassifier(verbosity=-1, random_state=SEED)
lgbm.fit(X=X_train, y=y_train)
y_pred = lgbm.predict(X_test)
print(" ----- LightGBM -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- LightGBM -----
              precision    recall  f1-score   support

           0       0.68      0.82      0.75      1086
           1       0.68      0.49      0.57       818

    accuracy                           0.68      1904
   macro avg       0.68      0.66      0.66      1904
weighted avg       0.68      0.68      0.67      1904



In [4]:
xgb = XGBClassifier(random_state=SEED)
xgb.fit(X=X_train, y=y_train)
y_pred = xgb.predict(X_test)
print(" ----- XGBoost -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- XGBoost -----
              precision    recall  f1-score   support

           0       0.69      0.79      0.74      1086
           1       0.65      0.53      0.59       818

    accuracy                           0.68      1904
   macro avg       0.67      0.66      0.66      1904
weighted avg       0.68      0.68      0.67      1904



In [5]:
ada = AdaBoostClassifier(random_state=SEED)
ada.fit(X=X_train, y=y_train)
y_pred = ada.predict(X_test)
print(" ----- AdaBoost -----")
print(classification_report(y_true=y_test, y_pred=y_pred))



 ----- AdaBoost -----
              precision    recall  f1-score   support

           0       0.66      0.82      0.73      1086
           1       0.64      0.44      0.53       818

    accuracy                           0.66      1904
   macro avg       0.65      0.63      0.63      1904
weighted avg       0.65      0.66      0.64      1904



In [6]:
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X=X_train, y=y_train)
y_pred = rf.predict(X_test)
print(" ----- Random Forest -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- Random Forest -----
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      1086
           1       0.66      0.53      0.59       818

    accuracy                           0.68      1904
   macro avg       0.68      0.66      0.66      1904
weighted avg       0.68      0.68      0.67      1904



In [7]:
svm = SVC(random_state=SEED)
svm.fit(X=X_train, y=y_train)
y_pred = svm.predict(X_test)
print(" ----- SVM -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- SVM -----
              precision    recall  f1-score   support

           0       0.63      0.87      0.73      1086
           1       0.66      0.32      0.43       818

    accuracy                           0.64      1904
   macro avg       0.64      0.60      0.58      1904
weighted avg       0.64      0.64      0.60      1904



In [8]:
gpc = GaussianProcessClassifier(random_state=SEED)
gpc.fit(X=X_train, y=y_train)
y_pred = gpc.predict(X_test)
print(" ----- GPC -----")
print(classification_report(y_true=y_test, y_pred=y_pred))

 ----- GPC -----
              precision    recall  f1-score   support

           0       0.63      0.87      0.73      1086
           1       0.65      0.33      0.43       818

    accuracy                           0.63      1904
   macro avg       0.64      0.60      0.58      1904
weighted avg       0.64      0.63      0.60      1904



In [9]:
ensemble = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=SEED)),
        ('rf', RandomForestClassifier(random_state=SEED)),
        ('svm', SVC(random_state=SEED)),
        ('gpc', GaussianProcessClassifier(random_state=SEED)),
    ],
    final_estimator=AdaBoostClassifier(estimator=LGBMClassifier(verbosity=-1, random_state=SEED), random_state=SEED),
    n_jobs=-1,
)

ensemble.fit(X=X_train, y=y_train)
y_pred = ensemble.predict(X_test)
print(" ----- Stacking Classification -----")
print(classification_report(y_true=y_test, y_pred=y_pred))



 ----- Stacking Classification -----
              precision    recall  f1-score   support

           0       0.65      0.71      0.68      1086
           1       0.56      0.50      0.53       818

    accuracy                           0.62      1904
   macro avg       0.61      0.60      0.60      1904
weighted avg       0.61      0.62      0.62      1904

