In [1]:
import os, sys
import pandas as pd

# Train test splitting and performance
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from lightgbm.sklearn import LGBMClassifier

# One seed shared by the train/test split and every estimator below,
# so repeated runs of the notebook produce the same numbers.
SEED = 100

# Load the training table: the 'target' column is the label and every
# remaining column is used as a feature.
train_df = pd.read_parquet("../data/sentence_train.pq")
y = train_df['target'].to_numpy()
X = train_df.drop(columns=['target']).to_numpy()

# Hold-out split stratified on the label so both partitions keep the same
# class proportions. No test_size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=SEED,
)

In [3]:
# Baseline 1: LightGBM with library-default hyperparameters
# (only the seed is fixed and logging is silenced).
lgbm = LGBMClassifier(random_state=SEED, verbosity=-1).fit(X_train, y_train)

# Score on the held-out split and print the per-class report.
y_pred = lgbm.predict(X_test)
print(" ----- LightGBM -----")
print(classification_report(y_test, y_pred))

 ----- LightGBM -----
              precision    recall  f1-score   support

           0       0.80      0.87      0.84      1086
           1       0.81      0.72      0.76       818

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.80      1904
weighted avg       0.80      0.80      0.80      1904



In [6]:
# Baseline 2: random forest with default hyperparameters, seeded for repeatability.
rf = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

# Score on the held-out split and print the per-class report.
y_pred = rf.predict(X_test)
print(" ----- Random Forest -----")
print(classification_report(y_test, y_pred))

 ----- Random Forest -----
              precision    recall  f1-score   support

           0       0.78      0.90      0.84      1086
           1       0.83      0.66      0.74       818

    accuracy                           0.80      1904
   macro avg       0.81      0.78      0.79      1904
weighted avg       0.80      0.80      0.79      1904



In [7]:
# Baseline 3: support vector classifier with default hyperparameters.
svm = SVC(random_state=SEED).fit(X_train, y_train)

# Score on the held-out split and print the per-class report.
y_pred = svm.predict(X_test)
print(" ----- SVM -----")
print(classification_report(y_test, y_pred))

 ----- SVM -----
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1086
           1       0.83      0.72      0.77       818

    accuracy                           0.82      1904
   macro avg       0.82      0.80      0.81      1904
weighted avg       0.82      0.82      0.81      1904



In [8]:
# Baseline 4: Gaussian process classifier with default hyperparameters.
gpc = GaussianProcessClassifier(random_state=SEED).fit(X_train, y_train)

# Score on the held-out split and print the per-class report.
y_pred = gpc.predict(X_test)
print(" ----- GPC -----")
print(classification_report(y_test, y_pred))

 ----- GPC -----
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1086
           1       0.79      0.71      0.75       818

    accuracy                           0.80      1904
   macro avg       0.79      0.79      0.79      1904
weighted avg       0.80      0.80      0.79      1904



In [2]:
# Stacked ensemble: the random forest, SVM and Gaussian process act as base
# learners, and a LightGBM model is trained on their outputs as the combiner.
# Fresh, unfitted estimators are created here rather than reusing the
# already-fitted `rf` / `svm` / `gpc` objects from the baseline cells.
ensemble = StackingClassifier(
    n_jobs=-1,  # -1 asks scikit-learn to use all available processors
    final_estimator=LGBMClassifier(verbosity=-1, random_state=SEED),
    estimators=[
        ('rf', RandomForestClassifier(random_state=SEED)),
        ('svm', SVC(random_state=SEED)),
        ('gpc', GaussianProcessClassifier(random_state=SEED)),
    ],
).fit(X_train, y_train)

# Score on the same held-out split used for the individual baselines.
y_pred = ensemble.predict(X_test)
print(" ----- Stacking Classification -----")
print(classification_report(y_test, y_pred))

 ----- Stacking Classification -----
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1086
           1       0.81      0.71      0.75       818

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904

