In [1]:
# Data wrangling
import pandas as pd

# Data modelling
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Data evaluation
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import GridSearchCV

Import Dataset

In [2]:
train = pd.read_csv('Training_Data.csv')

In [3]:
test = pd.read_csv('Test_Data.csv')

Data Modelling - RandomForest

In [None]:
rf = RandomForestClassifier()
param_rf = {
    'n_estimators': [400, 500, 600],
    'max_depth' : [5,10, 15, None]
}
grid_rf = GridSearchCV(rf, param_grid = param_rf, cv = 5, n_jobs = -1, scoring = precision_score)

In [None]:
grid_rf.fit(train[['roc','ey']], train['label'])



GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15, None],
                         'n_estimators': [400, 500, 600]},
             scoring=<function precision_score at 0x7f9150707830>)

In [None]:
grid_rf.best_params_

{'max_depth': 5, 'n_estimators': 400}

In [None]:
pred_rf = grid_rf.predict(test[['roc','ey']])

In [None]:
print(classification_report(pred_rf, test['label']))

              precision    recall  f1-score   support

           0       0.95      0.78      0.86       230
           1       0.33      0.71      0.45        35

    accuracy                           0.77       265
   macro avg       0.64      0.75      0.66       265
weighted avg       0.87      0.77      0.80       265



In [None]:
accuracy_score(pred_rf, test['label'])

0.7735849056603774

Data Modelling - XGBoost

In [62]:
xgb = XGBClassifier()
param_xgb = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth' : [3, 5, 10]#, 15, None]        
}
grid_xgb = GridSearchCV(xgb, param_grid = param_xgb, cv = 5, n_jobs = -1) #, scoring = precision_score)

In [63]:
grid_xgb.fit(train[['roc','ey']], train['label'])

GridSearchCV(cv=5, estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 10],
                         'n_estimators': [100, 200, 500, 1000]})

In [64]:
grid_xgb.best_params_

{'max_depth': 10, 'n_estimators': 200}

In [65]:
pred_xgb = grid_xgb.predict(test[['roc','ey']])

In [66]:
print(confusion_matrix(pred_xgb, test['label']))
print(classification_report(pred_xgb, test['label']))

[[172  23]
 [ 18  52]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       195
           1       0.69      0.74      0.72        70

    accuracy                           0.85       265
   macro avg       0.80      0.81      0.81       265
weighted avg       0.85      0.85      0.85       265



In [67]:
accuracy_score(pred_xgb, test['label'])

0.8452830188679246

Data Modelling - LightGBM

In [46]:
lgb = LGBMClassifier()
param_lgb = {
    'n_estimators': [1000, 1300, 1500],
    'max_depth' : [8, 10, 15, None],
    'num_leaves' : [16, 20, 24]
}
grid_lgb = GridSearchCV(lgb, param_grid = param_lgb, cv = 5, n_jobs = -1) #, scoring = f1_score)

In [47]:
grid_lgb.fit(train[['roc','ey']], train['label'])

GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'max_depth': [8, 10, 15, None],
                         'n_estimators': [1000, 1300, 1500],
                         'num_leaves': [16, 20, 24]})

In [48]:
grid_lgb.best_params_

{'max_depth': 10, 'n_estimators': 1300, 'num_leaves': 20}

In [49]:
pred_lgb = grid_lgb.predict(test[['roc','ey']])

In [50]:
print(confusion_matrix(pred_lgb, test['label']))
print(classification_report(pred_lgb, test['label']))

[[175  22]
 [ 15  53]]
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       197
           1       0.71      0.78      0.74        68

    accuracy                           0.86       265
   macro avg       0.81      0.83      0.82       265
weighted avg       0.87      0.86      0.86       265



In [51]:
accuracy_score(pred_lgb, test['label'])

0.8603773584905661