In [1]:
## Import all the required libraries and functions
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import RUSBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

In [2]:
## Import the data and split the data set into X and y variables
data = pd.read_csv(r'C:\Users\Ardon\Documents\Thesis\data_1step_5runs.csv')
data = data.drop(['Unnamed: 0'], axis=1)
X = data.loc[:, data.columns != 'SheetBreak']
y = data.loc[:, data.columns == 'SheetBreak']

In [3]:
## Split the data into a training set and validation set 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 92)

## Ravel the y data. Ravel means turning a data frame into an array
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)

## Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)

In [4]:
## RusBoost with a decision tree as base estimator 
start = time.time()
clf_rus = RUSBoostClassifier(random_state = 166, base_estimator = DecisionTreeClassifier())

found = clf_rus.fit(X_train, y_train)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
stop = time.time()
print(stop-start)

[[3877  133]
 [  45   10]]
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      4010
         1.0       0.07      0.18      0.10        55

    accuracy                           0.96      4065
   macro avg       0.53      0.57      0.54      4065
weighted avg       0.98      0.96      0.97      4065

1.0799987316131592


In [51]:
## RusBoost with a random forest classifier as base estimator 
clf_rus = RUSBoostClassifier(random_state = 24, base_estimator = RandomForestClassifier())

found = clf_rus.fit(X_train, y_train)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
#print(found.best_params_)

[[3913   97]
 [  37   18]]
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98      4010
         1.0       0.16      0.33      0.21        55

    accuracy                           0.97      4065
   macro avg       0.57      0.65      0.60      4065
weighted avg       0.98      0.97      0.97      4065



In [18]:
## RusBoost with a support vector machine as base estimator 
start = time.time()
clf_rus = RUSBoostClassifier(random_state = 78, base_estimator = LinearSVC(dual=False), algorithm='SAMME')

param = {'n_estimators': [10, 25, 50, 100],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__C' : [1000,2000,3000],
         'base_estimator__max_iter': [2000]}

search = GridSearchCV(clf_rus, param, cv=5)
found = search.fit(X_train, y_train)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
print(found.best_params_)
stop = time.time()
print(stop-start)

[[2839 1171]
 [  11   44]]
              precision    recall  f1-score   support

         0.0       1.00      0.71      0.83      4010
         1.0       0.04      0.80      0.07        55

    accuracy                           0.71      4065
   macro avg       0.52      0.75      0.45      4065
weighted avg       0.98      0.71      0.82      4065

{'base_estimator__C': 1000, 'base_estimator__max_iter': 2000, 'learning_rate': 0.1, 'n_estimators': 100}
214.758150100708


In [19]:
## Manual resampling of the data
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

In [39]:
## AdaBoost with a decision tree as base estimator
start = time.time()
clf_ada = AdaBoostClassifier(random_state = 145, base_estimator = DecisionTreeClassifier())

param = {'n_estimators': [100, 150, 200],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__min_samples_leaf': [1,2,3],
         'base_estimator__max_depth': [5, 7, 9]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
stop = time.time()
print(stop-start)
print(found.best_params_)

[[2929 1081]
 [   4   51]]
              precision    recall  f1-score   support

         0.0       1.00      0.73      0.84      4010
         1.0       0.05      0.93      0.09        55

    accuracy                           0.73      4065
   macro avg       0.52      0.83      0.46      4065
weighted avg       0.99      0.73      0.83      4065

1341.4775426387787
{'base_estimator__max_depth': 7, 'base_estimator__min_samples_leaf': 1, 'learning_rate': 1, 'n_estimators': 200}


In [33]:
## AdaBoost with a random forest classifier as base estimator 
start  = time.time()
clf_ada = AdaBoostClassifier(random_state = 676, base_estimator = RandomForestClassifier())

param = {'n_estimators': [10,20,50,100],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
print(found.best_params_)
stop = time.time()
print(stop-start)

[[2856 1154]
 [   7   48]]
              precision    recall  f1-score   support

         0.0       1.00      0.71      0.83      4010
         1.0       0.04      0.87      0.08        55

    accuracy                           0.71      4065
   macro avg       0.52      0.79      0.45      4065
weighted avg       0.98      0.71      0.82      4065

{'learning_rate': 0.001, 'n_estimators': 10}
22.65563941001892


In [29]:
## AdaBoost with a support vector machine as base estimator
start = time.time()
clf_ada = AdaBoostClassifier(random_state = 2, base_estimator = LinearSVC(), algorithm='SAMME')

param = {'n_estimators': [10, 25, 50, 100, 200],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
print(found.best_params_)
stop = time.time()
print(stop-start)

[[2677 1333]
 [  14   41]]
              precision    recall  f1-score   support

         0.0       0.99      0.67      0.80      4010
         1.0       0.03      0.75      0.06        55

    accuracy                           0.67      4065
   macro avg       0.51      0.71      0.43      4065
weighted avg       0.98      0.67      0.79      4065

{'learning_rate': 1, 'n_estimators': 25}
30.74560284614563


In [37]:
## XGBoost with a support vector machine as base estimator 
clf_xgb = xgb.XGBClassifier(random_state=7)

param = {'n_estimators': [10, 25, 50, 100, 200],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}

search = GridSearchCV(clf_xgb, param, cv=5)
found = search.fit(X_res, y_res) 
predicted = found.predict(X_val)
print(confusion_matrix(y_val, predicted))
print(classification_report(y_val, predicted))
print(found.best_params_)

[[2604 1406]
 [   7   48]]
              precision    recall  f1-score   support

         0.0       1.00      0.65      0.79      4010
         1.0       0.03      0.87      0.06        55

    accuracy                           0.65      4065
   macro avg       0.52      0.76      0.43      4065
weighted avg       0.98      0.65      0.78      4065

{'learning_rate': 0.1, 'n_estimators': 200}
