In [1]:
## Import all the required libraries and functions
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import RUSBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

In [2]:
## Import the data and split the data set into X and y variables
data = pd.read_csv(r'C:\Users\Ardon\Documents\Thesis\data_2steps_1run.csv')
data = data.drop(['Unnamed: 0'], axis=1)
X = data.loc[:, data.columns != 'SheetBreak']
y = data.loc[:, data.columns == 'SheetBreak']

In [3]:
## Split the data into a training set and validation set 
X_train1, X_val1, y_train1, y_val1 = train_test_split(X, y, test_size = 0.2, random_state = 92)

## Ravel the y data. Ravel means turning a data frame into an array
y_train1 = np.ravel(y_train1)
y_val1 = np.ravel(y_val1)

## Scale the data
scaler = StandardScaler()
X_train1 = scaler.fit_transform(X_train1)
X_val1 = scaler.fit_transform(X_val1)

In [5]:
## RusBoost with a decision tree as base estimator 
start = time.time()
clf_rus = RUSBoostClassifier(random_state = 12, base_estimator = DecisionTreeClassifier())

param = {'n_estimators': [10, 25, 50, 100],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__min_samples_leaf': [2,3,4]}

search = GridSearchCV(clf_rus, param, cv=5)
found = search.fit(X_train1, y_train1)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)
stop = time.time()
print(stop-start)

[[3976   34]
 [  48    7]]
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      4010
         1.0       0.17      0.13      0.15        55

    accuracy                           0.98      4065
   macro avg       0.58      0.56      0.57      4065
weighted avg       0.98      0.98      0.98      4065

{'base_estimator__min_samples_leaf': 3, 'learning_rate': 0.1, 'n_estimators': 100}
314.64184737205505


In [10]:
## RusBoost with a random forest classifier as base estimator 
clf_rus = RUSBoostClassifier(random_state = 124, base_estimator = RandomForestClassifier())

param = {'n_estimators': [10, 25, 50, 100],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__min_samples_split': [2,3,4]}


search = GridSearchCV(clf_rus, param, cv=5)
found = search.fit(X_train1, y_train1)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

[[3982   28]
 [  46    9]]
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      4010
         1.0       0.24      0.16      0.20        55

    accuracy                           0.98      4065
   macro avg       0.62      0.58      0.59      4065
weighted avg       0.98      0.98      0.98      4065

{'base_estimator__min_samples_split': 2, 'learning_rate': 1, 'n_estimators': 100}


In [8]:
## RusBoost with a support vector machine as base estimator 
clf_rus = RUSBoostClassifier(random_state = 96, base_estimator = LinearSVC(), algorithm='SAMME')

param = {'n_estimators': [10, 25, 50, 100],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}

search = GridSearchCV(clf_rus, param, cv=5)
found = search.fit(X_train1, y_train1)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifi

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifi

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.

ValueError: BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.



[[2268 1742]
 [  16   39]]
              precision    recall  f1-score   support

         0.0       0.99      0.57      0.72      4010
         1.0       0.02      0.71      0.04        55

    accuracy                           0.57      4065
   macro avg       0.51      0.64      0.38      4065
weighted avg       0.98      0.57      0.71      4065

{'learning_rate': 0.001, 'n_estimators': 10}


In [9]:
## Manual resampling of the data
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train1, y_train1)

In [14]:
## AdaBoost with a decision tree as base estimator 
clf_ada = AdaBoostClassifier(random_state = 13, base_estimator = DecisionTreeClassifier())

param = {'n_estimators': [10, 25, 50, 100, 150],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__min_samples_leaf': [2]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

[[2977 1033]
 [   6   49]]
              precision    recall  f1-score   support

         0.0       1.00      0.74      0.85      4010
         1.0       0.05      0.89      0.09        55

    accuracy                           0.74      4065
   macro avg       0.52      0.82      0.47      4065
weighted avg       0.99      0.74      0.84      4065

{'base_estimator__min_samples_leaf': 2, 'learning_rate': 0.1, 'n_estimators': 150}


In [18]:
## AdaBoost with a random forest classifier as base estimator 
clf_ada = AdaBoostClassifier(random_state = 13, base_estimator = RandomForestClassifier())

param = {'n_estimators': [10, 25, 50, 100, 150],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

[[2957 1053]
 [   7   48]]
              precision    recall  f1-score   support

         0.0       1.00      0.74      0.85      4010
         1.0       0.04      0.87      0.08        55

    accuracy                           0.74      4065
   macro avg       0.52      0.81      0.47      4065
weighted avg       0.98      0.74      0.84      4065

{'learning_rate': 0.001, 'n_estimators': 10}


In [19]:
## AdaBoost with a support vector machine as base estimator 
clf_ada = AdaBoostClassifier(random_state = 135, base_estimator = LinearSVC(), algorithm='SAMME')

param = {'n_estimators': [10, 25, 50, 100, 200],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'base_estimator__C' : [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 5]}

search = GridSearchCV(clf_ada, param, cv=5)
found = search.fit(X_res, y_res)  
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

[[2511 1499]
 [  17   38]]
              precision    recall  f1-score   support

         0.0       0.99      0.63      0.77      4010
         1.0       0.02      0.69      0.05        55

    accuracy                           0.63      4065
   macro avg       0.51      0.66      0.41      4065
weighted avg       0.98      0.63      0.76      4065

{'base_estimator__C': 0.5, 'learning_rate': 0.001, 'n_estimators': 50}


In [114]:
data_dmatrix = xgb.DMatrix(data=X_res,label=y_res)

In [23]:
## AdaBoost with a support vector machine as base estimator 
clf_xgb = xgb.XGBClassifier(random_state=53)

param = {'n_estimators': [10, 25, 50, 100, 200],
         'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
         'gamma': [0,0.5,1],
         'max_depth': [2,4,6,8,10]}

search = GridSearchCV(clf_xgb, param, cv=5)
found = search.fit(X_res, y_res) 
predicted = found.predict(X_val1)
print(confusion_matrix(y_val1, predicted))
print(classification_report(y_val1, predicted))
print(found.best_params_)

[[2881 1129]
 [   9   46]]
              precision    recall  f1-score   support

         0.0       1.00      0.72      0.84      4010
         1.0       0.04      0.84      0.07        55

    accuracy                           0.72      4065
   macro avg       0.52      0.78      0.45      4065
weighted avg       0.98      0.72      0.82      4065

{'gamma': 0, 'learning_rate': 0.5, 'max_depth': 8, 'n_estimators': 25}
