In [3]:
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [4]:
X_train = np.load('~/Classification/x_train.npy')
X_test = np.load('~/Classification/x_test.npy')
Y_test = np.load('~/Classification/Y_test.npy')
Y_train = np.load('~/Classification/Y_train.npy')

In [5]:
# Running the logistic regression algorithm and checking with the best parameter setting
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

In [6]:
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [7]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.970359 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
0.969579 (0.020370) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.560842 (0.170707) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.951638 (0.024230) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.969579 (0.020370) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.560842 (0.170707) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.952418 (0.025486) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.969579 (0.020370) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.560842 (0.170707) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.952418 (0.025486) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.968799 (0.023405) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.560842 (0.170707) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.952418 (0.025486) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.970359 (0.023842) wi

In [8]:
# Running the KNN algorithm and checking with the best parameter setting
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean']

In [9]:
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [10]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.634945 using {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
0.618565 (0.072510) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.618565 (0.072510) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.623245 (0.063864) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.620125 (0.068068) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.606864 (0.065572) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.623245 (0.074998) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.600624 (0.073418) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.629485 (0.070529) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.591264 (0.079984) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.625585 (0.074266) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.605304

In [11]:
# Running the support vector machine algorithm and checking with the best parameter setting
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

In [12]:
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [13]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.538222 using {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.507540 (0.060487) with: {'C': 50, 'gamma': 'scale', 'kernel': 'poly'}
0.525221 (0.065794) with: {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.470619 (0.070788) with: {'C': 50, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.507540 (0.060487) with: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.538222 (0.063038) with: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.480499 (0.066870) with: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.506500 (0.058664) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}
0.509100 (0.041347) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.495060 (0.052621) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.513781 (0.065654) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
0.507020 (0.013722) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
0.510140 (0.003588) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.511700 (0.005974) with: {'C': 0.01, 'gamma': 'scale', 

In [14]:
# Running the Decision Tree with Bagging algorithm and checking with the best parameter setting
model = BaggingClassifier()
n_estimators = [10, 100, 1000]

In [15]:
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [16]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.972959 using {'n_estimators': 1000}
0.966719 (0.018167) with: {'n_estimators': 10}
0.971919 (0.019764) with: {'n_estimators': 100}
0.972959 (0.021884) with: {'n_estimators': 1000}


In [17]:
# Running the Random Forest algorithm and checking with the best parameter setting
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

In [18]:
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [19]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.981799 using {'max_features': 'log2', 'n_estimators': 1000}
0.977119 (0.016915) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.981279 (0.017678) with: {'max_features': 'sqrt', 'n_estimators': 100}
0.980759 (0.017357) with: {'max_features': 'sqrt', 'n_estimators': 1000}
0.975039 (0.019884) with: {'max_features': 'log2', 'n_estimators': 10}
0.979719 (0.015626) with: {'max_features': 'log2', 'n_estimators': 100}
0.981799 (0.017542) with: {'max_features': 'log2', 'n_estimators': 1000}


In [20]:
# Running the XGBoost algorithm and checking with the best parameter setting
model = GradientBoostingClassifier()
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

In [21]:
grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, Y_train)

In [22]:
# Showing the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.980759 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.510140 (0.003588) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.510140 (0.003588) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.510140 (0.003588) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.965679 (0.024761) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.964119 (0.023345) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.963079 (0.022460) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.969839 (0.022225) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5}
0.964639 (0.023295) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.7}
0.962038 (0.022163) with: {'learning_rate': 0.001, 'ma

In [23]:
# Select only the best classifiers with the best parameter settings
Classifiers = {
    'Logistic Regression': LogisticRegression(C=0.01, penalty='l2', solver='newton-cg'),
    'Bagging Classifier': BaggingClassifier(n_estimators=1000),
    'Random Forest': RandomForestClassifier(max_features='sqrt', n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(learning_rate=0.1, max_depth=7, n_estimators=100, subsample=0.5)
}

In [25]:
# Showing the main metrics
for key, classifier in Classifiers.items():
    classifier_fitted = classifier.fit(X_train, Y_train)
    y_pred = classifier_fitted.predict(X_test)
    print('\n Classifier: ', key,'\n', classification_report(Y_test, y_pred))
    print('\n Confusion Matrix: ', key,'\n', confusion_matrix(Y_test, y_pred))
    print('\n ROC Score: ', key, '\n', roc_auc_score(Y_test,y_pred))


 Classifier:  Logistic Regression 
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       144
           1       0.98      0.94      0.96       131

    accuracy                           0.96       275
   macro avg       0.96      0.96      0.96       275
weighted avg       0.96      0.96      0.96       275


 Confusion Matrix:  Logistic Regression 
 [[141   3]
 [  8 123]]

 ROC Score:  Logistic Regression 
 0.959048982188295

 Classifier:  Bagging Classifier 
               precision    recall  f1-score   support

           0       0.97      0.96      0.96       144
           1       0.95      0.96      0.96       131

    accuracy                           0.96       275
   macro avg       0.96      0.96      0.96       275
weighted avg       0.96      0.96      0.96       275


 Confusion Matrix:  Bagging Classifier 
 [[138   6]
 [  5 126]]

 ROC Score:  Bagging Classifier 
 0.9600826972010179

 Classifier:  Random Forest 
    