# Note

**Because feature selection retained only one molecular-function feature, we dropped it and built the models using only the selected drug descriptors and protein sequence descriptors.**

# Imports

In [74]:
# General Imports
import os
import pandas as pd
import numpy as np
from modify_dataset import load_from_pickle, load_to_pickle

# Plotting libraries
import plotly.express as px

# Dimensionality reduction
from sklearn.decomposition import PCA

# Scalers
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_absolute_error, r2_score

# Classification Models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Error Analysis
# import eli5
# from lime.lime_tabular import LimeTabularExplainer

# Scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Other Model Stuff
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Training & Test Sets

In [75]:
# Array of the selected feature names; it is used further down as the column
# labels of the error-analysis DataFrame.
# NOTE(review): allow_pickle=True unpickles the file's contents, so this file
# must come from a trusted source.
feature_selection_columns = np.load("Dataset_Files/Feature_Selection/features_dd_psd_list.npy",
                                    allow_pickle=True)

In [76]:
# Training split: feature matrix (feature-selected columns) and its labels.
X_train = np.load("Dataset_Files/Training_Test_Sets/X_train_dd_psd_feature_selection.npy")
y_train = np.load("Dataset_Files/Training_Test_Sets/y_train_dd_psd.npy")

# Classification test split: feature matrix (feature-selected columns) and its binary labels.
X_test_classification = np.load("Dataset_Files/Training_Test_Sets/X_test_classification_dd_psd_feature_selection.npy")
y_test_classification = np.load("Dataset_Files/Training_Test_Sets/y_test_classification.npy")

In [77]:
# Sanity checks: dimensions of each split and the class balance of the test labels
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {len(y_train)}")

print(f"X_test_classification shape: {X_test_classification.shape}")
print(
    f"y_test_classification shape: {len(y_test_classification)} "
    f"(Binding Count: {np.count_nonzero(y_test_classification == 1)}, "
    f"Non-Binding Count: {np.count_nonzero(y_test_classification == 0)})"
)

X_train shape: (134734, 1044)
y_train shape: 134734
X_test_classification shape: (30141, 1044)
y_test_classification shape: 30141 (Binding Count: 22001, Non-Binding Count: 8140)


# Helper Functions

In [78]:
def calculate_metrics_classification(y_true, y_pred):
    """Print the F1, MCC, accuracy, recall and precision of binary predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        None. The scores are only written to standard output.
    """
    # Scores are computed in this order and reported in the order listed below.
    recall, precision, f1, accuracy, mcc = (
        scorer(y_true, y_pred)
        for scorer in (recall_score, precision_score, f1_score,
                       accuracy_score, matthews_corrcoef)
    )

    report = (
        ("F1 score", f1),
        ("Matthews Correlation Coefficient", mcc),
        ("Accuracy score", accuracy),
        ("Recall score", recall),
        ("Precision score", precision),
    )
    for label, value in report:
        print(f"{label}: {value}")

def prediction_category_classification(df):
    """Label one sample as 'Correct', 'False Positive' or 'False Negative'.

    Args:
        df: A mapping-like row (for example a DataFrame row) exposing the
            'True Class' and 'Prediction' entries.

    Returns:
        str: 'Correct' when both entries are equal, 'False Positive' when the
        true class is 0 and the prediction is 1, otherwise 'False Negative'.
    """
    actual = df['True Class']
    predicted = df['Prediction']

    if actual == predicted:
        return 'Correct'

    # Every remaining mismatch that is not (0 -> 1) is reported as a false negative.
    is_false_positive = (actual == 0) and (predicted == 1)
    return 'False Positive' if is_false_positive else 'False Negative'


def error_analysis_classification(y_pred):
    """Summarise and plot where the test-set predictions are right or wrong.

    Combines the module-level ``X_test_classification`` features (labelled
    with ``feature_selection_columns``) and ``y_test_classification`` labels
    with ``y_pred``, tags every sample via
    ``prediction_category_classification``, projects the standardised
    features onto two principal components, shows a plotly scatter plot and
    prints the number of correct predictions and of each error type.

    Args:
        y_pred: Predicted class labels for ``X_test_classification``, one per
            row and in the same order.

    Returns:
        pandas.DataFrame: The features plus the ``True Class``,
        ``Prediction``, ``Is the prediction correct?``, ``PCA_Dimension_1``
        and ``PCA_Dimension_2`` columns, sorted by
        ``Is the prediction correct?``.
    """
    features = pd.DataFrame(X_test_classification, columns=feature_selection_columns)
    true_class = pd.Series(y_test_classification, name="True Class")
    # Name the predictions explicitly rather than renaming the implicit
    # column label 0 afterwards (which silently fails for a named Series).
    prediction = pd.Series(y_pred, index=true_class.index, name="Prediction")

    # Combining data into one dataframe
    error_analysis_dataframe = pd.concat([features, true_class, prediction], axis=1)

    # Categorise from the two label columns only; applying the helper to the
    # whole frame would materialise every feature column for every row.
    error_analysis_dataframe["Is the prediction correct?"] = error_analysis_dataframe[
        ["True Class", "Prediction"]].apply(prediction_category_classification, axis=1)

    # Scaling
    scaled_data = StandardScaler().fit_transform(
        error_analysis_dataframe.loc[:, feature_selection_columns])

    # PCA
    pca = PCA(n_components=2, random_state=0)
    pca_data = pca.fit(scaled_data).transform(scaled_data)
    pca_dataframe_2d = pd.DataFrame(pca_data, columns=["PCA_Dimension_1", "PCA_Dimension_2"],
                                    index=true_class.index)

    # Joining dataframes
    error_analysis_dataframe = pd.concat([error_analysis_dataframe, pca_dataframe_2d], axis=1)

    # Plot (the hover_data columns must be present in the dataframe)
    fig = px.scatter(error_analysis_dataframe, x="PCA_Dimension_1", y="PCA_Dimension_2",
                     color="Is the prediction correct?",
                     symbol="Is the prediction correct?",
                     hover_data=['MW', 'TPSA', 'XLogP', 'NHD', 'NHA', 'NRB', 'True Class', 'Prediction'],
                     title="Correct Classifications vs Misclassifications")
    fig.show()

    # Useful stats: build each boolean mask once and count its True entries
    is_correct = error_analysis_dataframe['Is the prediction correct?'] == 'Correct'
    is_false_positive = (error_analysis_dataframe['True Class'] == 0) & (
        error_analysis_dataframe['Prediction'] == 1)
    is_false_negative = (error_analysis_dataframe['True Class'] == 1) & (
        error_analysis_dataframe['Prediction'] == 0)

    print(f"Number of correct classifications: {int(is_correct.sum())}")
    print(f"Number of misclassifications: {int((~is_correct).sum())}")
    print(f"False Positives (True class:0, Prediction:1): {int(is_false_positive.sum())}")
    print(f"False Negatives (True class:1, Prediction:0): {int(is_false_negative.sum())}")

    return error_analysis_dataframe.sort_values('Is the prediction correct?')

# def model_weights_classification(model, classification_group):
#     if classification_group == 'cd':
#         return eli5.show_weights(model,
#                                  feature_names=X_train_cd.columns,
#                                  target_names={1:"BBB+",0:"BBB-"})
#     elif classification_group == 'cd_se_i':
#         return eli5.show_weights(model,
#                                  feature_names=X_train_cd_se_i.loc[:,feature_selection_support].columns,
#                                  target_names={1:"BBB+",0:"BBB-"})
#     else:
#         raise ValueError("Invalid group. Please choose 'cd' or 'cd_se_i'")

# def get_lime_explainer_classification(classification_group):
#     if classification_group == 'cd':
#         X_train = X_train_cd
#         y_train = y_train_cd
#     elif classification_group == 'cd_se_i':
#         X_train = X_train_cd_se_i.loc[:,feature_selection_support]
#         y_train = y_train_cd_se_i
#     else:
#         raise ValueError("Invalid group. Please choose 'cd' or 'cd_se_i'")
#
#     explainer = LimeTabularExplainer(training_data=np.array(X_train),
#                                      mode='classification',
#                                      feature_names=list(X_train.columns),
#                                      training_labels=y_train,
#                                      class_names=['BBB-','BBB+'],
#                                      random_state=42)
#     return explainer

# Model Training & Testing

In [79]:
def on_step(optim_result):
    """Progress callback: report the current iteration and advance the counter.

    Reads and updates the module-level ``index``, which must be set before
    the search starts (the training cells use ``index = 1``).

    Args:
        optim_result: Value handed over by the caller of the callback; unused.
    """
    global index
    finished = index
    print(f"Iteration Completed: {finished}")
    index = finished + 1

## Dummy Classifier (DC)

In [80]:
# Baseline pipeline: standardise the features, then a DummyClassifier with a fixed seed.
dummy_classifier = Pipeline(
    [
        ('scale', StandardScaler()),
        ('model', DummyClassifier(random_state=42))
    ]
)
# Display every parameter of the pipeline and its steps (cell output).
dummy_classifier.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DummyClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__random_state': 42,
 'model__strategy': 'prior'}

### Training

In [81]:
# dummy_classifier.fit(X_train, y_train)
#
# y_train_pred = dummy_classifier.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(dummy_classifier, 'Dataset_Files/Classification_Models/dc.joblib')

In [82]:
# Load the baseline saved earlier (the fitting cell above is commented out)
dummy_classifier = load('Dataset_Files/Classification_Models/dc.joblib')
# Predict on the training set and print the training metrics
y_train_pred = dummy_classifier.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8459466288646107
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7330221028099811
Recall score: 1.0
Precision score: 0.7330221028099811


### Testing

In [83]:
# Load the saved baseline again so this cell can be run on its own
dummy_classifier = load('Dataset_Files/Classification_Models/dc.joblib')

# Predict on the classification test set and print the test metrics
y_test_pred = dummy_classifier.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.8438878447316942
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.729935967618858
Recall score: 1.0
Precision score: 0.729935967618858


## Logistic Regression (LR)

In [84]:
# Untuned logistic-regression pipeline: standardise the features, then fit the model.
# Its 'model__*' parameters are what the search space in the next cell refers to.
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('model', LogisticRegression(random_state=42))
    ]
)
# Display every parameter of the pipeline and its steps (cell output).
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LogisticRegression(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 100,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [85]:
# Bayesian hyper-parameter search over the logistic-regression pipeline,
# scored by F1 with 5-fold cross-validation.
# Three sub-spaces are searched, each pairing solvers with its own set of penalties:
#   1. newton-cg / lbfgs / sag -> 'none' or 'l2'
#   2. liblinear               -> 'l2' or 'l1'
#   3. saga                    -> 'none', 'l2', 'l1' or 'elasticnet' (plus l1_ratio)
# NOTE(review): newer scikit-learn releases expect None instead of the string
# 'none' for `penalty` - confirm against the installed version before re-running.
# NOTE(review): the pipeline saved from this search reports MCC 0.0 and recall 1.0
# on both the training and test sets (same numbers as the dummy baseline), so
# optimising F1 alone appears to have selected an all-positive predictor.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=[
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['newton-cg', 'lbfgs', 'sag']),
                           'model__penalty': Categorical(['none', 'l2']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['liblinear']),
                           'model__penalty': Categorical(['l2', 'l1']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__l1_ratio': Real(0, 1),
                           'model__solver': Categorical(['saga']),
                           'model__penalty': Categorical(['none', 'l2', 'l1', 'elasticnet']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                      ],
                      scoring='f1',
                      cv=5,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [86]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lr = model.best_estimator_
#
# y_train_pred = optimised_lr.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lr, 'Dataset_Files/Classification_Models/optimised_lr.joblib')
# np.save("Dataset_Files/Classification_Models/optimised_lr_cv_results", model.cv_results_)

In [87]:
# Load the tuned logistic-regression pipeline saved by the (commented-out) training cell
optimised_lr = load('Dataset_Files/Classification_Models/optimised_lr.joblib')

In [88]:
# Training-set predictions and metrics for the tuned logistic regression
y_train_pred = optimised_lr.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8459466288646107
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7330221028099811
Recall score: 1.0
Precision score: 0.7330221028099811


In [89]:
# Display the hyper-parameters of the loaded logistic-regression pipeline (cell output)
optimised_lr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LogisticRegression(C=7.917317677303884e-05, l1_ratio=0.6288012896974549,
                      max_iter=1418, penalty='elasticnet', random_state=42,
                      solver='saga'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(C=7.917317677303884e-05, l1_ratio=0.6288012896974549,
                    max_iter=1418, penalty='elasticnet', random_state=42,
                    solver='saga'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 7.917317677303884e-05,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': 0.6288012896974549,
 'model__max_iter': 1418,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'elasticnet',
 'model__random_state': 42,
 'model__solver': 'saga',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [90]:
# Read back the search results written with np.save(..., model.cv_results_) in the
# training cell; allow_pickle=True is needed to load them and .tolist() unwraps the
# stored object before it is turned into a DataFrame.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_lr_cv_results.npy", allow_pickle=True).tolist())
# Best-ranked configurations first
logistic_regression_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
logistic_regression_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__max_iter,param_model__penalty,param_model__solver,param_model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
112,6.799620,0.521389,0.196508,0.023222,0.000001,,1650,elasticnet,saga,1.0,"{'model__C': 1e-06, 'model__class_weight': Non...",0.845953,0.845953,0.845953,0.845928,0.845946,0.845947,0.000010,1
132,44.996725,22.659496,0.178698,0.028082,0.000116,,1581,l1,saga,0.0,"{'model__C': 0.00011592831105101897, 'model__c...",0.845953,0.845953,0.845953,0.845928,0.845946,0.845947,0.000010,1
104,29.651265,9.177187,0.128871,0.011190,0.000079,,1418,elasticnet,saga,0.628801,"{'model__C': 7.917317677303884e-05, 'model__cl...",0.845953,0.845953,0.845953,0.845928,0.845946,0.845947,0.000010,1
134,12.147855,0.475676,0.203419,0.031986,0.000043,,5000,l1,saga,0.541709,"{'model__C': 4.2813708334205274e-05, 'model__c...",0.845953,0.845953,0.845953,0.845928,0.845946,0.845947,0.000010,1
136,11.290133,1.609216,0.151347,0.001474,0.000047,,5000,elasticnet,saga,0.916342,"{'model__C': 4.729328825945751e-05, 'model__cl...",0.845953,0.845953,0.845953,0.845928,0.845946,0.845947,0.000010,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,7.092249,1.348663,0.223602,0.045031,0.000001,balanced,50,elasticnet,saga,0.473951,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.000000,0.000000,0.845953,0.000000,0.000000,0.169191,0.338381,146
52,8.937596,0.651621,0.254229,0.043727,0.000003,balanced,1140,l1,liblinear,,"{'model__C': 2.512509104389348e-06, 'model__cl...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,147
53,8.252943,0.957555,0.247983,0.054812,0.000013,balanced,1723,l1,liblinear,,"{'model__C': 1.293129412008041e-05, 'model__cl...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,147
64,8.779870,1.490101,0.262463,0.063719,0.000001,balanced,5000,l1,liblinear,,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,147


### Testing

In [91]:
# Test-set predictions and metrics for the tuned logistic regression
y_test_pred = optimised_lr.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.8438878447316942
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.729935967618858
Recall score: 1.0
Precision score: 0.729935967618858


## Linear Support Vector Classification (LSVC)

In [92]:
# Untuned linear SVC pipeline: standardise the features, then an L2-penalised LinearSVC.
# The penalty is fixed here; loss, C, class_weight and max_iter are tuned in the next cell.
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('model', LinearSVC(random_state=42, penalty='l2'))
    ]
)
# Display every parameter of the pipeline and its steps (cell output).
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVC(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'squared_hinge',
 'model__max_iter': 1000,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [93]:
# Bayesian hyper-parameter search over the LinearSVC pipeline (loss, C,
# class_weight and max_iter), scored by F1 with 5-fold cross-validation.
# NOTE(review): in the saved results the per-fold F1 of the best configuration
# ranges from about 0.70 to 0.83, so the folds produced by cv=5 look quite
# different from one another - consider passing an explicit shuffled splitter.
model = BayesSearchCV(estimator=pipe,
                      search_spaces={'model__loss': Categorical(['hinge', 'squared_hinge']),
                                     'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                                     'model__class_weight': Categorical([None, "balanced"]),
                                     'model__max_iter': Integer(500, 5000)},
                      scoring='f1',
                      cv=5,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [94]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lsvc = model.best_estimator_
#
# y_train_pred = optimised_lsvc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lsvc, 'Dataset_Files/Classification_Models/optimised_lsvc.joblib')
# np.save("Dataset_Files/Classification_Models/optimised_lsvc_cv_results.npy", model.cv_results_)

In [95]:
# Load the tuned LinearSVC pipeline saved by the (commented-out) training cell
optimised_lsvc = load('Dataset_Files/Classification_Models/optimised_lsvc.joblib')

In [96]:
# Training-set predictions and metrics for the tuned LinearSVC
y_train_pred = optimised_lsvc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8901331161591188
Matthews Correlation Coefficient: 0.5430409417411312
Accuracy score: 0.8317870767586504
Recall score: 0.9296092666281908
Precision score: 0.853873124819806


In [97]:
# Display the hyper-parameters of the loaded LinearSVC pipeline (cell output)
optimised_lsvc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LinearSVC(C=4.37617353977782e-05, loss='hinge', max_iter=5000, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(C=4.37617353977782e-05, loss='hinge', max_iter=5000, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 4.37617353977782e-05,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'hinge',
 'model__max_iter': 5000,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [98]:
# Read back the search results written with np.save(..., model.cv_results_) in the
# training cell; allow_pickle=True is needed to load them and .tolist() unwraps the
# stored object before it is turned into a DataFrame.
lsvc_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_lsvc_cv_results.npy", allow_pickle=True).tolist())
# Best-ranked configurations first
lsvc_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
lsvc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
47,6.90856,0.756084,0.183487,0.024159,4.4e-05,,hinge,5000,"{'model__C': 4.37617353977782e-05, 'model__cla...",0.704096,0.711332,0.834057,0.80601,0.833461,0.777791,0.058154,1
46,7.650003,1.278189,0.155842,0.011783,4.4e-05,,hinge,5000,"{'model__C': 4.39209291215761e-05, 'model__cla...",0.704083,0.711241,0.833989,0.80602,0.833371,0.777741,0.058148,2
20,6.818466,0.229713,0.194947,0.009321,4e-05,,hinge,500,"{'model__C': 4.0427363544850065e-05, 'model__c...",0.702974,0.709114,0.835492,0.806549,0.833623,0.77755,0.059308,3
33,7.679219,0.851501,0.154826,0.009959,5.6e-05,,hinge,5000,"{'model__C': 5.555864028680978e-05, 'model__cl...",0.704236,0.7173,0.829335,0.803114,0.833356,0.777468,0.055596,4
35,7.424501,0.568367,0.15926,0.01037,5.4e-05,,hinge,500,"{'model__C': 5.406787237844887e-05, 'model__cl...",0.704041,0.716445,0.830031,0.80327,0.833548,0.777467,0.056016,5
45,6.767816,0.593612,0.16423,0.01754,3.2e-05,,hinge,5000,"{'model__C': 3.240241537571675e-05, 'model__cl...",0.699418,0.703263,0.840303,0.808994,0.83348,0.777092,0.062733,6
44,7.176641,0.539984,0.165959,0.021393,3.2e-05,,hinge,5000,"{'model__C': 3.237600998194821e-05, 'model__cl...",0.69931,0.70319,0.840351,0.808994,0.833506,0.77707,0.062791,7
43,7.706792,1.547533,0.153423,0.008456,3.2e-05,,hinge,5000,"{'model__C': 3.2346484596066533e-05, 'model__c...",0.69933,0.703117,0.840351,0.809049,0.83348,0.777065,0.062804,8
42,7.03063,0.644369,0.166711,0.014084,3.2e-05,,hinge,5000,"{'model__C': 3.231171021097408e-05, 'model__cl...",0.69935,0.703044,0.840371,0.809049,0.83348,0.777059,0.062821,9
41,7.304444,0.80484,0.15681,0.01306,3.2e-05,,hinge,5000,"{'model__C': 3.22776815632846e-05, 'model__cla...",0.699312,0.702934,0.840343,0.809076,0.833521,0.777037,0.06286,10


### Testing

In [99]:
# Test-set predictions and metrics for the tuned LinearSVC
y_test_pred = optimised_lsvc.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.8850744968197264
Matthews Correlation Coefficient: 0.5286481499884806
Accuracy score: 0.8249560399455891
Recall score: 0.9234125721558111
Precision score: 0.8497929476722299


## K-Nearest Neighbors Classifier (KNNC)

In [100]:
# Untuned k-nearest-neighbours pipeline: standardise the features, then a
# KNeighborsClassifier with default settings (tuned in the next cell).
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('model', KNeighborsClassifier())
    ]
)
# Display every parameter of the pipeline and its steps (cell output).
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [101]:
# Bayesian hyper-parameter search over the KNN pipeline (n_neighbors, weights
# and algorithm), scored by F1 with 5-fold cross-validation.
# NOTE(review): in the saved results the 'auto' and 'kd_tree' rows have identical
# fold scores while mean_score_time differs (~100 s vs ~3750 s), so `algorithm`
# seems to affect only run time here and could be fixed instead of searched.
# NOTE(review): the selected n_neighbors (19) sits next to the upper bound of the
# range (20) - a wider range may be worth trying.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__n_neighbors': Integer(4, 20),
                       'model__weights': Categorical(['uniform', 'distance']),
                       'model__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
                       },
                      scoring='f1',
                      cv=5,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [102]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_knnc = model.best_estimator_
#
# y_train_pred = optimised_knnc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_knnc, 'Dataset_Files/Classification_Models/optimised_knnc.joblib')
# np.save("Dataset_Files/Classification_Models/optimised_knnc_cv_results.npy", model.cv_results_)

In [103]:
# Load the tuned KNN pipeline saved by the (commented-out) training cell
optimised_knnc = load('Dataset_Files/Classification_Models/optimised_knnc.joblib')

In [104]:
# Training-set predictions are cached on disk: reuse the saved array when it
# exists, otherwise predict once and save the result for later runs.
# NOTE(review): the cache is not tied to the model file - delete the .npy file
# after re-training, otherwise stale predictions are evaluated.
if os.path.exists("Dataset_Files/Classification_Models/optimised_knnc_y_train_pred.npy"):
    y_train_pred = np.load("Dataset_Files/Classification_Models/optimised_knnc_y_train_pred.npy")
else:
    y_train_pred = optimised_knnc.predict(X_train)
    np.save("Dataset_Files/Classification_Models/optimised_knnc_y_train_pred.npy", y_train_pred)

# Print the training metrics
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.9414310617749013
Matthews Correlation Coefficient: 0.7747306535746232
Accuracy score: 0.913221607018273
Recall score: 0.9514494294422
Precision score: 0.9316214742477569


In [105]:
# Display the hyper-parameters of the loaded KNN pipeline (cell output)
optimised_knnc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=19))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(algorithm='kd_tree', n_neighbors=19),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'kd_tree',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 19,
 'model__p': 2,
 'model__weights': 'uniform'}

In [106]:
# Read back the search results written with np.save(..., model.cv_results_) in the
# training cell; allow_pickle=True is needed to load them and .tolist() unwraps the
# stored object before it is turned into a DataFrame.
knnc_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_knnc_cv_results.npy", allow_pickle=True).tolist())
# Best-ranked configurations first
knnc_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
knnc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__algorithm,param_model__n_neighbors,param_model__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,1.825007,0.138253,102.614121,2.309688,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
31,1.844019,0.216043,98.452453,2.959144,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
33,14.51648,1.23451,3757.279202,56.727425,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
21,1.781852,0.093442,101.883838,3.232928,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
20,14.251192,1.246291,3743.388045,62.238347,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
35,1.799546,0.096984,102.92475,2.300177,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
39,14.346333,1.081688,3742.341087,62.568789,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
40,1.830748,0.055993,103.154682,2.61164,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
48,14.333448,1.225579,3749.428631,66.386132,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
42,1.818891,0.178948,104.319416,2.32652,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1


### Testing

In [107]:
# Test-set predictions are cached on disk: reuse the saved array when it
# exists, otherwise predict once and save the result for later runs.
# NOTE(review): the cache is not tied to the model file - delete the .npy file
# after re-training, otherwise stale predictions are evaluated.
if os.path.exists("Dataset_Files/Classification_Models/optimised_knnc_y_test_pred.npy"):
    y_test_pred = np.load("Dataset_Files/Classification_Models/optimised_knnc_y_test_pred.npy")
else:
    y_test_pred = optimised_knnc.predict(X_test_classification)
    np.save("Dataset_Files/Classification_Models/optimised_knnc_y_test_pred.npy", y_test_pred)

# Print the test metrics
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.9319247246708386
Matthews Correlation Coefficient: 0.7396520734806343
Accuracy score: 0.8993065923492917
Recall score: 0.944229807736012
Precision score: 0.9199362323974847


## Decision Tree Classifier (DTC)

In [108]:
# Untuned decision-tree pipeline: standardise the features, then a
# DecisionTreeClassifier with a fixed seed (tuned in the next cell).
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('model', DecisionTreeClassifier(random_state=42))
    ]
)
# Display every parameter of the pipeline and its steps (cell output).
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'best'}

In [109]:
# Bayesian hyper-parameter search over the decision-tree pipeline (criterion,
# splitter, max_features and class_weight), scored by F1 with 5-fold cross-validation.
# NOTE(review): the space is purely categorical with 2 * 2 * 3 * 2 = 24 distinct
# configurations, and the saved results list the same configuration many times -
# consider capping n_iter or using an exhaustive grid instead.
# NOTE(review): no depth or leaf-size limit is searched; the selected tree scores
# ~1.0 F1 on the training set against a mean cross-validated F1 of ~0.62.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__criterion': Categorical(['gini', 'entropy']),
                       'model__splitter': Categorical(['best', 'random']),
                       'model__max_features': Categorical([None, 'sqrt', 'log2']),
                       'model__class_weight': Categorical([None, 'balanced'])
                       },
                      scoring='f1',
                      cv=5,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [110]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_dtc = model.best_estimator_
#
# y_train_pred = optimised_dtc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_dtc, 'Dataset_Files/Classification_Models/optimised_dtc.joblib')
# np.save("Dataset_Files/Classification_Models/optimised_dtc_cv_results.npy", model.cv_results_)

In [111]:
# Load the tuned decision-tree pipeline saved by the (commented-out) training cell
optimised_dtc = load('Dataset_Files/Classification_Models/optimised_dtc.joblib')

In [112]:
# Training-set predictions and metrics for the tuned decision tree
y_train_pred = optimised_dtc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.9999949373497026
Matthews Correlation Coefficient: 0.9999810376365187
Accuracy score: 0.9999925779684415
Recall score: 0.9999898747506657
Precision score: 1.0


In [113]:
# Display the hyper-parameters of the loaded decision-tree pipeline (cell output)
optimised_dtc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   DecisionTreeClassifier(criterion='entropy', max_features='log2',
                          random_state=42, splitter='random'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(criterion='entropy', max_features='log2',
                        random_state=42, splitter='random'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'entropy',
 'model__max_depth': None,
 'model__max_features': 'log2',
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'random'}

In [114]:
# Read back the search results written with np.save(..., model.cv_results_) in the
# training cell; allow_pickle=True is needed to load them and .tolist() unwraps the
# stored object before it is turned into a DataFrame.
dtc_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_dtc_cv_results.npy", allow_pickle=True).tolist())
# Best-ranked configurations first
dtc_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
dtc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,1.962907,0.027542,0.125217,0.004432,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
29,1.970825,0.101165,0.108212,0.005262,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
32,2.004859,0.097391,0.109537,0.005384,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
27,1.951181,0.061655,0.112754,0.007601,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
33,1.962594,0.060569,0.11184,0.006574,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
34,1.991534,0.09933,0.107633,0.005263,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
48,1.968533,0.065223,0.107409,0.003843,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
23,1.952895,0.110128,0.107195,0.005618,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
35,1.959555,0.079155,0.106977,0.006517,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
36,1.980895,0.089248,0.10818,0.00284,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1


### Testing

In [115]:
# Predict on the classification test set and report the classification metrics
# for the optimised decision-tree pipeline.
y_test_pred = optimised_dtc.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.924736055099914
Matthews Correlation Coefficient: 0.7190178962152716
Accuracy score: 0.8897846786768853
Recall score: 0.9275942002636244
Precision score: 0.9218954691240909


## Random Forest Classifier (RFC)

In [5]:
# Two-step pipeline: standardise the features, then fit a random forest with a
# fixed seed. The trailing expression shows the tunable parameter names.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [6]:
# Bayesian hyper-parameter search over the random-forest pipeline, scored by F1
# with 5-fold cross-validation. Failed fits are scored as NaN.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__n_estimators=Integer(100, 800),
        model__criterion=Categorical(['gini', 'entropy', 'log_loss']),
        model__max_features=Categorical([None, 'sqrt', 'log2']),
        model__class_weight=Categorical([None, 'balanced', 'balanced_subsample']),
    ),
    scoring='f1',
    cv=5,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    error_score=np.nan,
    random_state=42,
)

### Training

In [None]:
# NOTE(review): this whole search/fit/save cell is commented out, so it does not
# run; the next cell loads an already saved model from disk instead.
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_rfc = model.best_estimator_
#
# y_train_pred = optimised_rfc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(optimised_rfc, 'Dataset_Files/Classification_Models/optimised_rfc.joblib')
# np.save("Dataset_Files/Classification_Models/optimised_rfc_cv_results.npy", model.cv_results_)

Fitting 10 folds for each of 135 candidates, totalling 1350 fits


In [None]:
# Load Model
# NOTE: joblib's load unpickles the file, so only load artefacts from a trusted source.
optimised_rfc = load('Dataset_Files/Classification_Models/optimised_rfc.joblib')

In [None]:
# Predict on the training set and report the classification metrics for the
# loaded random-forest pipeline.
y_train_pred = optimised_rfc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

In [None]:
# Display every parameter of the loaded random-forest pipeline.
optimised_rfc.get_params()

In [None]:
# Rebuild the search history from the saved cross-validation results and list
# the candidates by ascending rank (rank 1 first).
rfc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Classification_Models/optimised_rfc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")
rfc_grid_search_dataframe

### Testing

In [None]:
# Predict on the classification test set and report the classification metrics
# for the loaded random-forest pipeline.
y_test_pred = optimised_rfc.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)

## Stochastic Gradient Descent Classifier (SGDC)

In [8]:
# Two-step pipeline: standardise the features, then fit an SGD classifier with
# a fixed seed. The trailing expression shows the tunable parameter names.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', SGDClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'optimal',
 'model__loss': 'hinge',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__power_t': 0.5,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [None]:
# Bayesian hyper-parameter search over the SGD-classifier pipeline, scored by F1
# with 5-fold cross-validation. Failed fits are scored as NaN.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__loss': Categorical(
                          ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error',
                           'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']),
                       'model__penalty': Categorical(['l2', 'l1', 'elasticnet']),
                       'model__alpha': Real(1e-6, 1e-1, prior='log-uniform'),
                       'model__learning_rate': Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
                       # The 'constant', 'invscaling' and 'adaptive' schedules require eta0 > 0,
                       # but the pipeline's eta0 is 0.0 unless it is set. Searching eta0 as well
                       # keeps those candidates from failing to fit and being scored as NaN.
                       'model__eta0': Real(1e-4, 1e-1, prior='log-uniform'),
                       'model__class_weight': Categorical([None, 'balanced'])
                       },
                      scoring='f1',
                      cv=5,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [None]:
# Run the Bayesian search, keep the best pipeline, report training metrics and
# persist both the fitted pipeline and the search history.
# NOTE(review): `index` is reset before fitting; presumably the `on_step`
# callback (defined elsewhere) reads it as a step counter — confirm.
index = 1
model.fit(X_train, y_train, callback=on_step)

# Best pipeline found by the search.
optimised_sgdc = model.best_estimator_

y_train_pred = optimised_sgdc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

# Save Model
dump(optimised_sgdc, 'Dataset_Files/Classification_Models/optimised_sgdc.joblib')
# cv_results_ is stored as a pickled object array; it must be read back with allow_pickle=True.
np.save("Dataset_Files/Classification_Models/optimised_sgdc_cv_results.npy", model.cv_results_)

In [None]:
# Load Model
# NOTE: joblib's load unpickles the file, so only load artefacts from a trusted source.
optimised_sgdc = load('Dataset_Files/Classification_Models/optimised_sgdc.joblib')

In [None]:
# Predict on the training set and report the classification metrics for the
# loaded SGD-classifier pipeline.
y_train_pred = optimised_sgdc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

In [None]:
# Display every parameter of the loaded SGD-classifier pipeline.
optimised_sgdc.get_params()

In [None]:
# Rebuild the search history from the saved cross-validation results and list
# the candidates by ascending rank (rank 1 first).
sgdc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Classification_Models/optimised_sgdc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")
sgdc_grid_search_dataframe

### Testing

In [None]:
# Predict on the classification test set and report the classification metrics
# for the loaded SGD-classifier pipeline.
y_test_pred = optimised_sgdc.predict(X_test_classification)
calculate_metrics_classification(y_test_classification, y_test_pred)