# Imports

In [76]:
# General Imports
from models_utils import *

# Classification Models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# Training & Test Sets

In [77]:
# Column labels of the feature-selected training frame, from "MolecularWeight"
# through the last column (label slice, so the start column is included).
# NOTE(review): presumably these are exactly the model input features — confirm.
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [78]:
# Training features: drop the two identifier columns and convert to a NumPy array.
X_train = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Training labels as a NumPy array.
y_train = load_from_pickle("Training_Test_Sets/Classification/y_train").to_numpy()

In [79]:
# Test features: drop the two identifier columns and convert to a NumPy array.
X_test = (
    load_from_pickle("Training_Test_Sets/Classification/X_test_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Test labels as a NumPy array.
y_test = load_from_pickle("Training_Test_Sets/Classification/y_test").to_numpy()

In [80]:
# Useful Information & Sanity Checks
# For each split, report the feature-matrix shape and how many labels are
# 1 (printed as "Binding") versus 0 (printed as "Non-Binding").
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape[0]} "
    f"(Binding Count: {y_train[y_train == 1].shape[0]}, "
    f"Non-Binding Count: {y_train[y_train == 0].shape[0]})",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape[0]} "
    f"(Binding Count: {y_test[y_test == 1].shape[0]}, "
    f"Non-Binding Count: {y_test[y_test == 0].shape[0]})",
    sep="\n",
)

X_train shape: (99705, 388)
y_train shape: 99705 (Binding Count: 73498, Non-Binding Count: 26207)
X_test shape: (816, 388)
y_test shape: 816 (Binding Count: 563, Non-Binding Count: 253)


# Model Training & Testing

In [82]:
def on_step(optim_result):
    """Progress callback for the hyper-parameter searches: print the iteration number.

    The running count lives in the module-level ``index`` variable, so a caller
    can restart it by assigning ``index = 1`` before each ``fit``.  If ``index``
    has not been assigned yet (for example on a fresh kernel), counting starts
    at 1 instead of raising ``NameError``.

    Parameters
    ----------
    optim_result
        State object handed to the callback by the search; not used here.

    Returns
    -------
    None
        Nothing is returned, so this callback never asks the search to stop.
    """
    global index
    # setdefault seeds the counter on first use without disturbing a value
    # the caller has already set.
    current = globals().setdefault("index", 1)
    print(f"Iteration Completed: {current}")
    index = current + 1

## Dummy Classifier (DC)

In [37]:
# Baseline pipeline: the same scaling step as the other models, followed by a
# DummyClassifier (its default strategy is shown in the parameters displayed below).
dummy_classifier = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", DummyClassifier(random_state=42)),
])
dummy_classifier.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DummyClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__random_state': 42,
 'model__strategy': 'prior'}

### Training

In [38]:
# dummy_classifier.fit(X_train, y_train)
#
# y_train_pred = dummy_classifier.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(dummy_classifier, 'Dataset_Files/Baseline_Models/Classification/dc.joblib')

F1 score: 0.8486919972517797
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7371546060879595
Recall score: 1.0
Precision score: 0.7371546060879595


['Dataset_Files/Baseline_Models/Classification/dc.joblib']

In [None]:
# Load Model
# Restore the fitted baseline pipeline saved by the (now commented-out) training
# cell, replacing the unfitted `dummy_classifier` built earlier.
# NOTE(review): `load` comes from the `models_utils` star import — presumably
# joblib.load, which unpickles; only load files from a trusted source.
dummy_classifier = load('Dataset_Files/Baseline_Models/Classification/dc.joblib')

In [41]:
# Score the baseline on the training set; the helper prints F1, MCC, accuracy,
# recall and precision (see the recorded output).
y_train_pred = dummy_classifier.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8486919972517797
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7371546060879595
Recall score: 1.0
Precision score: 0.7371546060879595


### Testing

In [43]:
# Score the baseline on the held-out test set with the same metrics helper.
y_test_pred = dummy_classifier.predict(X_test)
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.8165337200870195
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.6899509803921569
Recall score: 1.0
Precision score: 0.6899509803921569


## Logistic Regression (LR)

In [44]:
# Standardise the features, then fit a logistic regression. The step names
# ("scale", "model") are what the "model__*" search-space keys refer to.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", LogisticRegression(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LogisticRegression(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 100,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [45]:
# Bayesian hyper-parameter search over the logistic-regression pipeline.
# The space is split into three dictionaries, one per solver family, so each
# solver is only paired with the penalties listed next to it.
# NOTE(review): no `n_iter` is passed, so the skopt default applies — presumably
# to each sub-space separately (the saved cv_results has row indices up to 148); confirm.
# NOTE(review): the string 'none' for `penalty` is deprecated in newer
# scikit-learn releases in favour of None — confirm against the installed
# version before re-running the search.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=[
                          # newton-cg / lbfgs / sag: unpenalised or L2
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['newton-cg', 'lbfgs', 'sag']),
                           'model__penalty': Categorical(['none', 'l2']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          # liblinear: L2 or L1
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['liblinear']),
                           'model__penalty': Categorical(['l2', 'l1']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          # saga: every penalty; l1_ratio is sampled for all of them
                          # NOTE(review): l1_ratio presumably only matters for 'elasticnet' — confirm
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__l1_ratio': Real(0, 1),
                           'model__solver': Categorical(['saga']),
                           'model__penalty': Categorical(['none', 'l2', 'l1', 'elasticnet']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                      ],
                      # Candidates are ranked by F1 over 5-fold cross-validation.
                      scoring='f1',
                      cv=5,
                      # Failed fits are scored as NaN.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [46]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lr = model.best_estimator_
#
# y_train_pred = optimised_lr.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lr, 'Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results", model.cv_results_)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33
Iteration Completed: 34
Iteration Completed: 35
Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
I

In [47]:
# Load Model
# Restore the best logistic-regression pipeline saved by the (now commented-out)
# training cell, so the search does not have to be repeated.
# NOTE(review): `load` presumably unpickles (joblib); only load trusted files.
optimised_lr = load('Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')

In [48]:
# Score the reloaded logistic-regression pipeline on the training set.
y_train_pred = optimised_lr.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8822358119123873
Matthews Correlation Coefficient: 0.48789256660015523
Accuracy score: 0.8167092924126172
Recall score: 0.9313722822389725
Precision score: 0.8380241170349514


In [50]:
# Display the hyper-parameters of the reloaded best logistic-regression pipeline.
optimised_lr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LogisticRegression(C=0.08691067180260512, max_iter=4439, random_state=42,
                      solver='liblinear'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(C=0.08691067180260512, max_iter=4439, random_state=42,
                    solver='liblinear'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.08691067180260512,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 4439,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'liblinear',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [51]:
# Saved cross-validation results of the logistic-regression search, best rank first.
# (Despite the variable name, the results come from BayesSearchCV.)
# allow_pickle=True unpickles the stored object: only load files you produced yourself.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
logistic_regression_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__max_iter,param_model__penalty,param_model__solver,param_model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
66,55.793100,2.523845,0.132408,0.043119,0.086911,,4439,l2,liblinear,,"{'model__C': 0.08691067180260512, 'model__clas...",0.881027,0.879426,0.882736,0.881267,0.880858,0.881063,0.001055,1
26,2815.406966,2.884060,0.092576,0.014220,0.172047,,5000,l2,sag,,"{'model__C': 0.17204723798386953, 'model__clas...",0.881012,0.879418,0.882679,0.881265,0.880830,0.881041,0.001040,2
110,99.473659,44.479059,0.077112,0.012028,0.458579,,3710,l1,saga,0.709831,"{'model__C': 0.4585792720221234, 'model__class...",0.880919,0.879570,0.882601,0.881239,0.880801,0.881026,0.000970,3
70,249.368894,19.934678,0.112724,0.023588,0.276555,,50,l1,liblinear,,"{'model__C': 0.2765547591820871, 'model__class...",0.880999,0.879606,0.882495,0.881154,0.880868,0.881024,0.000918,4
148,27.944436,0.060402,0.090997,0.005039,2.752098,,50,l2,saga,0.0,"{'model__C': 2.7520978985242417, 'model__class...",0.880781,0.879539,0.882721,0.881100,0.880907,0.881010,0.001016,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,5.015475,0.039212,0.204767,0.003667,0.000002,,1273,l2,liblinear,,"{'model__C': 2.195551082864074e-06, 'model__cl...",0.738634,0.740038,0.748189,0.743261,0.738875,0.741799,0.003595,146
67,3.546709,0.147501,0.118602,0.020674,0.000001,balanced,1764,l2,liblinear,,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.693792,0.693219,0.703228,0.698660,0.692990,0.696378,0.004007,147
101,2.940660,0.183646,0.072081,0.006645,0.000016,balanced,4596,l1,saga,0.473675,"{'model__C': 1.6285217534593228e-05, 'model__c...",0.000000,0.848705,0.848705,0.848672,0.000000,0.509217,0.415774,148
86,2.425969,0.030663,0.182416,0.004853,0.000009,balanced,4473,l1,liblinear,,"{'model__C': 9.132486222789627e-06, 'model__cl...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,149


### Testing

In [52]:
# Score the reloaded logistic-regression pipeline on the held-out test set.
y_test_pred = optimised_lr.predict(X_test)
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.8147527242246437
Matthews Correlation Coefficient: 0.3242109535725276
Accuracy score: 0.7291666666666666
Recall score: 0.8632326820603907
Precision score: 0.7714285714285715


## Linear Support Vector Classification (LSVC)

In [53]:
# Standardise the features, then fit a linear SVM with an explicit L2 penalty.
# This rebinds `pipe`, which previously held the logistic-regression pipeline.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", LinearSVC(random_state=42, penalty="l2")),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVC(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'squared_hinge',
 'model__max_iter': 1000,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [54]:
# Bayesian hyper-parameter search over the LinearSVC pipeline: loss function,
# regularisation strength C (log-uniform), class weighting and iteration cap.
# The penalty itself is not searched; it stays at the 'l2' set on the pipeline.
# NOTE(review): no `n_iter` is passed, so the skopt default number of
# iterations applies — confirm this budget is intended.
model = BayesSearchCV(estimator=pipe,
                      search_spaces={'model__loss': Categorical(['hinge', 'squared_hinge']),
                                     'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                                     'model__class_weight': Categorical([None, "balanced"]),
                                     'model__max_iter': Integer(500, 5000)},
                      # Candidates are ranked by F1 over 5-fold cross-validation.
                      scoring='f1',
                      cv=5,
                      # Failed fits are scored as NaN.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [55]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lsvc = model.best_estimator_
#
# y_train_pred = optimised_lsvc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lsvc, 'Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy", model.cv_results_)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33
Iteration Completed: 34
Iteration Completed: 35
Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
I



F1 score: 0.8838442525599461
Matthews Correlation Coefficient: 0.49138954626923115
Accuracy score: 0.8185346772980292
Recall score: 0.9365697025769408
Precision score: 0.836738911106519


In [56]:
# Load Model
# Restore the best LinearSVC pipeline saved by the (now commented-out) training cell.
# NOTE(review): `load` presumably unpickles (joblib); only load trusted files.
optimised_lsvc = load('Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')

In [57]:
# Score the reloaded LinearSVC pipeline on the training set.
y_train_pred = optimised_lsvc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8838442525599461
Matthews Correlation Coefficient: 0.49138954626923115
Accuracy score: 0.8185346772980292
Recall score: 0.9365697025769408
Precision score: 0.836738911106519


In [58]:
# Display the hyper-parameters of the reloaded best LinearSVC pipeline.
optimised_lsvc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LinearSVC(C=0.1380447014995764, loss='hinge', max_iter=3709, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(C=0.1380447014995764, loss='hinge', max_iter=3709, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.1380447014995764,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'hinge',
 'model__max_iter': 3709,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [59]:
# Saved cross-validation results of the LinearSVC search, best rank first.
# (Despite the variable name, the results come from BayesSearchCV.)
# allow_pickle=True unpickles the stored object: only load files you produced yourself.
lsvc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
lsvc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
36,118.052291,1.109156,0.267848,0.035558,0.138045,,hinge,3709,"{'model__C': 0.1380447014995764, 'model__class...",0.881801,0.882608,0.88287,0.882889,0.88374,0.882782,0.000621,1
27,68.849699,0.87966,0.278061,0.048183,0.07581,,hinge,2099,"{'model__C': 0.07580988177176091, 'model__clas...",0.882007,0.882444,0.882729,0.882889,0.883684,0.88275,0.000554,2
47,26.869208,0.263102,0.110933,0.018699,0.045751,,hinge,3711,"{'model__C': 0.045751052954376946, 'model__cla...",0.882002,0.882623,0.88273,0.88279,0.883463,0.882722,0.000465,3
35,141.892981,1.392536,0.277655,0.025748,0.172145,,hinge,4946,"{'model__C': 0.1721453720296977, 'model__class...",0.881786,0.882472,0.882821,0.882866,0.883576,0.882704,0.000583,4
37,42.51237,0.342727,0.353301,0.097358,0.052407,,hinge,500,"{'model__C': 0.052407004122209624, 'model__cla...",0.88179,0.882557,0.882779,0.882911,0.883441,0.882696,0.000538,5
44,169.83109,1.389224,0.118468,0.02009,0.538708,,hinge,5000,"{'model__C': 0.5387084648932836, 'model__class...",0.881716,0.882493,0.882793,0.882941,0.883478,0.882684,0.00058,6
11,296.117881,2.094355,0.175879,0.038099,0.743033,,hinge,4770,"{'model__C': 0.7430332959877312, 'model__class...",0.881729,0.882506,0.882689,0.882601,0.883685,0.882642,0.000623,7
14,28.360347,0.292943,0.18714,0.035945,0.032194,,hinge,5000,"{'model__C': 0.03219377091560028, 'model__clas...",0.881711,0.882211,0.882432,0.88282,0.883478,0.882531,0.000594,8
28,25.389982,0.158025,0.332762,0.015867,0.014692,,hinge,2867,"{'model__C': 0.01469227089101338, 'model__clas...",0.881939,0.882255,0.8821,0.882593,0.883252,0.882428,0.000465,9
15,13.52466,0.224726,0.160166,0.035083,0.013968,,hinge,513,"{'model__C': 0.013967868241974836, 'model__cla...",0.881718,0.882234,0.882013,0.88268,0.88321,0.882371,0.000524,10


### Testing

In [60]:
# Score the reloaded LinearSVC pipeline on the held-out test set.
y_test_pred = optimised_lsvc.predict(X_test)
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.82
Matthews Correlation Coefficient: 0.33616222528740924
Accuracy score: 0.7352941176470589
Recall score: 0.8738898756660746
Precision score: 0.7723704866562009


## K-Nearest Neighbors Classifier (KNNC)

In [61]:
# Standardise the features, then classify with k-nearest neighbours.
# This rebinds `pipe`, which previously held the LinearSVC pipeline.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", KNeighborsClassifier()),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [62]:
# Bayesian hyper-parameter search over the KNN pipeline: neighbour count,
# vote weighting and neighbour-search algorithm.
# NOTE(review): in the saved cv_results, rows that differ only in `algorithm`
# ('auto' vs 'kd_tree') have identical fold scores, while kd_tree scoring took
# roughly 3750 s per fold against roughly 100 s for 'auto'. Searching over
# `algorithm` therefore looks like it spends budget without changing the score —
# consider fixing it before re-running the search.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__n_neighbors': Integer(4, 20),
                       'model__weights': Categorical(['uniform', 'distance']),
                       'model__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
                       },
                      # Candidates are ranked by F1 over 5-fold cross-validation.
                      scoring='f1',
                      cv=5,
                      # Failed fits are scored as NaN.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [63]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_knnc = model.best_estimator_
#
# y_train_pred = optimised_knnc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_knnc, 'Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy", model.cv_results_)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20




Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33




Iteration Completed: 34
Iteration Completed: 35




Iteration Completed: 36




Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
Iteration Completed: 43




Iteration Completed: 44




Iteration Completed: 45




Iteration Completed: 46
Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
F1 score: 0.9999795908648712
Matthews Correlation Coefficient: 0.9999223605385762
Accuracy score: 0.9999699112381526
Recall score: 0.9999591825627908
Precision score: 1.0


In [64]:
# Load Model
# Restore the best KNN pipeline saved by the (now commented-out) training cell.
# NOTE(review): `load` presumably unpickles (joblib); only load trusted files.
optimised_knnc = load('Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')

In [65]:
# KNN prediction on the training set is slow (the saved CV results record about
# an hour of scoring per fold for kd_tree), so the predictions are cached on disk.
# The cache is only reused when it cannot be stale: it must be at least as new
# as the saved model file and hold one prediction per training row. Otherwise
# the predictions are recomputed and the cache is overwritten.
knnc_model_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib"
knnc_train_pred_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc_y_train_pred.npy"

y_train_pred = None
if os.path.exists(knnc_train_pred_path):
    # With no model file to compare against, fall back to trusting the cache.
    cache_is_current = (
        not os.path.exists(knnc_model_path)
        or os.path.getmtime(knnc_train_pred_path) >= os.path.getmtime(knnc_model_path)
    )
    if cache_is_current:
        cached_train_pred = np.load(knnc_train_pred_path)
        if cached_train_pred.shape[0] == X_train.shape[0]:
            y_train_pred = cached_train_pred

if y_train_pred is None:
    y_train_pred = optimised_knnc.predict(X_train)
    np.save(knnc_train_pred_path, y_train_pred)

calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.9999795908648712
Matthews Correlation Coefficient: 0.9999223605385762
Accuracy score: 0.9999699112381526
Recall score: 0.9999591825627908
Precision score: 1.0


In [105]:
# Display the hyper-parameters of the reloaded best KNN pipeline.
optimised_knnc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=19))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(algorithm='kd_tree', n_neighbors=19),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'kd_tree',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 19,
 'model__p': 2,
 'model__weights': 'uniform'}

In [106]:
# Saved cross-validation results of the KNN search, best rank first.
# (Despite the variable name, the results come from BayesSearchCV.)
# allow_pickle=True unpickles the stored object: only load files you produced yourself.
knnc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
knnc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__algorithm,param_model__n_neighbors,param_model__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,1.825007,0.138253,102.614121,2.309688,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
31,1.844019,0.216043,98.452453,2.959144,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
33,14.51648,1.23451,3757.279202,56.727425,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
21,1.781852,0.093442,101.883838,3.232928,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
20,14.251192,1.246291,3743.388045,62.238347,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
35,1.799546,0.096984,102.92475,2.300177,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
39,14.346333,1.081688,3742.341087,62.568789,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
40,1.830748,0.055993,103.154682,2.61164,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
48,14.333448,1.225579,3749.428631,66.386132,kd_tree,19,uniform,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1
42,1.818891,0.178948,104.319416,2.32652,auto,19,uniform,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.616333,0.509407,0.618646,0.677035,0.792526,0.642789,0.092377,1


### Testing

In [66]:
# Test-set KNN predictions are cached on disk like the training ones.
# The cache is only reused when it cannot be stale: it must be at least as new
# as the saved model file and hold one prediction per test row. Otherwise the
# predictions are recomputed and the cache is overwritten.
knnc_model_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib"
knnc_test_pred_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc_y_test_pred.npy"

y_test_pred = None
if os.path.exists(knnc_test_pred_path):
    # With no model file to compare against, fall back to trusting the cache.
    cache_is_current = (
        not os.path.exists(knnc_model_path)
        or os.path.getmtime(knnc_test_pred_path) >= os.path.getmtime(knnc_model_path)
    )
    if cache_is_current:
        cached_test_pred = np.load(knnc_test_pred_path)
        if cached_test_pred.shape[0] == X_test.shape[0]:
            y_test_pred = cached_test_pred

if y_test_pred is None:
    y_test_pred = optimised_knnc.predict(X_test)
    np.save(knnc_test_pred_path, y_test_pred)

calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.8232189973614776
Matthews Correlation Coefficient: 0.4174960141149641
Accuracy score: 0.7536764705882353
Recall score: 0.8312611012433393
Precision score: 0.8153310104529616


## Decision Tree Classifier (DTC)

In [67]:
# Standardise the features, then fit a single decision tree.
# This rebinds `pipe`, which previously held the KNN pipeline.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", DecisionTreeClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'best'}

In [68]:
# Bayesian hyper-parameter search over the decision-tree pipeline: split
# criterion, splitter, feature sub-sampling and class weighting.
# NOTE(review): this space has only 2 * 2 * 3 * 2 = 24 distinct combinations,
# yet the recorded run performed 50 iterations and the saved cv_results show
# the same combination evaluated repeatedly. Consider capping `n_iter` or
# using an exhaustive search.
# NOTE(review): no depth or leaf-size limit is searched, so trees grow with the
# defaults shown in get_params (max_depth=None, min_samples_leaf=1).
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__criterion': Categorical(['gini', 'entropy']),
                       'model__splitter': Categorical(['best', 'random']),
                       'model__max_features': Categorical([None, 'sqrt', 'log2']),
                       'model__class_weight': Categorical([None, 'balanced'])
                       },
                      # Candidates are ranked by F1 over 5-fold cross-validation.
                      scoring='f1',
                      cv=5,
                      # Failed fits are scored as NaN.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [69]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_dtc = model.best_estimator_
#
# y_train_pred = optimised_dtc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_dtc, 'Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy", model.cv_results_)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16




Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19




Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23




Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27




Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31




Iteration Completed: 32




Iteration Completed: 33




Iteration Completed: 34




Iteration Completed: 35




Iteration Completed: 36




Iteration Completed: 37




Iteration Completed: 38




Iteration Completed: 39




Iteration Completed: 40




Iteration Completed: 41




Iteration Completed: 42




Iteration Completed: 43




Iteration Completed: 44




Iteration Completed: 45




Iteration Completed: 46




Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
F1 score: 0.9999795908648712
Matthews Correlation Coefficient: 0.9999223605385762
Accuracy score: 0.9999699112381526
Recall score: 0.9999591825627908
Precision score: 1.0


In [111]:
# Load Model
# Restore the best decision-tree pipeline saved by the (now commented-out) training cell.
# NOTE(review): `load` presumably unpickles (joblib); only load trusted files.
optimised_dtc = load('Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')

In [112]:
# Score the reloaded decision-tree pipeline on the training set.
# NOTE(review): the recorded training F1 is ~1.0 while the best cross-validated
# F1 in the saved cv_results is ~0.62, which looks like the tree memorising
# the training data — confirm before relying on this model.
y_train_pred = optimised_dtc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.9999949373497026
Matthews Correlation Coefficient: 0.9999810376365187
Accuracy score: 0.9999925779684415
Recall score: 0.9999898747506657
Precision score: 1.0


In [113]:
# Display the hyper-parameters of the reloaded best decision-tree pipeline.
optimised_dtc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   DecisionTreeClassifier(criterion='entropy', max_features='log2',
                          random_state=42, splitter='random'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(criterion='entropy', max_features='log2',
                        random_state=42, splitter='random'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'entropy',
 'model__max_depth': None,
 'model__max_features': 'log2',
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'random'}

In [114]:
# Saved cross-validation results of the decision-tree search, best rank first.
# (Despite the variable name, the results come from BayesSearchCV.)
# allow_pickle=True unpickles the stored object: only load files you produced yourself.
dtc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
dtc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,1.962907,0.027542,0.125217,0.004432,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
29,1.970825,0.101165,0.108212,0.005262,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
32,2.004859,0.097391,0.109537,0.005384,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
27,1.951181,0.061655,0.112754,0.007601,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
33,1.962594,0.060569,0.11184,0.006574,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
34,1.991534,0.09933,0.107633,0.005263,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
48,1.968533,0.065223,0.107409,0.003843,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
23,1.952895,0.110128,0.107195,0.005618,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
35,1.959555,0.079155,0.106977,0.006517,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1
36,1.980895,0.089248,0.10818,0.00284,,entropy,log2,random,"{'model__class_weight': None, 'model__criterio...",0.570013,0.615882,0.531489,0.615077,0.750259,0.616544,0.073867,1


### Testing

In [115]:
# Score the tuned decision tree on the test split.
# NOTE(review): the stored output under this cell cannot come from the 816-row test
# set reported at the top of the notebook (accuracy * 816 is not an integer), so it
# looks stale - re-run this cell to refresh it.
y_test_pred = optimised_dtc.predict(X_test)
# Prints F1, MCC, accuracy, recall and precision for the predictions.
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.924736055099914
Matthews Correlation Coefficient: 0.7190178962152716
Accuracy score: 0.8897846786768853
Recall score: 0.9275942002636244
Precision score: 0.9218954691240909


## Random Forest Classifier (RFC)

In [70]:
# Random-forest pipeline: a standardisation step followed by the classifier.
# The step names ("scale", "model") are the prefixes used for the search-space keys.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42)),
])
# Show every tunable parameter exposed by the pipeline.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [71]:
# Bayesian hyper-parameter search over the random-forest pipeline.
# Candidates are scored by F1 with 5-fold cross-validation; failed fits score NaN.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same split
# criterion for RandomForestClassifier, so the two options look redundant - confirm.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces={
        "model__n_estimators": Integer(100, 800),
        "model__criterion": Categorical(["gini", "entropy", "log_loss"]),
        "model__max_features": Categorical([None, "sqrt", "log2"]),
        "model__class_weight": Categorical([None, "balanced", "balanced_subsample"]),
    },
    scoring="f1",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [72]:
# NOTE: the random-forest search below is commented out, so this cell does nothing
# when run; the output stored under it comes from an earlier execution.
# Later cells reload the artefacts it saved (optimised_rfc.joblib and
# optimised_rfc_cv_results.npy). Uncomment to re-run the search and overwrite them.
#
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_rfc = model.best_estimator_
#
# y_train_pred = optimised_rfc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(optimised_rfc, 'Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy", model.cv_results_)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19




Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23




Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33




Iteration Completed: 34




Iteration Completed: 35
Iteration Completed: 36




Iteration Completed: 37




Iteration Completed: 38
Iteration Completed: 39




Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
Iteration Completed: 43
Iteration Completed: 44




Iteration Completed: 45




Iteration Completed: 46
Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
F1 score: 0.999979591420233
Matthews Correlation Coefficient: 0.9999223540469564
Accuracy score: 0.9999699112381526
Recall score: 0.9999863941875969
Precision score: 0.9999727887454251


In [83]:
# Load Model
# Reload the tuned random-forest pipeline from the path the training cell saves to.
optimised_rfc = load('Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib')

In [84]:
# Score the reloaded random forest on the data it was trained on.
y_train_pred = optimised_rfc.predict(X_train)
# Prints F1, MCC, accuracy, recall and precision for the predictions.
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.999979591420233
Matthews Correlation Coefficient: 0.9999223540469564
Accuracy score: 0.9999699112381526
Recall score: 0.9999863941875969
Precision score: 0.9999727887454251


In [None]:
# Display the parameters of the reloaded random-forest pipeline, including the tuned values.
optimised_rfc.get_params()

In [None]:
# Rebuild the random-forest search results table from the saved cv_results_ file
# and order it so the best-ranked candidates come first.
# allow_pickle=True + .tolist() unwraps the dict that np.save stored as an object array.
# Despite the variable name, these results come from the BayesSearchCV run above.
rfc_grid_search_dataframe = (
    pd.DataFrame(
        np.load(
            "Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy",
            allow_pickle=True,
        ).tolist()
    )
    .sort_values(by=["rank_test_score"])
)
rfc_grid_search_dataframe

### Testing

In [87]:
# Score the tuned random forest on the test split.
# NOTE(review): the stored outputs show near-perfect training scores (F1 ~ 1.00) but
# F1 ~ 0.84 / MCC ~ 0.35 here, which suggests the model overfits - worth a remark.
y_test_pred = optimised_rfc.predict(X_test)
# Prints F1, MCC, accuracy, recall and precision for the predictions.
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.8366533864541833
Matthews Correlation Coefficient: 0.3509927517677628
Accuracy score: 0.7487745098039216
Recall score: 0.9325044404973357
Precision score: 0.7586705202312138


## Stochastic Gradient Descent Classifier (SGDC)

In [88]:
# SGD pipeline: a standardisation step followed by the linear classifier.
# The step names ("scale", "model") are the prefixes used for the search-space keys.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", SGDClassifier(random_state=42)),
])
# Show every tunable parameter exposed by the pipeline.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'optimal',
 'model__loss': 'hinge',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__power_t': 0.5,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [92]:
# Bayesian hyper-parameter search over the SGD pipeline.
# Candidates are scored by F1 with 5-fold cross-validation; failed fits score NaN.
# alpha and eta0 are sampled on a log scale between 1e-6 and 1e-1.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces={
        "model__loss": Categorical([
            "hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron",
            "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive",
        ]),
        "model__penalty": Categorical(["l2", "l1", "elasticnet"]),
        "model__alpha": Real(1e-6, 1e-1, prior="log-uniform"),
        "model__learning_rate": Categorical(["constant", "optimal", "invscaling", "adaptive"]),
        "model__eta0": Real(1e-6, 1e-1, prior="log-uniform"),
        "model__class_weight": Categorical([None, "balanced"]),
    },
    scoring="f1",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [93]:
# Train (or reuse) the tuned SGD classifier.
#
# The Bayesian search is expensive, so it only runs when its saved artefacts are
# missing; otherwise the saved pipeline is reloaded. This keeps
# "Restart Kernel -> Run All" cheap without commenting the cell out by hand.
# Delete the two files below to force a fresh search.
from pathlib import Path  # local import so this cell stays self-contained

sgdc_model_path = 'Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib'
sgdc_cv_results_path = "Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy"

# Progress counter that the on_step callback prints and increments.
index = 1

if Path(sgdc_model_path).is_file() and Path(sgdc_cv_results_path).is_file():
    # Both artefacts exist (the later cells read each of them): skip the search.
    print("Saved SGDC artefacts found - skipping the Bayesian search.")
    optimised_sgdc = load(sgdc_model_path)
else:
    model.fit(X_train, y_train, callback=on_step)
    optimised_sgdc = model.best_estimator_

    # Save Model - persisted before scoring so a failure below cannot lose the fit.
    dump(optimised_sgdc, sgdc_model_path)
    # cv_results_ is a dict; np.save stores it as a pickled object array.
    np.save(sgdc_cv_results_path, model.cv_results_)

# Report the metrics of the tuned pipeline on the training data.
y_train_pred = optimised_sgdc.predict(X_train)
calculate_metrics_classification(y_train, y_train_pred)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33
Iteration Completed: 34
Iteration Completed: 35
Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
I

In [94]:
# Load Model
# Reload the tuned SGD pipeline from the path the training cell saves to.
optimised_sgdc = load('Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib')

In [95]:
# Score the reloaded SGD classifier on the data it was trained on.
y_train_pred = optimised_sgdc.predict(X_train)
# Prints F1, MCC, accuracy, recall and precision for the predictions.
calculate_metrics_classification(y_train, y_train_pred)

F1 score: 0.8824109461906819
Matthews Correlation Coefficient: 0.4888642293368735
Accuracy score: 0.8170101800310917
Recall score: 0.9314130996761817
Precision score: 0.8383071478429116


In [96]:
# Display the parameters of the reloaded SGD pipeline, including the tuned values.
optimised_sgdc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
                 loss='log_loss', penalty='elasticnet', random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
               loss='log_loss', penalty='elasticnet', random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 1e-06,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0009866506104658564,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'adaptive',
 'model__loss': 'log_loss',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'elasticnet',
 'model__power_t': 0.5,
 'model__random_state': 42,
 'model__shuffle': Tru

In [None]:
# Rebuild the SGD search results table from the saved cv_results_ file and order
# it so the best-ranked candidates come first.
# allow_pickle=True + .tolist() unwraps the dict that np.save stored as an object array.
# Despite the variable name, these results come from the BayesSearchCV run above.
sgdc_grid_search_dataframe = (
    pd.DataFrame(
        np.load(
            "Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy",
            allow_pickle=True,
        ).tolist()
    )
    .sort_values(by=["rank_test_score"])
)
sgdc_grid_search_dataframe

### Testing

In [97]:
# Score the tuned SGD classifier on the test split.
y_test_pred = optimised_sgdc.predict(X_test)
# Prints F1, MCC, accuracy, recall and precision for the predictions.
calculate_metrics_classification(y_test, y_test_pred)

F1 score: 0.8147527242246437
Matthews Correlation Coefficient: 0.3242109535725276
Accuracy score: 0.7291666666666666
Recall score: 0.8632326820603907
Precision score: 0.7714285714285715
