# Imports

In [1]:
# General Imports
from models_utils import *

# Classification Models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# Training & Test Sets

In [2]:
# Column labels of the selected features: every column from "MolecularWeight"
# through the last one (label-based slice, so "MolecularWeight" is included).
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [3]:
# Training features: drop the two identifier columns, then convert to a NumPy array.
X_train = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Training labels as a NumPy array.
y_train = load_from_pickle("Training_Test_Sets/Classification/y_train").to_numpy()

In [4]:
# Test features: drop the two identifier columns, then convert to a NumPy array.
X_test = (
    load_from_pickle("Training_Test_Sets/Classification/X_test_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Test labels as a NumPy array.
y_test = load_from_pickle("Training_Test_Sets/Classification/y_test").to_numpy()

In [5]:
# Useful Information & Sanity Checks
# For each split: the feature-matrix shape, the number of labels, and how many
# labels equal 1 ("Binding") and 0 ("Non-Binding").
print(f"X_train shape: {X_train.shape}")
print(
    f"y_train shape: {y_train.shape[0]} "
    f"(Binding Count: {y_train[y_train == 1].shape[0]}, "
    f"Non-Binding Count: {y_train[y_train == 0].shape[0]})"
)

print(f"X_test shape: {X_test.shape}")
print(
    f"y_test shape: {y_test.shape[0]} "
    f"(Binding Count: {y_test[y_test == 1].shape[0]}, "
    f"Non-Binding Count: {y_test[y_test == 0].shape[0]})"
)

X_train shape: (99705, 388)
y_train shape: 99705 (Binding Count: 73498, Non-Binding Count: 26207)
X_test shape: (816, 388)
y_test shape: 816 (Binding Count: 563, Non-Binding Count: 253)


# Model Training & Testing

In [6]:
def on_step(optim_result):
    """Progress callback for the hyperparameter searches: report a finished iteration.

    Prints the current value of the module-level counter ``index`` and then
    increments it. The training cells set ``index = 1`` before calling
    ``model.fit(..., callback=on_step)``. If the counter has not been
    initialised, counting starts at 1 instead of raising ``NameError``, which
    would abort a long-running search.

    Parameters
    ----------
    optim_result
        Object passed in by the optimiser after each step; not used here.

    Returns
    -------
    None
    """
    global index
    # Fall back to 1 when no training cell has initialised the counter yet.
    current_iteration = globals().get("index", 1)
    print(f"Iteration Completed: {current_iteration}")
    index = current_iteration + 1

## Dummy Classifier (DC)

In [7]:
# Baseline pipeline: standardise the features, then apply a DummyClassifier.
dummy_classifier = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DummyClassifier(random_state=42)),
])
# Display every parameter of the pipeline and its steps.
dummy_classifier.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DummyClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__random_state': 42,
 'model__strategy': 'prior'}

### Training

In [8]:
# dummy_classifier.fit(X_train, y_train)
#
# y_train_pred = dummy_classifier.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(dummy_classifier, 'Dataset_Files/Baseline_Models/Classification/dc.joblib')

In [9]:
# Load Model
# Replaces the freshly built pipeline with the fitted one that the commented-out
# training cell saved to this path.
# NOTE(review): `load` comes from the star import (presumably joblib.load, which
# unpickles) - only load files from a trusted source.
dummy_classifier = load('Dataset_Files/Baseline_Models/Classification/dc.joblib')

In [10]:
# Bootstrapped metrics (median and 95% confidence interval) of the dummy baseline on the training set.
# The recorded output reports samples "of size 1000", so the fourth argument
# appears to be the bootstrap sample size - TODO confirm in models_utils.
get_confidence_intervals(dummy_classifier, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 0.74 with a 95% confidence interval of [0.71,0.76]
Median F1: 0.85 with a 95% confidence interval of [0.83,0.87]
Median Accuracy: 0.74 with a 95% confidence interval of [0.71,0.76]
Median MCC: 0.00 with a 95% confidence interval of [0.00,0.00]


### Testing

In [11]:
# Bootstrapped metrics (median and 95% confidence interval) of the dummy baseline on the test set.
# The recorded output reports samples "of size 500", so the fourth argument
# appears to be the bootstrap sample size - TODO confirm in models_utils.
get_confidence_intervals(dummy_classifier, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 0.69 with a 95% confidence interval of [0.65,0.73]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.84]
Median Accuracy: 0.69 with a 95% confidence interval of [0.65,0.73]
Median MCC: 0.00 with a 95% confidence interval of [0.00,0.00]


## Logistic Regression (LR)

In [12]:
# Logistic-regression pipeline: standardise the features, then fit the model.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LogisticRegression(random_state=42)),
])
# Display the default parameters that the search below can override.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LogisticRegression(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 100,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [13]:
# Bayesian hyperparameter search over the logistic-regression pipeline, scored
# by F1 with 5-fold cross-validation. Three sub-spaces are used so that each
# solver group is only combined with the penalties listed for it.
# NOTE(review): the string 'none' penalty is deprecated in newer scikit-learn
# releases in favour of None - verify against the installed version before re-running.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=[
        # newton-cg / lbfgs / sag: no penalty or L2
        dict(
            model__C=Real(1e-6, 1e+2, prior='log-uniform'),
            model__solver=Categorical(['newton-cg', 'lbfgs', 'sag']),
            model__penalty=Categorical(['none', 'l2']),
            model__max_iter=Integer(50, 5000),
            model__class_weight=Categorical([None, "balanced"]),
        ),
        # liblinear: L2 or L1
        dict(
            model__C=Real(1e-6, 1e+2, prior='log-uniform'),
            model__solver=Categorical(['liblinear']),
            model__penalty=Categorical(['l2', 'l1']),
            model__max_iter=Integer(50, 5000),
            model__class_weight=Categorical([None, "balanced"]),
        ),
        # saga: every penalty, plus the elastic-net mixing ratio
        dict(
            model__C=Real(1e-6, 1e+2, prior='log-uniform'),
            model__l1_ratio=Real(0, 1),
            model__solver=Categorical(['saga']),
            model__penalty=Categorical(['none', 'l2', 'l1', 'elasticnet']),
            model__max_iter=Integer(50, 5000),
            model__class_weight=Categorical([None, "balanced"]),
        ),
    ],
    scoring='f1',
    cv=5,
    error_score=np.nan,  # a failing configuration scores NaN instead of raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [14]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lr = model.best_estimator_
#
# y_train_pred = optimised_lr.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lr, 'Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results", model.cv_results_)

In [15]:
# Load Model
# Restores the tuned pipeline that the commented-out training cell saved to this path.
# NOTE(review): `load` comes from the star import (presumably joblib.load, which
# unpickles) - only load files from a trusted source.
optimised_lr = load('Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')

In [16]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned logistic regression on the training set.
get_confidence_intervals(optimised_lr, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.91,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.86,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.79,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.42,0.55]


In [17]:
# Inspect the hyperparameters of the loaded, tuned logistic-regression pipeline.
optimised_lr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LogisticRegression(C=0.08691067180260512, max_iter=4439, random_state=42,
                      solver='liblinear'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(C=0.08691067180260512, max_iter=4439, random_state=42,
                    solver='liblinear'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.08691067180260512,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 4439,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'liblinear',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [18]:
# Rebuild the saved search results as a DataFrame, ordered from best (rank 1) to worst.
# allow_pickle is required because the .npy file stores the cv_results_ object
# written by the training cell; only load such files from a trusted source.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
logistic_regression_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__max_iter,param_model__penalty,param_model__solver,param_model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
66,55.793100,2.523845,0.132408,0.043119,0.086911,,4439,l2,liblinear,,"{'model__C': 0.08691067180260512, 'model__clas...",0.881027,0.879426,0.882736,0.881267,0.880858,0.881063,0.001055,1
26,2815.406966,2.884060,0.092576,0.014220,0.172047,,5000,l2,sag,,"{'model__C': 0.17204723798386953, 'model__clas...",0.881012,0.879418,0.882679,0.881265,0.880830,0.881041,0.001040,2
110,99.473659,44.479059,0.077112,0.012028,0.458579,,3710,l1,saga,0.709831,"{'model__C': 0.4585792720221234, 'model__class...",0.880919,0.879570,0.882601,0.881239,0.880801,0.881026,0.000970,3
70,249.368894,19.934678,0.112724,0.023588,0.276555,,50,l1,liblinear,,"{'model__C': 0.2765547591820871, 'model__class...",0.880999,0.879606,0.882495,0.881154,0.880868,0.881024,0.000918,4
148,27.944436,0.060402,0.090997,0.005039,2.752098,,50,l2,saga,0.0,"{'model__C': 2.7520978985242417, 'model__class...",0.880781,0.879539,0.882721,0.881100,0.880907,0.881010,0.001016,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,5.015475,0.039212,0.204767,0.003667,0.000002,,1273,l2,liblinear,,"{'model__C': 2.195551082864074e-06, 'model__cl...",0.738634,0.740038,0.748189,0.743261,0.738875,0.741799,0.003595,146
67,3.546709,0.147501,0.118602,0.020674,0.000001,balanced,1764,l2,liblinear,,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.693792,0.693219,0.703228,0.698660,0.692990,0.696378,0.004007,147
101,2.940660,0.183646,0.072081,0.006645,0.000016,balanced,4596,l1,saga,0.473675,"{'model__C': 1.6285217534593228e-05, 'model__c...",0.000000,0.848705,0.848705,0.848672,0.000000,0.509217,0.415774,148
86,2.425969,0.030663,0.182416,0.004853,0.000009,balanced,4473,l1,liblinear,,"{'model__C': 9.132486222789627e-06, 'model__cl...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,149


### Testing

In [19]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned logistic regression on the test set.
get_confidence_intervals(optimised_lr, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.86 with a 95% confidence interval of [0.83,0.90]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.82 with a 95% confidence interval of [0.78,0.85]
Median Accuracy: 0.73 with a 95% confidence interval of [0.69,0.77]
Median MCC: 0.32 with a 95% confidence interval of [0.23,0.42]


## Linear Support Vector Classification (LSVC)

In [20]:
# Linear SVC pipeline: standardise the features, then fit an L2-penalised linear SVM.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LinearSVC(random_state=42, penalty='l2')),
])
# Display the default parameters that the search below can override.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVC(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'squared_hinge',
 'model__max_iter': 1000,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [21]:
# Bayesian hyperparameter search over the linear-SVC pipeline, scored by F1
# with 5-fold cross-validation.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__loss=Categorical(['hinge', 'squared_hinge']),
        model__C=Real(1e-6, 1e+2, prior='log-uniform'),
        model__class_weight=Categorical([None, "balanced"]),
        model__max_iter=Integer(500, 5000),
    ),
    scoring='f1',
    cv=5,
    error_score=np.nan,  # a failing configuration scores NaN instead of raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [22]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lsvc = model.best_estimator_
#
# y_train_pred = optimised_lsvc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lsvc, 'Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy", model.cv_results_)

In [23]:
# Load Model
# Restores the tuned pipeline that the commented-out training cell saved to this path.
# NOTE(review): `load` comes from the star import (presumably joblib.load, which
# unpickles) - only load files from a trusted source.
optimised_lsvc = load('Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')

In [24]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned linear SVC on the training set.
get_confidence_intervals(optimised_lsvc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.94 with a 95% confidence interval of [0.92,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.87,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.79,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.43,0.55]


In [25]:
# Inspect the hyperparameters of the loaded, tuned linear-SVC pipeline.
optimised_lsvc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LinearSVC(C=0.1380447014995764, loss='hinge', max_iter=3709, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(C=0.1380447014995764, loss='hinge', max_iter=3709, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.1380447014995764,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'hinge',
 'model__max_iter': 3709,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [26]:
# Rebuild the saved search results as a DataFrame, ordered from best (rank 1) to worst.
# allow_pickle is required because the .npy file stores the cv_results_ object
# written by the training cell; only load such files from a trusted source.
lsvc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
lsvc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
36,118.052291,1.109156,0.267848,0.035558,0.138045,,hinge,3709,"{'model__C': 0.1380447014995764, 'model__class...",0.881801,0.882608,0.88287,0.882889,0.88374,0.882782,0.000621,1
27,68.849699,0.87966,0.278061,0.048183,0.07581,,hinge,2099,"{'model__C': 0.07580988177176091, 'model__clas...",0.882007,0.882444,0.882729,0.882889,0.883684,0.88275,0.000554,2
47,26.869208,0.263102,0.110933,0.018699,0.045751,,hinge,3711,"{'model__C': 0.045751052954376946, 'model__cla...",0.882002,0.882623,0.88273,0.88279,0.883463,0.882722,0.000465,3
35,141.892981,1.392536,0.277655,0.025748,0.172145,,hinge,4946,"{'model__C': 0.1721453720296977, 'model__class...",0.881786,0.882472,0.882821,0.882866,0.883576,0.882704,0.000583,4
37,42.51237,0.342727,0.353301,0.097358,0.052407,,hinge,500,"{'model__C': 0.052407004122209624, 'model__cla...",0.88179,0.882557,0.882779,0.882911,0.883441,0.882696,0.000538,5
44,169.83109,1.389224,0.118468,0.02009,0.538708,,hinge,5000,"{'model__C': 0.5387084648932836, 'model__class...",0.881716,0.882493,0.882793,0.882941,0.883478,0.882684,0.00058,6
11,296.117881,2.094355,0.175879,0.038099,0.743033,,hinge,4770,"{'model__C': 0.7430332959877312, 'model__class...",0.881729,0.882506,0.882689,0.882601,0.883685,0.882642,0.000623,7
14,28.360347,0.292943,0.18714,0.035945,0.032194,,hinge,5000,"{'model__C': 0.03219377091560028, 'model__clas...",0.881711,0.882211,0.882432,0.88282,0.883478,0.882531,0.000594,8
28,25.389982,0.158025,0.332762,0.015867,0.014692,,hinge,2867,"{'model__C': 0.01469227089101338, 'model__clas...",0.881939,0.882255,0.8821,0.882593,0.883252,0.882428,0.000465,9
15,13.52466,0.224726,0.160166,0.035083,0.013968,,hinge,513,"{'model__C': 0.013967868241974836, 'model__cla...",0.881718,0.882234,0.882013,0.88268,0.88321,0.882371,0.000524,10


### Testing

In [27]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned linear SVC on the test set.
get_confidence_intervals(optimised_lsvc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.87 with a 95% confidence interval of [0.84,0.91]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.85]
Median Accuracy: 0.73 with a 95% confidence interval of [0.70,0.77]
Median MCC: 0.33 with a 95% confidence interval of [0.24,0.42]


## K-Nearest Neighbors Classifier (KNNC)

In [28]:
# K-nearest-neighbours pipeline: standardise the features, then fit the classifier.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', KNeighborsClassifier()),
])
# Display the default parameters that the search below can override.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [29]:
# Bayesian hyperparameter search over the KNN pipeline, scored by F1 with
# 5-fold cross-validation.
# NOTE(review): the recorded CV results in this notebook show identical scores
# for 'auto' and 'kd_tree' at the same n_neighbors/weights but very different
# score times, so `algorithm` probably only affects runtime - consider fixing
# it instead of searching over it.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__n_neighbors=Integer(4, 20),
        model__weights=Categorical(['uniform', 'distance']),
        model__algorithm=Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
    ),
    scoring='f1',
    cv=5,
    error_score=np.nan,  # a failing configuration scores NaN instead of raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [30]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_knnc = model.best_estimator_
#
# y_train_pred = optimised_knnc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_knnc, 'Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy", model.cv_results_)

In [31]:
# Load Model
# Restores the tuned pipeline that the commented-out training cell saved to this path.
# NOTE(review): `load` comes from the star import (presumably joblib.load, which
# unpickles) - only load files from a trusted source.
optimised_knnc = load('Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')

In [32]:
# Training-set metrics for the tuned KNN: recompute them only when no cached
# report exists, otherwise print the cached text file.
# NOTE(review): this cell never writes the .txt file itself; presumably it is
# produced elsewhere - verify.
if not os.path.exists("Dataset_Files/Baseline_Models/Classification/optimised_knnc_train_metrics.txt"):
    get_confidence_intervals(optimised_knnc, X_train, y_train, 1000, "Classification", print_iterator=True)
else:
    with open("Dataset_Files/Baseline_Models/Classification/optimised_knnc_train_metrics.txt", "r") as file:
        print(file.read())

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [33]:
# Inspect the hyperparameters of the loaded, tuned KNN pipeline.
optimised_knnc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   KNeighborsClassifier(algorithm='kd_tree', n_neighbors=6, weights='distance'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(algorithm='kd_tree', n_neighbors=6, weights='distance'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'kd_tree',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 6,
 'model__p': 2,
 'model__weights': 'distance'}

In [34]:
# Rebuild the saved search results as a DataFrame, ordered from best (rank 1) to worst.
# allow_pickle is required because the .npy file stores the cv_results_ object
# written by the training cell; only load such files from a trusted source.
knnc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
knnc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__algorithm,param_model__n_neighbors,param_model__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,13.633698,0.141628,1182.34496,9.960524,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
15,12.282033,0.163549,1002.542356,9.76732,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
43,22.050243,0.19356,1119.140156,9.1183,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
45,14.069371,0.09546,1210.372202,13.322948,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
44,1.731142,0.010747,128.236136,2.612396,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
31,1.4512,0.009786,122.137484,1.565822,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
33,3.836397,0.079219,276.719662,3.997535,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
35,28.715279,0.242853,2172.784967,22.308,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
47,1.568193,0.007192,129.0019,1.864633,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
36,3.547066,0.060361,253.647092,3.554642,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1


### Testing

In [35]:
# Test-set metrics for the tuned KNN: recompute them only when no cached
# report exists, otherwise print the cached text file.
# NOTE(review): this cell never writes the .txt file itself; presumably it is
# produced elsewhere - verify.
if not os.path.exists("Dataset_Files/Baseline_Models/Classification/optimised_knnc_test_metrics.txt"):
    get_confidence_intervals(optimised_knnc, X_test, y_test, 500, "Classification", print_iterator=True)
else:
    with open("Dataset_Files/Baseline_Models/Classification/optimised_knnc_test_metrics.txt", "r") as file:
        print(file.read())

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.83 with a 95% confidence interval of [0.79,0.87]
Median Precision: 0.82 with a 95% confidence interval of [0.77,0.85]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.85]
Median Accuracy: 0.75 with a 95% confidence interval of [0.71,0.79]
Median MCC: 0.42 with a 95% confidence interval of [0.33,0.50]


## Decision Tree Classifier (DTC)

In [36]:
# Decision-tree pipeline: standardise the features, then fit the tree.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=42)),
])
# Display the default parameters that the search below can override.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'best'}

In [37]:
# Bayesian hyperparameter search over the decision-tree pipeline, scored by F1
# with 5-fold cross-validation.
# NOTE(review): all four dimensions are categorical (2 * 2 * 3 * 2 = 24 distinct
# combinations) and the recorded CV results repeat the same configuration many
# times - an exhaustive grid search may be cheaper; verify before re-running.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__criterion=Categorical(['gini', 'entropy']),
        model__splitter=Categorical(['best', 'random']),
        model__max_features=Categorical([None, 'sqrt', 'log2']),
        model__class_weight=Categorical([None, 'balanced']),
    ),
    scoring='f1',
    cv=5,
    error_score=np.nan,  # a failing configuration scores NaN instead of raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [38]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_dtc = model.best_estimator_
#
# y_train_pred = optimised_dtc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_dtc, 'Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy", model.cv_results_)

In [39]:
# Load Model
# Restores the tuned pipeline that the commented-out training cell saved to this path.
# NOTE(review): `load` comes from the star import (presumably joblib.load, which
# unpickles) - only load files from a trusted source.
optimised_dtc = load('Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')

In [40]:
# visualise_decision_tree(optimised_dtc['model'], feature_selection_columns, ["Inactive", "Active"],
#                         "Dataset_Files/Baseline_Models/Classification/optimised_dtc.dot")

In [41]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned decision tree on the training set.
get_confidence_intervals(optimised_dtc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [42]:
# Inspect the hyperparameters of the loaded, tuned decision-tree pipeline.
optimised_dtc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                          random_state=42, splitter='random'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                        random_state=42, splitter='random'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': 'balanced',
 'model__criterion': 'entropy',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'random'}

In [43]:
# Rebuild the saved search results as a DataFrame, ordered from best (rank 1) to worst.
# allow_pickle is required because the .npy file stores the cv_results_ object
# written by the training cell; only load such files from a trusted source.
dtc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
dtc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,13.174896,0.330564,0.083181,0.0133,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
23,13.090956,0.333569,0.078187,0.019857,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
48,13.090186,0.241431,0.079286,0.011067,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
11,13.126391,0.323209,0.074224,0.011057,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
27,13.251386,0.259169,0.072245,0.012244,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
31,13.038817,0.34591,0.078129,0.009886,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
19,13.154575,0.344138,0.082861,0.011696,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
33,13.309156,0.329549,0.084524,0.015882,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
36,13.511728,0.310206,0.079706,0.015926,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
37,12.940933,0.296501,0.077887,0.010588,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1


### Testing

In [44]:
# Bootstrapped metrics (median and 95% confidence interval) of the tuned decision tree on the test set.
get_confidence_intervals(optimised_dtc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.71 with a 95% confidence interval of [0.67,0.76]
Median Precision: 0.75 with a 95% confidence interval of [0.71,0.80]
Median F1: 0.73 with a 95% confidence interval of [0.69,0.77]
Median Accuracy: 0.64 with a 95% confidence interval of [0.60,0.68]
Median MCC: 0.18 with a 95% confidence interval of [0.09,0.27]


## Random Forest Classifier (RFC)

In [45]:
# Random-forest pipeline: standardise the features, then fit the forest.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])
# Display the default parameters that the search below can override.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [46]:
# Bayesian hyperparameter search over the random-forest pipeline, scored by F1
# with 5-fold cross-validation.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__n_estimators=Integer(100, 800),
        model__criterion=Categorical(['gini', 'entropy', 'log_loss']),
        model__max_features=Categorical([None, 'sqrt', 'log2']),
        model__class_weight=Categorical([None, 'balanced', 'balanced_subsample']),
    ),
    scoring='f1',
    cv=5,
    error_score=np.nan,  # a failing configuration scores NaN instead of raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [47]:
# Flip to True to re-run the Bayesian search and overwrite the saved artefacts.
# Disabled by default so the notebook reuses the model stored on disk: the recorded
# search results report mean fit times of well over 10,000 s per candidate.
RETRAIN_RFC = False

if RETRAIN_RFC:
    index = 1  # iteration counter printed and incremented by the on_step callback
    model.fit(X_train, y_train, callback=on_step)

    optimised_rfc = model.best_estimator_

    # Quick look at the fit on the training data itself.
    y_train_pred = optimised_rfc.predict(X_train)
    calculate_metrics_classification(y_train, y_train_pred)

    # Save Model
    dump(optimised_rfc, 'Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib')
    np.save("Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy", model.cv_results_)

In [48]:
# Load Model
# Reuse the tuned RFC pipeline saved by the (disabled) training cell instead of
# re-running the search.
# NOTE(review): `load` comes from the star import of models_utils — presumably
# joblib.load, which unpickles its input, so only load trusted files. Confirm.
optimised_rfc = load('Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib')

In [49]:
# Bootstrapped metrics for the tuned RFC on the training set.
# Judging by the recorded outputs, the fourth argument (1000 here) is the size of
# each bootstrap sample — verify against models_utils.
get_confidence_intervals(optimised_rfc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [50]:
# Show the hyper-parameters of the loaded RFC pipeline (the saved best estimator).
optimised_rfc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   RandomForestClassifier(criterion='entropy', max_features=None, n_estimators=799,
                          random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(criterion='entropy', max_features=None, n_estimators=799,
                        random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'entropy',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 799,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [51]:
# Rebuild the saved search results as a DataFrame, best-ranked candidates first.
# allow_pickle=True is needed because the file holds the pickled cv_results_ object
# rather than a plain numeric array; only load files you trust.
rfc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")
rfc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,16898.942848,81.928433,5.966139,0.794394,,entropy,,799,"{'model__class_weight': None, 'model__criterio...",0.949162,0.94631,0.948151,0.949253,0.947203,0.948016,0.001134,1
49,14031.461306,49.547133,5.234502,0.618534,,log_loss,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
19,15946.9724,127.733869,12.078268,2.661393,,log_loss,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
44,13844.117221,59.094871,5.841689,0.715712,,log_loss,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
23,20453.466873,62.563594,6.860888,1.153252,,log_loss,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
16,14270.508583,58.413429,5.24701,0.617883,,log_loss,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
48,18277.824007,73.331706,6.267198,1.122905,,entropy,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
39,13702.078298,73.717344,5.111344,0.622625,,entropy,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
45,15396.591052,58.853851,6.315138,1.347216,,entropy,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2
31,13564.366297,52.906671,5.246218,0.801017,,entropy,,800,"{'model__class_weight': None, 'model__criterio...",0.949127,0.94617,0.948214,0.94925,0.947126,0.947977,0.001182,2


### Testing

In [52]:
# Bootstrapped metrics for the tuned RFC on the held-out test set (sample size 500).
# NOTE(review): in the recorded run every training-set metric was 1.00 and the
# cross-validated F1 was about 0.948, while the test MCC is only 0.35 — the forest
# appears to overfit the training data; worth confirming before relying on it.
get_confidence_intervals(optimised_rfc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.90,0.96]
Median Precision: 0.76 with a 95% confidence interval of [0.72,0.80]
Median F1: 0.84 with a 95% confidence interval of [0.81,0.86]
Median Accuracy: 0.75 with a 95% confidence interval of [0.71,0.79]
Median MCC: 0.35 with a 95% confidence interval of [0.26,0.44]


## Stochastic Gradient Descent Classifier (SGDC)

In [53]:
# Two-stage pipeline: standardise the features, then fit a seeded SGD classifier.
# The step names ('scale', 'model') are the prefixes used for the search-space keys.
pipe = Pipeline(steps=[('scale', StandardScaler()),
                       ('model', SGDClassifier(random_state=42))])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'optimal',
 'model__loss': 'hinge',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__power_t': 0.5,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [54]:
# Bayesian hyper-parameter search over the SGDC pipeline, scored by F1 with 5-fold CV.
# The key order of the search space is kept as-is so the optimiser sees the same
# dimension order as before.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__loss=Categorical([
            'hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive',
        ]),
        model__penalty=Categorical(['l2', 'l1', 'elasticnet']),
        model__alpha=Real(1e-6, 1e-1, prior='log-uniform'),
        model__learning_rate=Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
        model__eta0=Real(1e-6, 1e-1, prior='log-uniform'),
        model__class_weight=Categorical([None, 'balanced']),
    ),
    scoring='f1',
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [55]:
# Flip to True to re-run the Bayesian search and overwrite the saved artefacts.
# Disabled by default so the notebook reuses the model stored on disk.
RETRAIN_SGDC = False

if RETRAIN_SGDC:
    index = 1  # iteration counter printed and incremented by the on_step callback
    model.fit(X_train, y_train, callback=on_step)

    optimised_sgdc = model.best_estimator_

    # Quick look at the fit on the training data itself.
    y_train_pred = optimised_sgdc.predict(X_train)
    calculate_metrics_classification(y_train, y_train_pred)

    # Save Model
    dump(optimised_sgdc, 'Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib')
    np.save("Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy", model.cv_results_)

In [56]:
# Load Model
# Reuse the tuned SGDC pipeline saved by the (disabled) training cell instead of
# re-running the search.
# NOTE(review): `load` comes from the star import of models_utils — presumably
# joblib.load, which unpickles its input, so only load trusted files. Confirm.
optimised_sgdc = load('Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib')

In [57]:
# Bootstrapped metrics for the tuned SGDC on the training set.
# Judging by the recorded outputs, the fourth argument (1000 here) is the size of
# each bootstrap sample — verify against models_utils.
get_confidence_intervals(optimised_sgdc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.91,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.87,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.79,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.43,0.55]


In [58]:
# Show the hyper-parameters of the loaded SGDC pipeline (the saved best estimator).
optimised_sgdc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
                 loss='log_loss', penalty='elasticnet', random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
               loss='log_loss', penalty='elasticnet', random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 1e-06,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0009866506104658564,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'adaptive',
 'model__loss': 'log_loss',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'elasticnet',
 'model__power_t': 0.5,
 'model__random_state': 42,
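 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}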

In [59]:
# Rebuild the saved search results as a DataFrame, best-ranked candidates first.
# allow_pickle=True is needed because the file holds the pickled cv_results_ object
# rather than a plain numeric array; only load files you trust.
sgdc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")
sgdc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__class_weight,param_model__eta0,param_model__learning_rate,param_model__loss,param_model__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
42,11.371536,0.18708,0.120313,0.014446,1e-06,,0.000987,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.881067,0.879696,0.882391,0.880865,0.880873,0.880978,0.000857,1
43,9.930557,0.116561,0.122369,0.01475,1e-06,,0.000494,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.881225,0.879568,0.882046,0.880833,0.880974,0.880929,0.0008,2
11,12.323203,0.777626,0.09029,0.014649,1e-06,,0.000727,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879445,0.8802,0.882,0.880759,0.881322,0.880745,0.000882,3
21,11.653906,0.736498,0.090107,0.008717,1e-06,,0.000657,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879345,0.880241,0.881935,0.880702,0.881422,0.880729,0.000904,4
14,21.720275,1.68385,0.086799,0.011324,1e-06,,0.034326,adaptive,modified_huber,elasticnet,"{'model__alpha': 1.208433613059891e-06, 'model...",0.879728,0.879959,0.882158,0.880658,0.880842,0.880669,0.000853,5
29,22.164678,1.160952,0.098143,0.01087,1e-06,,0.076401,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879402,0.880403,0.882088,0.880064,0.880907,0.880573,0.000902,6
22,9.552707,0.206343,0.110204,0.004911,1e-06,,0.000197,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.880972,0.879777,0.881438,0.880232,0.880146,0.880513,0.000603,7
35,7.110788,0.163579,0.105801,0.019669,0.000156,,1.9e-05,adaptive,modified_huber,elasticnet,"{'model__alpha': 0.0001558896341561995, 'model...",0.880213,0.879695,0.881146,0.880475,0.880904,0.880487,0.000512,8
47,17.977861,1.378186,0.097952,0.012883,0.000261,,0.001169,adaptive,squared_hinge,l1,"{'model__alpha': 0.00026050658160080433, 'mode...",0.879344,0.880223,0.881504,0.880303,0.880417,0.880358,0.000688,9
38,7.497112,0.159908,0.116463,0.017065,2.3e-05,,1.4e-05,adaptive,modified_huber,elasticnet,"{'model__alpha': 2.3185490575738482e-05, 'mode...",0.880143,0.879849,0.88083,0.879818,0.880419,0.880212,0.000379,10


### Testing

In [60]:
# Bootstrapped metrics for the tuned SGDC on the held-out test set (sample size 500).
get_confidence_intervals(optimised_sgdc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.86 with a 95% confidence interval of [0.83,0.90]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.81 with a 95% confidence interval of [0.78,0.84]
Median Accuracy: 0.73 with a 95% confidence interval of [0.69,0.77]
Median MCC: 0.33 with a 95% confidence interval of [0.24,0.41]
