# Imports

In [1]:
# General Imports
from models_utils import *

# Classification Models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Training & Test Sets

In [2]:
# Column labels of the feature block: every column from "MolecularWeight"
# through to the last column of the saved training frame.
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [3]:
# Training features: drop the Drug_CID and Protein_Accession columns, then
# convert what is left to a NumPy array.
X_train = (
    load_from_pickle("Training_Test_Sets/Classification/X_train_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Training labels as a NumPy array.
y_train = load_from_pickle("Training_Test_Sets/Classification/y_train").to_numpy()

In [4]:
# Test features: drop the Drug_CID and Protein_Accession columns, then
# convert what is left to a NumPy array.
X_test = (
    load_from_pickle("Training_Test_Sets/Classification/X_test_feature_selection")
    .drop(columns=["Drug_CID", "Protein_Accession"])
    .to_numpy()
)

# Test labels as a NumPy array.
y_test = load_from_pickle("Training_Test_Sets/Classification/y_test").to_numpy()

In [5]:
# Useful Information & Sanity Checks
# Fail fast if features and labels are misaligned, or if the train and test
# sets do not have the same number of feature columns; either problem would
# otherwise only surface later, when the saved models are scored.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train have different numbers of rows"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test have different numbers of rows"
assert X_train.shape[1] == X_test.shape[1], "X_train and X_test have different numbers of features"

# Shapes and class balance (label 1 = binding, label 0 = non-binding).
print(f"X_train shape: {X_train.shape}")
print(
    f"y_train shape: {y_train.shape[0]} "
    f"(Binding Count: {y_train[y_train == 1].shape[0]}, "
    f"Non-Binding Count: {y_train[y_train == 0].shape[0]})"
)

print(f"X_test shape: {X_test.shape}")
print(
    f"y_test shape: {y_test.shape[0]} "
    f"(Binding Count: {y_test[y_test == 1].shape[0]}, "
    f"Non-Binding Count: {y_test[y_test == 0].shape[0]})"
)

X_train shape: (99705, 388)
y_train shape: 99705 (Binding Count: 73498, Non-Binding Count: 26207)
X_test shape: (816, 388)
y_test shape: 816 (Binding Count: 563, Non-Binding Count: 253)


# Model Training & Testing

In [6]:
def on_step(optim_result):
    """Progress callback for the hyper-parameter searches: print and advance a counter.

    The counter is the module-level global ``index``. The training cells set
    ``index = 1`` before each search; if it has not been set, counting starts
    at 1 instead of raising ``NameError``.

    Parameters
    ----------
    optim_result
        Object handed in by the caller after each completed step; unused here.

    Returns
    -------
    None
        NOTE(review): scikit-optimize is understood to stop a search when a
        callback returns True, so returning None lets it run on - confirm.
    """
    global index
    try:
        current = index
    except NameError:
        # ``index`` was never initialised (e.g. the ``index = 1`` line was skipped).
        current = 1
    print(f"Iteration Completed: {current}")
    index = current + 1

## Dummy Classifier (DC)

In [7]:
# Baseline pipeline: standardise the features, then a DummyClassifier.
dummy_classifier = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", DummyClassifier(random_state=42)),
])
dummy_classifier.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DummyClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__random_state': 42,
 'model__strategy': 'prior'}

### Training

In [8]:
# dummy_classifier.fit(X_train, y_train)
#
# y_train_pred = dummy_classifier.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(dummy_classifier, 'Dataset_Files/Baseline_Models/Classification/dc.joblib')

In [9]:
# Load Model
# Replaces the unfitted pipeline built above with the saved one; the training
# cell that fits and dumps it to this same path is commented out.
# NOTE(review): `load` is presumably joblib.load (it arrives via the star
# import); that unpickles the file, so only load artefacts you trust.
dummy_classifier = load('Dataset_Files/Baseline_Models/Classification/dc.joblib')

In [10]:
# Bootstrapped metrics for the dummy baseline on the training set.
# Judging by the printed header, the fourth argument (1000) is the size of each
# bootstrap sample - TODO confirm against the helper's definition.
get_confidence_intervals(dummy_classifier, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 0.74 with a 95% confidence interval of [0.71,0.77]
Median F1: 0.85 with a 95% confidence interval of [0.83,0.87]
Median Accuracy: 0.74 with a 95% confidence interval of [0.71,0.77]
Median MCC: 0.00 with a 95% confidence interval of [0.00,0.00]


### Testing

In [11]:
# Bootstrapped metrics for the dummy baseline on the held-out test set.
# Judging by the printed header, the fourth argument (500) is the size of each
# bootstrap sample - TODO confirm against the helper's definition.
get_confidence_intervals(dummy_classifier, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 0.69 with a 95% confidence interval of [0.65,0.73]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.84]
Median Accuracy: 0.69 with a 95% confidence interval of [0.65,0.73]
Median MCC: 0.00 with a 95% confidence interval of [0.00,0.00]


## Logistic Regression (LR)

In [12]:
# Logistic-regression pipeline: standardise the features, then fit the model.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", LogisticRegression(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LogisticRegression(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 100,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [13]:
# Bayesian hyper-parameter search over the scaler + logistic-regression pipeline.
# The space is split into three sub-spaces, one per group of solvers, each with
# its own list of penalties.
# NOTE(review): n_iter is not passed, so the library default applies; the saved
# cv results run to index 149, which suggests 50 evaluations per sub-space.
# NOTE(review): C is sampled even when the penalty is 'none'; scikit-learn
# documents that C is ignored for an unpenalised model, so those draws only
# differ in their other parameters (the saved best model is such a case).
model = BayesSearchCV(estimator=pipe,
                      search_spaces=[
                          # Sub-space 1: newton-cg / lbfgs / sag with no penalty or l2.
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['newton-cg', 'lbfgs', 'sag']),
                           'model__penalty': Categorical(['none', 'l2']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          # Sub-space 2: liblinear with l2 or l1.
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__solver': Categorical(['liblinear']),
                           'model__penalty': Categorical(['l2', 'l1']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                          # Sub-space 3: saga with every penalty option.
                          # NOTE(review): l1_ratio is always sampled here, but per the
                          # scikit-learn docs it is only used with 'elasticnet' - confirm.
                          {'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                           'model__l1_ratio': Real(0, 1),
                           'model__solver': Categorical(['saga']),
                           'model__penalty': Categorical(['none', 'l2', 'l1', 'elasticnet']),
                           'model__max_iter': Integer(50, 5000),
                           'model__class_weight': Categorical([None, "balanced"])},
                      ],
                      # Candidates are ranked by F1 over 5 cross-validation folds.
                      scoring='f1',
                      cv=5,
                      # A candidate whose fit fails is scored NaN rather than aborting the search.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [14]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lr = model.best_estimator_
#
# y_train_pred = optimised_lr.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lr, 'Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results", model.cv_results_)

In [15]:
# Load Model
# Reuses the saved best pipeline instead of re-running the search; the training
# cell that dumps it to this same path is commented out.
# NOTE(review): `load` is presumably joblib.load, which unpickles the file -
# only load artefacts you trust.
optimised_lr = load('Dataset_Files/Baseline_Models/Classification/optimised_lr.joblib')

In [16]:
# Bootstrapped metrics for the tuned logistic regression on the training set.
# The fourth argument (1000) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_lr, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.91,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.87,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.79,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.43,0.55]


In [17]:
# Display the hyper-parameters of the loaded logistic-regression pipeline.
optimised_lr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LogisticRegression(C=0.0005922659193283465, max_iter=86, penalty='none',
                      random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(C=0.0005922659193283465, max_iter=86, penalty='none',
                    random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.0005922659193283465,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 86,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'none',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [18]:
# Cross-validation results saved by the logistic-regression search, ordered
# best-first (rank 1 at the top).
# NOTE(review): allow_pickle=True unpickles the file contents - only load
# result files you trust.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
logistic_regression_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__max_iter,param_model__penalty,param_model__solver,param_model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
18,9.967149,0.222064,0.176222,0.045951,0.000592,,86,none,lbfgs,,"{'model__C': 0.0005922659193283465, 'model__cl...",0.880807,0.879686,0.882692,0.881077,0.881144,0.881081,0.000962,1
66,64.880265,2.561403,0.351809,0.056933,0.086874,,4439,l2,liblinear,,"{'model__C': 0.08687434218275818, 'model__clas...",0.881027,0.879426,0.882736,0.881303,0.880858,0.881070,0.001056,2
92,288.682903,23.192366,0.149290,0.068380,0.27837,,50,l1,liblinear,,"{'model__C': 0.27837010361065284, 'model__clas...",0.880999,0.879634,0.882495,0.881154,0.880868,0.881030,0.000910,3
110,96.145776,43.485431,0.154168,0.101802,0.458579,,3710,l1,saga,0.709831,"{'model__C': 0.4585792720221234, 'model__class...",0.880919,0.879570,0.882601,0.881239,0.880801,0.881026,0.000970,4
90,186.786358,38.790002,0.095461,0.023325,0.171825,,50,l1,liblinear,,"{'model__C': 0.17182546055480577, 'model__clas...",0.880960,0.879882,0.882162,0.881228,0.880896,0.881025,0.000730,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,8.026151,1.485571,0.094766,0.025165,0.000001,balanced,3719,l2,newton-cg,,"{'model__C': 1.0691593986059968e-06, 'model__c...",0.743515,0.744951,0.754247,0.746565,0.743345,0.746525,0.004032,146
149,8.736954,0.187918,0.068754,0.015932,0.000001,balanced,3568,l2,saga,0.141119,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.743009,0.744762,0.754049,0.746680,0.742964,0.746293,0.004111,147
50,4.333323,0.046542,0.079652,0.010487,0.000002,,1273,l2,liblinear,,"{'model__C': 2.195551082864074e-06, 'model__cl...",0.738634,0.740038,0.748189,0.743261,0.738875,0.741799,0.003595,148
67,2.993152,0.281752,0.170432,0.094420,0.000001,balanced,1764,l2,liblinear,,"{'model__C': 1e-06, 'model__class_weight': 'ba...",0.693792,0.693219,0.703228,0.698660,0.692990,0.696378,0.004007,149


### Testing

In [19]:
# Bootstrapped metrics for the tuned logistic regression on the held-out test set.
# The fourth argument (500) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_lr, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.86 with a 95% confidence interval of [0.82,0.90]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.81 with a 95% confidence interval of [0.78,0.85]
Median Accuracy: 0.73 with a 95% confidence interval of [0.69,0.77]
Median MCC: 0.33 with a 95% confidence interval of [0.24,0.42]


## Linear Support Vector Classification (LSVC)

In [20]:
# Linear SVC pipeline: standardise the features, then fit an l2-penalised LinearSVC.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", LinearSVC(random_state=42, penalty="l2")),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVC(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'squared_hinge',
 'model__max_iter': 1000,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [21]:
# Bayesian hyper-parameter search over the scaler + LinearSVC pipeline.
# Tunes the loss function, the regularisation strength C (log-uniform over
# 1e-6 .. 1e+2), the class weighting and the iteration cap (500 .. 5000).
# NOTE(review): n_iter is not passed, so the library default applies.
model = BayesSearchCV(estimator=pipe,
                      search_spaces={'model__loss': Categorical(['hinge', 'squared_hinge']),
                                     'model__C': Real(1e-6, 1e+2, prior='log-uniform'),
                                     'model__class_weight': Categorical([None, "balanced"]),
                                     'model__max_iter': Integer(500, 5000)},
                      # Candidates are ranked by F1 over 5 cross-validation folds.
                      scoring='f1',
                      cv=5,
                      # A candidate whose fit fails is scored NaN rather than aborting the search.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [22]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lsvc = model.best_estimator_
#
# y_train_pred = optimised_lsvc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lsvc, 'Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy", model.cv_results_)

In [23]:
# Load Model
# Reuses the saved best pipeline instead of re-running the search; the training
# cell that dumps it to this same path is commented out.
# NOTE(review): `load` is presumably joblib.load, which unpickles the file -
# only load artefacts you trust.
optimised_lsvc = load('Dataset_Files/Baseline_Models/Classification/optimised_lsvc.joblib')

In [24]:
# Bootstrapped metrics for the tuned LinearSVC on the training set.
# The fourth argument (1000) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_lsvc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.94 with a 95% confidence interval of [0.92,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.87,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.80,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.43,0.55]


In [25]:
# Display the hyper-parameters of the loaded LinearSVC pipeline.
optimised_lsvc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LinearSVC(C=0.1382147549119633, loss='hinge', max_iter=3708, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVC(C=0.1382147549119633, loss='hinge', max_iter=3708, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.1382147549119633,
 'model__class_weight': None,
 'model__dual': True,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__loss': 'hinge',
 'model__max_iter': 3708,
 'model__multi_class': 'ovr',
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [26]:
# Cross-validation results saved by the LinearSVC search, ordered best-first
# (rank 1 at the top).
# NOTE(review): allow_pickle=True unpickles the file contents - only load
# result files you trust.
lsvc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_lsvc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
lsvc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
36,56.176328,0.590413,0.09011,0.012284,0.138215,,hinge,3708,"{'model__C': 0.1382147549119633, 'model__class...",0.88183,0.8826,0.882878,0.882838,0.883748,0.882779,0.000614,1
27,33.806663,0.289807,0.105785,0.015044,0.07562,,hinge,2098,"{'model__C': 0.0756195398834118, 'model__class...",0.881937,0.882564,0.882772,0.882911,0.883591,0.882755,0.000535,2
49,51.403866,0.477602,0.092906,0.014288,0.118062,,hinge,5000,"{'model__C': 0.11806153951092785, 'model__clas...",0.882079,0.882544,0.882685,0.88286,0.883555,0.882745,0.000481,3
45,25.310576,0.19831,0.096962,0.011774,0.046281,,hinge,3624,"{'model__C': 0.0462806124078546, 'model__class...",0.881909,0.882594,0.882723,0.882769,0.883563,0.882712,0.000527,4
35,67.97047,0.654737,0.100172,0.007908,0.172145,,hinge,4946,"{'model__C': 0.1721453720296977, 'model__class...",0.881786,0.882472,0.882821,0.882866,0.883576,0.882704,0.000583,5
47,139.493681,1.239127,0.093835,0.017263,0.435116,,hinge,5000,"{'model__C': 0.43511575726811647, 'model__clas...",0.88158,0.882829,0.882827,0.882883,0.883382,0.8827,0.000598,6
37,19.985416,0.191292,0.106314,0.030365,0.052798,,hinge,500,"{'model__C': 0.052797956980008054, 'model__cla...",0.881863,0.882568,0.882898,0.882849,0.883263,0.882688,0.000468,7
11,203.923365,1.299732,0.087499,0.012495,0.743033,,hinge,4770,"{'model__C': 0.7430332957496445, 'model__class...",0.881729,0.882506,0.882689,0.882601,0.883685,0.882642,0.000623,8
48,16.388446,0.204221,0.110348,0.02217,0.029429,,hinge,1428,"{'model__C': 0.02942902337296776, 'model__clas...",0.881718,0.882376,0.882496,0.882735,0.883521,0.882569,0.000583,9
14,21.256865,0.154275,0.139289,0.033784,0.032193,,hinge,5000,"{'model__C': 0.03219332465407559, 'model__clas...",0.881767,0.88224,0.882496,0.882813,0.883506,0.882565,0.000582,10


### Testing

In [27]:
# Bootstrapped metrics for the tuned LinearSVC on the held-out test set.
# The fourth argument (500) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_lsvc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.88 with a 95% confidence interval of [0.84,0.91]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.85]
Median Accuracy: 0.74 with a 95% confidence interval of [0.69,0.77]
Median MCC: 0.33 with a 95% confidence interval of [0.23,0.42]


## K-Nearest Neighbors Classifier (KNNC)

In [28]:
# K-nearest-neighbours pipeline: standardise the features, then fit the classifier.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", KNeighborsClassifier()),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [29]:
# Bayesian hyper-parameter search over the scaler + k-nearest-neighbours pipeline.
# Tunes the number of neighbours (4 .. 20), the vote weighting and the
# neighbour-search algorithm.
# NOTE(review): the saved cv results show identical scores for 'auto' and
# 'kd_tree' at the same n_neighbors/weights, with far longer score times for
# 'kd_tree'; 'model__algorithm' looks like it affects runtime only - consider
# dropping it from the space.
# NOTE(review): n_iter is not passed, so the library default applies.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__n_neighbors': Integer(4, 20),
                       'model__weights': Categorical(['uniform', 'distance']),
                       'model__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
                       },
                      # Candidates are ranked by F1 over 5 cross-validation folds.
                      scoring='f1',
                      cv=5,
                      # A candidate whose fit fails is scored NaN rather than aborting the search.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [30]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_knnc = model.best_estimator_
#
# y_train_pred = optimised_knnc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_knnc, 'Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy", model.cv_results_)

In [31]:
# Load Model
# Reuses the saved best pipeline instead of re-running the search; the training
# cell that dumps it to this same path is commented out.
# NOTE(review): `load` is presumably joblib.load, which unpickles the file -
# only load artefacts you trust.
optimised_knnc = load('Dataset_Files/Baseline_Models/Classification/optimised_knnc.joblib')

In [32]:
# Training-set metrics for the tuned KNN pipeline: print the saved text report
# when one exists, otherwise compute the bootstrapped metrics now.
# NOTE(review): nothing in this cell writes the report file; whether the helper
# does so is not visible here.
knnc_train_metrics_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc_train_metrics.txt"
if not os.path.exists(knnc_train_metrics_path):
    get_confidence_intervals(optimised_knnc, X_train, y_train, 1000, "Classification", print_iterator=True)
else:
    with open(knnc_train_metrics_path, "r") as metrics_file:
        print(metrics_file.read())

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [33]:
# Display the hyper-parameters of the loaded k-nearest-neighbours pipeline.
optimised_knnc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   KNeighborsClassifier(algorithm='kd_tree', n_neighbors=6, weights='distance'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(algorithm='kd_tree', n_neighbors=6, weights='distance'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'kd_tree',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 6,
 'model__p': 2,
 'model__weights': 'distance'}

In [34]:
# Cross-validation results saved by the KNN search, ordered best-first
# (rank 1 at the top).
# NOTE(review): allow_pickle=True unpickles the file contents - only load
# result files you trust.
knnc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_knnc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
knnc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__algorithm,param_model__n_neighbors,param_model__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,3.604291,0.097547,757.73786,25.242931,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
15,3.629958,0.080214,709.126471,12.281687,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
33,0.802446,0.118402,9.522272,0.100709,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
35,3.610951,0.094378,750.344408,41.719547,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
36,0.837429,0.040939,9.372987,0.132019,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
31,0.800072,0.107493,9.454483,0.084098,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
43,3.604221,0.117799,759.601214,13.977584,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
44,0.834071,0.053298,9.468205,0.066967,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
45,3.643211,0.050724,708.767467,15.428243,kd_tree,6,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1
47,0.786727,0.089076,9.471032,0.045987,auto,6,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.93927,0.935738,0.938946,0.939656,0.938927,0.938507,0.00141,1


### Testing

In [35]:
# Test-set metrics for the tuned KNN pipeline: print the saved text report
# when one exists, otherwise compute the bootstrapped metrics now.
# NOTE(review): nothing in this cell writes the report file; whether the helper
# does so is not visible here.
knnc_test_metrics_path = "Dataset_Files/Baseline_Models/Classification/optimised_knnc_test_metrics.txt"
if not os.path.exists(knnc_test_metrics_path):
    get_confidence_intervals(optimised_knnc, X_test, y_test, 500, "Classification", print_iterator=True)
else:
    with open(knnc_test_metrics_path, "r") as metrics_file:
        print(metrics_file.read())

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.83 with a 95% confidence interval of [0.79,0.87]
Median Precision: 0.82 with a 95% confidence interval of [0.78,0.86]
Median F1: 0.82 with a 95% confidence interval of [0.79,0.85]
Median Accuracy: 0.76 with a 95% confidence interval of [0.72,0.79]
Median MCC: 0.42 with a 95% confidence interval of [0.33,0.50]


## Decision Tree Classifier (DTC)

In [36]:
# Decision-tree pipeline: standardise the features, then fit the tree.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", DecisionTreeClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'best'}

In [37]:
# Bayesian hyper-parameter search over the scaler + decision-tree pipeline.
# Every dimension is categorical: split criterion, splitter, max_features and
# class weighting.
# NOTE(review): the space holds only 2 * 2 * 3 * 2 = 24 distinct combinations,
# yet the saved cv results list the same parameter set many times (row indices
# run to 49). With n_iter left at its default the search appears to re-evaluate
# duplicates - consider an exhaustive grid or a smaller n_iter.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__criterion': Categorical(['gini', 'entropy']),
                       'model__splitter': Categorical(['best', 'random']),
                       'model__max_features': Categorical([None, 'sqrt', 'log2']),
                       'model__class_weight': Categorical([None, 'balanced'])
                       },
                      # Candidates are ranked by F1 over 5 cross-validation folds.
                      scoring='f1',
                      cv=5,
                      # A candidate whose fit fails is scored NaN rather than aborting the search.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [38]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_dtc = model.best_estimator_
#
# y_train_pred = optimised_dtc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_dtc, 'Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy", model.cv_results_)

In [39]:
# Load Model
# Reuses the saved best pipeline instead of re-running the search; the training
# cell that dumps it to this same path is commented out.
# NOTE(review): `load` is presumably joblib.load, which unpickles the file -
# only load artefacts you trust.
optimised_dtc = load('Dataset_Files/Baseline_Models/Classification/optimised_dtc.joblib')

In [40]:
# visualise_decision_tree(optimised_dtc['model'], feature_selection_columns, ["Inactive", "Active"],
#                         "Dataset_Files/Baseline_Models/Classification/optimised_dtc.dot")

In [41]:
# Bootstrapped metrics for the tuned decision tree on the training set.
# The fourth argument (1000) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_dtc, X_train, y_train, 1000, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [42]:
# Display the hyper-parameters of the loaded decision-tree pipeline.
optimised_dtc.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                          random_state=42, splitter='random'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                        random_state=42, splitter='random'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': 'balanced',
 'model__criterion': 'entropy',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 42,
 'model__splitter': 'random'}

In [43]:
# Cross-validation results saved by the decision-tree search, ordered
# best-first (rank 1 at the top).
# NOTE(review): allow_pickle=True unpickles the file contents - only load
# result files you trust.
dtc_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Classification/optimised_dtc_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by=["rank_test_score"])
dtc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,14.322085,0.273997,0.069076,0.006317,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
23,16.159878,0.382061,0.081241,0.015318,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
48,20.176426,0.48379,0.10682,0.040917,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
11,14.677525,0.285513,0.084819,0.011418,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
27,13.145007,0.232202,0.065607,0.006246,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
31,13.524995,0.243572,0.06564,0.011702,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
19,13.936948,0.34142,0.079766,0.007252,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
33,12.989052,0.375393,0.078204,0.017049,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
36,13.206737,0.285128,0.071863,0.018745,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1
37,13.683733,0.31193,0.067947,0.008435,balanced,entropy,,random,"{'model__class_weight': 'balanced', 'model__cr...",0.92164,0.920335,0.923301,0.921825,0.922775,0.921975,0.001022,1


### Testing

In [44]:
# Bootstrapped metrics for the tuned decision tree on the held-out test set.
# The fourth argument (500) appears to be the bootstrap sample size, going by
# the printed header - TODO confirm.
get_confidence_intervals(optimised_dtc, X_test, y_test, 500, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.72 with a 95% confidence interval of [0.67,0.76]
Median Precision: 0.75 with a 95% confidence interval of [0.71,0.80]
Median F1: 0.73 with a 95% confidence interval of [0.70,0.77]
Median Accuracy: 0.64 with a 95% confidence interval of [0.60,0.68]
Median MCC: 0.18 with a 95% confidence interval of [0.09,0.27]


## Random Forest Classifier (RFC)

In [45]:
# Random-forest pipeline: standardise the features, then fit the forest.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__maxBins': 256,
 'model__max_depth': None,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__minBinSize': 1,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [46]:
# Bayesian hyper-parameter search over the scaler + random-forest pipeline.
# Tunes the number of trees (100 .. 1000), the split criterion, max_features
# and the class weighting.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# split criterion, so two of the three criterion choices may describe the same
# model - confirm for the installed version and consider dropping one.
# NOTE(review): n_iter is not passed, so the library default applies.
model = BayesSearchCV(estimator=pipe,
                      search_spaces=
                      {'model__n_estimators': Integer(100, 1000),
                       'model__criterion': Categorical(['gini', 'entropy', 'log_loss']),
                       'model__max_features': Categorical([None, 'sqrt', 'log2']),
                       'model__class_weight': Categorical([None, 'balanced', 'balanced_subsample'])
                       },
                      # Candidates are ranked by F1 over 5 cross-validation folds.
                      scoring='f1',
                      cv=5,
                      # A candidate whose fit fails is scored NaN rather than aborting the search.
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [47]:
# Hyper-parameter search, training-set evaluation and persistence for the RFC pipeline.
# The whole cell is kept disabled: the fitted pipeline and the search's cv_results_ are
# read back from the files written at the bottom of this cell.
# NOTE(review): the stored CV results report a mean_fit_time of roughly 7,300-9,300 s
# per candidate, which is presumably why the search is not re-run — confirm.
# NOTE(review): `index` is presumably the iteration counter used by the on_step
# callback — confirm against its definition.
#
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_rfc = model.best_estimator_
#
# y_train_pred = optimised_rfc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(optimised_rfc, 'Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy", model.cv_results_)

In [48]:
# Load Model
# Restores the tuned RFC pipeline from the artefact written by the disabled training cell.
# NOTE(review): `load` is presumably joblib's loader (pickle-based) — only point it at
# files from a trusted source.
rfc_model_path = 'Dataset_Files/Baseline_Models/Classification/optimised_rfc.joblib'
optimised_rfc = load(rfc_model_path)

In [49]:
# Bootstrapped metrics of the tuned RFC on the training data.
# The fourth argument is the size of each bootstrap sample (see the printed header).
rfc_train_sample_size = 1000
get_confidence_intervals(optimised_rfc, X_train, y_train, rfc_train_sample_size, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Precision: 1.00 with a 95% confidence interval of [1.00,1.00]
Median F1: 1.00 with a 95% confidence interval of [1.00,1.00]
Median Accuracy: 1.00 with a 95% confidence interval of [1.00,1.00]
Median MCC: 1.00 with a 95% confidence interval of [1.00,1.00]


In [50]:
# Inspect the hyper-parameters carried by the loaded RFC pipeline.
rfc_selected_params = optimised_rfc.get_params()
rfc_selected_params

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   RandomForestClassifier(criterion='log_loss', max_features=None,
                          n_estimators=1000, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(criterion='log_loss', max_features=None,
                        n_estimators=1000, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'log_loss',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 1000,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [51]:
# Read back the persisted search results for the RFC. The file holds a pickled Python
# object (hence allow_pickle=True); .tolist() unwraps it before building the frame.
rfc_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Classification/optimised_rfc_cv_results.npy",
    allow_pickle=True,
).tolist()

# Best-ranked candidates first.
rfc_grid_search_dataframe = pd.DataFrame(rfc_cv_results).sort_values(by=["rank_test_score"])
rfc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__class_weight,param_model__criterion,param_model__max_features,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,7956.526073,62.811718,4.496517,0.206688,,entropy,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
48,8644.882016,60.02643,4.492718,0.051159,,entropy,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
44,8033.716068,62.966427,6.483778,0.105352,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
16,7329.961614,60.161373,4.258674,0.169524,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
45,9345.696466,32.081029,6.404726,0.066233,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
27,8113.213372,50.627227,4.478413,0.085655,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
31,7787.961137,34.722299,4.840943,0.366981,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
47,9225.737369,65.444442,6.458594,0.120409,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
23,8650.65494,87.274675,5.811025,0.17288,,entropy,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1
36,7692.423509,73.687519,4.647925,0.283529,,log_loss,,1000,"{'model__class_weight': None, 'model__criterio...",0.94906,0.946107,0.948579,0.949584,0.947003,0.948067,0.001306,1


### Testing

In [52]:
# Bootstrapped metrics of the tuned RFC on the held-out test data.
# The fourth argument is the size of each bootstrap sample (see the printed header).
rfc_test_sample_size = 500
get_confidence_intervals(optimised_rfc, X_test, y_test, rfc_test_sample_size, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.90,0.96]
Median Precision: 0.76 with a 95% confidence interval of [0.72,0.80]
Median F1: 0.84 with a 95% confidence interval of [0.81,0.86]
Median Accuracy: 0.75 with a 95% confidence interval of [0.72,0.79]
Median MCC: 0.36 with a 95% confidence interval of [0.26,0.44]


## Stochastic Gradient Descent Classifier (SGDC)

In [53]:
# Two-stage pipeline: standardise the features, then fit a seeded SGD classifier.
# The step names ('scale', 'model') are the prefixes used by the search space keys.
sgdc_steps = [
    ('scale', StandardScaler()),
    ('model', SGDClassifier(random_state=42)),
]
pipe = Pipeline(sgdc_steps)

# Display every tunable parameter exposed by the pipeline.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'optimal',
 'model__loss': 'hinge',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__power_t': 0.5,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [54]:
# Candidate loss functions for the SGD classifier step.
sgdc_losses = [
    'hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error',
    'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive',
]

# Hyper-parameter space explored for the SGD step of `pipe`; alpha and eta0 are sampled
# on a log scale between 1e-6 and 1e-1.
# NOTE(review): per the scikit-learn docs eta0 only applies to the 'constant',
# 'invscaling' and 'adaptive' schedules, so it is presumably inert when 'optimal' is drawn.
sgdc_search_space = {
    'model__loss': Categorical(sgdc_losses),
    'model__penalty': Categorical(['l2', 'l1', 'elasticnet']),
    'model__alpha': Real(1e-6, 1e-1, prior='log-uniform'),
    'model__learning_rate': Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
    'model__eta0': Real(1e-6, 1e-1, prior='log-uniform'),
    'model__class_weight': Categorical([None, 'balanced']),
}

# Bayesian optimisation over the space above, scored on F1 with 5-fold CV.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=sgdc_search_space,
    scoring='f1',
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [55]:
# Hyper-parameter search, training-set evaluation and persistence for the SGDC pipeline.
# The whole cell is kept disabled: the fitted pipeline and the search's cv_results_ are
# read back from the files written at the bottom of this cell.
# NOTE(review): `index` is presumably the iteration counter used by the on_step
# callback — confirm against its definition.
#
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_sgdc = model.best_estimator_
#
# y_train_pred = optimised_sgdc.predict(X_train)
# calculate_metrics_classification(y_train, y_train_pred)
#
# # Save Model
# dump(optimised_sgdc, 'Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib')
# np.save("Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy", model.cv_results_)

In [56]:
# Load Model
# Restores the tuned SGDC pipeline from the artefact written by the disabled training cell.
# NOTE(review): `load` is presumably joblib's loader (pickle-based) — only point it at
# files from a trusted source.
sgdc_model_path = 'Dataset_Files/Baseline_Models/Classification/optimised_sgdc.joblib'
optimised_sgdc = load(sgdc_model_path)

In [57]:
# Bootstrapped metrics of the tuned SGDC on the training data.
# The fourth argument is the size of each bootstrap sample (see the printed header).
sgdc_train_sample_size = 1000
get_confidence_intervals(optimised_sgdc, X_train, y_train, sgdc_train_sample_size, "Classification")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Recall: 0.93 with a 95% confidence interval of [0.91,0.95]
Median Precision: 0.84 with a 95% confidence interval of [0.81,0.86]
Median F1: 0.88 with a 95% confidence interval of [0.87,0.90]
Median Accuracy: 0.82 with a 95% confidence interval of [0.79,0.84]
Median MCC: 0.49 with a 95% confidence interval of [0.42,0.55]


In [58]:
# Inspect the hyper-parameters carried by the loaded SGDC pipeline.
sgdc_selected_params = optimised_sgdc.get_params()
sgdc_selected_params

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
                 loss='log_loss', penalty='elasticnet', random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDClassifier(alpha=1e-06, eta0=0.0009866506104658564, learning_rate='adaptive',
               loss='log_loss', penalty='elasticnet', random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 1e-06,
 'model__average': False,
 'model__class_weight': None,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.0009866506104658564,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'adaptive',
 'model__loss': 'log_loss',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__n_jobs': None,
 'model__penalty': 'elasticnet',
 'model__power_t': 0.5,
 'model__random_state': 42,
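 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}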

In [59]:
# Read back the persisted search results for the SGDC. The file holds a pickled Python
# object (hence allow_pickle=True); .tolist() unwraps it before building the frame.
sgdc_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Classification/optimised_sgdc_cv_results.npy",
    allow_pickle=True,
).tolist()

# Best-ranked candidates first.
sgdc_grid_search_dataframe = pd.DataFrame(sgdc_cv_results).sort_values(by=["rank_test_score"])
sgdc_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__class_weight,param_model__eta0,param_model__learning_rate,param_model__loss,param_model__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
42,11.540089,0.135104,0.101924,0.012556,1e-06,,0.000987,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.881067,0.879696,0.882391,0.880865,0.880873,0.880978,0.000857,1
43,10.143243,0.087633,0.11511,0.020024,1e-06,,0.000494,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.881225,0.879568,0.882046,0.880833,0.880974,0.880929,0.0008,2
11,13.461571,0.712058,0.095239,0.012657,1e-06,,0.000727,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879445,0.8802,0.882,0.880759,0.881322,0.880745,0.000882,3
21,12.815429,0.754772,0.094556,0.020193,1e-06,,0.000657,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879345,0.880241,0.881935,0.880702,0.881422,0.880729,0.000904,4
14,22.981915,1.521373,0.084933,0.014329,1e-06,,0.034326,adaptive,modified_huber,elasticnet,"{'model__alpha': 1.208433613059891e-06, 'model...",0.879728,0.879959,0.882158,0.880658,0.880842,0.880669,0.000853,5
29,22.162114,1.094737,0.090007,0.016814,1e-06,,0.076401,adaptive,modified_huber,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.879402,0.880403,0.882088,0.880064,0.880907,0.880573,0.000902,6
22,10.612993,0.127675,0.118659,0.011845,1e-06,,0.000197,adaptive,log_loss,elasticnet,"{'model__alpha': 1e-06, 'model__class_weight':...",0.880972,0.879777,0.881438,0.880232,0.880146,0.880513,0.000603,7
35,7.129987,0.202482,0.106182,0.010396,0.000156,,1.9e-05,adaptive,modified_huber,elasticnet,"{'model__alpha': 0.0001558896341561995, 'model...",0.880213,0.879695,0.881146,0.880475,0.880904,0.880487,0.000512,8
47,16.977139,1.366065,0.087503,0.012953,0.000261,,0.001169,adaptive,squared_hinge,l1,"{'model__alpha': 0.00026050658160080433, 'mode...",0.879344,0.880223,0.881504,0.880303,0.880417,0.880358,0.000688,9
38,7.647229,0.10121,0.107968,0.01321,2.3e-05,,1.4e-05,adaptive,modified_huber,elasticnet,"{'model__alpha': 2.3185490575738482e-05, 'mode...",0.880143,0.879849,0.88083,0.879818,0.880419,0.880212,0.000379,10


### Testing

In [60]:
# Bootstrapped metrics of the tuned SGDC on the held-out test data.
# The fourth argument is the size of each bootstrap sample (see the printed header).
sgdc_test_sample_size = 500
get_confidence_intervals(optimised_sgdc, X_test, y_test, sgdc_test_sample_size, "Classification")

Metrics after 1000 bootstrapped samples of size 500
--------------------------------------------------------
Median Recall: 0.86 with a 95% confidence interval of [0.83,0.90]
Median Precision: 0.77 with a 95% confidence interval of [0.73,0.81]
Median F1: 0.81 with a 95% confidence interval of [0.78,0.84]
Median Accuracy: 0.73 with a 95% confidence interval of [0.69,0.77]
Median MCC: 0.32 with a 95% confidence interval of [0.23,0.42]
