# Imports

In [1]:
# General Imports
from models_utils import *

# Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Training & Test Sets


In [2]:
# Column labels of the feature-selected training frame, from "MolecularWeight"
# (inclusive, label-based slice) through the last column.
# Assumes load_from_pickle returns a pandas DataFrame - TODO confirm.
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [3]:
# Training features: drop the three identifier/metadata columns and keep the
# remaining columns as a plain NumPy matrix.
X_train = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Training targets: keep the binary label as its own Series (used for the class
# counts printed later), then flatten what is left into the 1-D regression target.
y_train = load_from_pickle("Training_Test_Sets/Regression/y_train")
y_train_binary = y_train.loc[:, "Activity_Binary"]
y_train = y_train.drop(columns=["Activity_Binary"]).to_numpy().flatten()

In [4]:
# Test features: drop the three identifier/metadata columns and keep the
# remaining columns as a plain NumPy matrix.
X_test = (
    load_from_pickle("Training_Test_Sets/Regression/X_test_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Test targets: keep the binary label as its own Series (used for the class
# counts printed later), then flatten what is left into the 1-D regression target.
y_test = load_from_pickle("Training_Test_Sets/Regression/y_test")
y_test_binary = y_test.loc[:, "Activity_Binary"]
y_test = y_test.drop(columns=["Activity_Binary"]).to_numpy().flatten()

In [5]:
# Useful Information & Sanity Checks
# For each split: verify that features, regression targets and binary labels have
# the same number of rows, then print the feature-matrix shape, the number of
# targets and the binding (label 1) / non-binding (label 0) counts.
for split_name, X_split, y_split, y_split_binary in (
    ("train", X_train, y_train, y_train_binary),
    ("test", X_test, y_test, y_test_binary),
):
    # Sanity check: a mismatch here means the pickled X / y files are out of sync.
    assert X_split.shape[0] == y_split.shape[0] == y_split_binary.shape[0], (
        f"{split_name}: X has {X_split.shape[0]} rows, y has {y_split.shape[0]} values, "
        f"binary labels have {y_split_binary.shape[0]} values"
    )

    binding_count = y_split_binary[y_split_binary == 1].shape[0]
    non_binding_count = y_split_binary[y_split_binary == 0].shape[0]

    print(f"X_{split_name} shape: {X_split.shape}")
    print(
        f"y_{split_name} shape: {y_split.shape[0]} "
        f"(Binding Count: {binding_count}, Non-Binding Count: {non_binding_count})"
    )

X_train shape: (10956, 693)
y_train shape: 10956 (Binding Count: 3796, Non-Binding Count: 7160)
X_test shape: (102, 693)
y_test shape: 102 (Binding Count: 27, Non-Binding Count: 75)


# Model Training & Testing

In [6]:
def on_step(optim_result):
    """Progress callback for the hyper-parameter search: print the iteration just finished.

    The running count lives in the notebook-global ``index``. The training cells
    reset it with ``index = 1`` before calling ``model.fit(..., callback=on_step)``.
    If ``index`` was never initialised, counting starts at 1 instead of failing
    with ``NameError`` on the first callback.

    Parameters
    ----------
    optim_result
        Object passed in by the search after each iteration. It is not used.

    Returns
    -------
    None
    """
    global index
    # Fall back to 1 when no training cell has initialised the counter yet.
    index = globals().get("index", 1)
    print(f"Iteration Completed: {index}")
    index += 1

## Dummy Regressor (DR)

In [7]:
# Baseline pipeline: standardise the features, then a DummyRegressor left at its
# defaults (the displayed parameters show strategy 'mean').
dummy_regressor = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", DummyRegressor()),
    ]
)
# Last expression of the cell, so the notebook renders the parameter dict.
dummy_regressor.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', DummyRegressor())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyRegressor(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__quantile': None,
 'model__strategy': 'mean'}

### Training

In [8]:
# dummy_regressor.fit(X_train, y_train)
#
# y_train_pred = dummy_regressor.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model
# dump(dummy_regressor, 'Dataset_Files/Baseline_Models/Regression/dr.joblib')

In [9]:
# Load Model
# Restore the fitted dummy-regressor pipeline from disk; the commented-out
# training cell dumps it to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
dummy_regressor = load('Dataset_Files/Baseline_Models/Regression/dr.joblib')

In [10]:
# Bootstrap evaluation of the dummy regressor on the training data; the helper
# prints median negated MAE and R2 with 95% confidence intervals.
# The fourth argument (1000) is presumably the size of each bootstrapped sample:
# the test-set call passes 50 and its report says "samples of size 50".
get_confidence_intervals(dummy_regressor, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -2.10 with a 95% confidence interval of [-2.21,-2.00]
Median R2: -0.00 with a 95% confidence interval of [-0.01,0.00]


### Testing

In [11]:
# Bootstrap evaluation of the dummy regressor on the held-out test data; the
# helper prints median negated MAE and R2 with 95% confidence intervals.
# The fourth argument (50) appears in the printed header as the sample size.
get_confidence_intervals(dummy_regressor, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -1.99 with a 95% confidence interval of [-2.35,-1.68]
Median R2: -0.03 with a 95% confidence interval of [-0.31,-0.00]


## Linear Regression (LR)

In [12]:
# Ordinary least-squares baseline: standardise the features, then fit
# LinearRegression with n_jobs=-1.
linear_regression = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", LinearRegression(n_jobs=-1)),
    ]
)
# Last expression of the cell, so the notebook renders the parameter dict.
linear_regression.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LinearRegression(n_jobs=-1))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearRegression(n_jobs=-1),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__n_jobs': -1,
 'model__positive': False}

### Training

In [13]:
# linear_regression.fit(X_train, y_train)
#
# y_train_pred = linear_regression.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model
# dump(linear_regression, 'Dataset_Files/Baseline_Models/Regression/lr.joblib')

In [14]:
# Load Model
# Restore the fitted linear-regression pipeline from disk; the commented-out
# training cell dumps it to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
linear_regression = load('Dataset_Files/Baseline_Models/Regression/lr.joblib')

In [15]:
# Bootstrap evaluation of the linear-regression pipeline on the training data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(linear_regression, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -1.02 with a 95% confidence interval of [-1.10,-0.95]
Median R2: 0.64 with a 95% confidence interval of [0.58,0.69]


### Testing

In [16]:
# Bootstrap evaluation of the linear-regression pipeline on the held-out test
# data (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(linear_regression, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -3.71 with a 95% confidence interval of [-4.46,-2.96]
Median R2: -3.23 with a 95% confidence interval of [-7.21,-1.37]


## Linear Support Vector Regression (LSVR)

In [17]:
# Base pipeline for the LinearSVR search: standardise the features, then a
# LinearSVR with a fixed seed. Its hyper-parameters are tuned in the next cell.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", LinearSVR(random_state=42)),
    ]
)
# Last expression of the cell, so the notebook renders the tunable parameter names.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVR(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVR(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__dual': True,
 'model__epsilon': 0.0,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1.0,
 'model__loss': 'epsilon_insensitive',
 'model__max_iter': 1000,
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [18]:
# Search space for the LinearSVR step of `pipe`; epsilon and C are sampled on a
# log scale.
lsvr_search_spaces = {
    "model__epsilon": Real(1e-6, 1e+2, prior="log-uniform"),
    "model__C": Real(1e-6, 1e+2, prior="log-uniform"),
    "model__loss": Categorical(["epsilon_insensitive", "squared_epsilon_insensitive"]),
    "model__max_iter": Integer(500, 5000),
}

# Bayesian hyper-parameter search scored by 5-fold cross-validated R2.
# n_iter is not passed, so the library default number of iterations applies.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=lsvr_search_spaces,
    scoring="r2",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [19]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_lsvr = model.best_estimator_
#
# y_train_pred = optimised_lsvr.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_lsvr, 'Dataset_Files/Baseline_Models/Regression/optimised_lsvr.joblib')
# np.save("Dataset_Files/Baseline_Models/Regression/optimised_lsvr_cv_results.npy", model.cv_results_)

In [20]:
# Load Model
# Restore the tuned LinearSVR pipeline; the commented-out training cell dumps
# `model.best_estimator_` to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
optimised_lsvr = load('Dataset_Files/Baseline_Models/Regression/optimised_lsvr.joblib')

In [21]:
# Bootstrap evaluation of the tuned LinearSVR pipeline on the training data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_lsvr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -1.08 with a 95% confidence interval of [-1.16,-1.00]
Median R2: 0.61 with a 95% confidence interval of [0.56,0.66]


In [22]:
# Display the full parameter dict of the restored pipeline, i.e. the
# hyper-parameters the search settled on.
optimised_lsvr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   LinearSVR(C=0.001851285810775567, epsilon=1.7367384227116365e-06,
             loss='squared_epsilon_insensitive', max_iter=5000, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVR(C=0.001851285810775567, epsilon=1.7367384227116365e-06,
           loss='squared_epsilon_insensitive', max_iter=5000, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 0.001851285810775567,
 'model__dual': True,
 'model__epsilon': 1.7367384227116365e-06,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1.0,
 'model__loss': 'squared_epsilon_insensitive',
 'model__max_iter': 5000,
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [23]:
# Reload the search's cv_results_ (written with np.save by the commented-out
# training cell) and show the evaluated configurations, best rank first.
# NOTE(review): allow_pickle=True unpickles the stored object - only read files
# from a trusted source.
lsvr_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Regression/optimised_lsvr_cv_results.npy",
    allow_pickle=True,
).tolist()
lsvr_grid_search_dataframe = pd.DataFrame(lsvr_cv_results).sort_values(by="rank_test_score")
lsvr_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__epsilon,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
47,2.047634,0.035547,0.023728,0.005731,0.001851,2e-06,squared_epsilon_insensitive,5000,"{'model__C': 0.001851285810775567, 'model__eps...",0.569444,0.555236,0.538904,0.541235,0.514056,0.543775,0.018459,1
30,2.933962,0.044377,0.022287,0.005709,0.003441,1e-06,squared_epsilon_insensitive,5000,"{'model__C': 0.003441089024486077, 'model__eps...",0.565942,0.557069,0.540705,0.540555,0.511874,0.543229,0.018461,2
33,1.780096,0.027043,0.024991,0.005911,0.001642,0.045062,squared_epsilon_insensitive,500,"{'model__C': 0.0016418132108535026, 'model__ep...",0.56916,0.553814,0.537688,0.540525,0.514422,0.543122,0.018178,3
49,3.408788,0.052016,0.028323,0.005199,0.00388,0.00682,squared_epsilon_insensitive,5000,"{'model__C': 0.003880449566530708, 'model__eps...",0.564935,0.557116,0.540788,0.540091,0.511363,0.542859,0.018409,4
27,4.213115,0.062348,0.030693,0.00636,0.003815,0.072315,squared_epsilon_insensitive,500,"{'model__C': 0.0038154220937046883, 'model__ep...",0.564574,0.556384,0.539825,0.53957,0.511745,0.542419,0.01812,5
41,4.505347,0.05274,0.021083,0.003512,0.004401,0.039398,squared_epsilon_insensitive,5000,"{'model__C': 0.004401485170681522, 'model__eps...",0.563658,0.55687,0.540478,0.539345,0.510952,0.542261,0.018238,6
35,3.965747,0.051097,0.021227,0.004543,0.004596,0.025702,squared_epsilon_insensitive,500,"{'model__C': 0.004595938550345414, 'model__eps...",0.563365,0.557008,0.540705,0.539271,0.510673,0.542204,0.018287,7
24,2.638353,0.043957,0.02382,0.00546,0.002613,0.179559,squared_epsilon_insensitive,500,"{'model__C': 0.0026132614577066555, 'model__ep...",0.56535,0.553418,0.537095,0.539002,0.51338,0.541649,0.017476,8
25,1.303724,0.033677,0.02351,0.004089,0.001259,0.097121,squared_epsilon_insensitive,5000,"{'model__C': 0.001259433299719621, 'model__eps...",0.568249,0.550509,0.534973,0.53848,0.514402,0.541323,0.017791,9
45,5.64371,0.055622,0.022717,0.004778,0.006337,3e-06,squared_epsilon_insensitive,5000,"{'model__C': 0.006337393415600314, 'model__eps...",0.560258,0.556981,0.541142,0.53778,0.50891,0.541014,0.018256,10


### Testing

In [24]:
# Bootstrap evaluation of the tuned LinearSVR pipeline on the held-out test data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_lsvr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -2.34 with a 95% confidence interval of [-2.92,-1.88]
Median R2: -0.75 with a 95% confidence interval of [-2.19,-0.04]


## K-Nearest Neighbors Regressor (KNNR)

In [25]:
# Base pipeline for the k-nearest-neighbours search: standardise the features,
# then a KNeighborsRegressor. Its hyper-parameters are tuned in the next cell.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", KNeighborsRegressor()),
    ]
)
# Last expression of the cell, so the notebook renders the tunable parameter names.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsRegressor())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsRegressor(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [26]:
# Search space for the KNeighborsRegressor step of `pipe`.
# NOTE(review): the saved cv results list the same configuration
# (auto / 7 neighbours / distance) many times, so a sizeable share of the
# iterations re-evaluated an already known point.
knnr_search_spaces = {
    "model__n_neighbors": Integer(4, 20),
    "model__weights": Categorical(["uniform", "distance"]),
    "model__algorithm": Categorical(["auto", "ball_tree", "kd_tree", "brute"]),
}

# Bayesian hyper-parameter search scored by 5-fold cross-validated R2.
# n_iter is not passed, so the library default number of iterations applies.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=knnr_search_spaces,
    scoring="r2",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [27]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_knnr = model.best_estimator_
#
# y_train_pred = optimised_knnr.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_knnr, 'Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib')
# np.save("Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy", model.cv_results_)

In [28]:
# Load Model
# Restore the tuned k-nearest-neighbours pipeline; the commented-out training
# cell dumps `model.best_estimator_` to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
optimised_knnr = load('Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib')

In [29]:
# Training-set bootstrap report for the tuned KNN pipeline. A previously saved
# text report is printed when present; otherwise the evaluation is run.
# NOTE(review): nothing in this cell writes the .txt file - unless
# get_confidence_intervals does so itself, the fallback is recomputed on every run.
knnr_train_metrics_path = "Dataset_Files/Baseline_Models/Regression/optimised_knnr_train_metrics.txt"
if not os.path.exists(knnr_train_metrics_path):
    get_confidence_intervals(optimised_knnr, X_train, y_train, 1000, "Regression", print_iterator=True)
else:
    with open(knnr_train_metrics_path, "r") as file:
        print(file.read())

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.00 with a 95% confidence interval of [-0.00,-0.00]
Median R2: 1.00 with a 95% confidence interval of [1.00,1.00]


In [30]:
# Display the full parameter dict of the restored pipeline, i.e. the
# hyper-parameters the search settled on.
optimised_knnr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', KNeighborsRegressor(n_neighbors=7, weights='distance'))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsRegressor(n_neighbors=7, weights='distance'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 7,
 'model__p': 2,
 'model__weights': 'distance'}

In [31]:
# Reload the search's cv_results_ (written with np.save by the commented-out
# training cell) and show the evaluated configurations, best rank first.
# NOTE(review): allow_pickle=True unpickles the stored object - only read files
# from a trusted source.
knnr_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy",
    allow_pickle=True,
).tolist()
knnr_grid_search_dataframe = pd.DataFrame(knnr_cv_results).sort_values(by="rank_test_score")
knnr_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__algorithm,param_model__n_neighbors,param_model__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,0.320383,0.021621,0.86708,0.031814,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
47,0.299923,0.015475,0.987735,0.024667,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
43,0.347656,0.025942,0.907,0.026406,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
41,0.365783,0.009927,0.939825,0.008885,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
40,0.355596,0.011428,0.969432,0.029571,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
39,0.339694,0.007711,0.987721,0.026268,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
35,0.290706,0.024584,0.903966,0.016924,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
33,0.302821,0.032408,0.953549,0.019792,auto,7,distance,"{'model__algorithm': 'auto', 'model__n_neighbo...",0.612459,0.572485,0.579084,0.555146,0.55376,0.574587,0.02131,1
20,0.87929,0.034991,33.867321,0.466688,kd_tree,7,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.612443,0.572485,0.579047,0.555191,0.55376,0.574585,0.021295,9
30,0.917832,0.062593,35.871758,0.20249,kd_tree,7,distance,"{'model__algorithm': 'kd_tree', 'model__n_neig...",0.612443,0.572485,0.579047,0.555191,0.55376,0.574585,0.021295,9


### Testing

In [32]:
# Test-set bootstrap report for the tuned KNN pipeline. A previously saved text
# report is printed when present; otherwise the evaluation is run.
# NOTE(review): nothing in this cell writes the .txt file - unless
# get_confidence_intervals does so itself, the fallback is recomputed on every run.
knnr_test_metrics_path = "Dataset_Files/Baseline_Models/Regression/optimised_knnr_test_metrics.txt"
if not os.path.exists(knnr_test_metrics_path):
    get_confidence_intervals(optimised_knnr, X_test, y_test, 50, "Regression", print_iterator=True)
else:
    with open(knnr_test_metrics_path, "r") as file:
        print(file.read())

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -1.53 with a 95% confidence interval of [-2.11,-1.05]
Median R2: -0.18 with a 95% confidence interval of [-0.49,0.20]


## Decision Tree Regressor (DTR)

In [33]:
# Base pipeline for the decision-tree search: standardise the features, then a
# DecisionTreeRegressor with a fixed seed. Tuned in the next cell.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", DecisionTreeRegressor(random_state=0)),
    ]
)
# Last expression of the cell, so the notebook renders the tunable parameter names.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeRegressor(random_state=0))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeRegressor(random_state=0),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 0,
 'model__splitter': 'best'}

In [34]:
# Search space for the DecisionTreeRegressor step of `pipe`.
# NOTE(review): this space has only 3 * 2 * 3 = 18 distinct combinations, while
# the saved cv results contain rows indexed up to 49 and repeat the same
# configurations. None of the searched parameters limits tree size
# (max_depth / min_samples_leaf stay at their defaults); the selected tree reaches
# R2 = 1.00 on training data against ~0.33 in cross-validation. Consider adding
# size-limiting dimensions before re-running the search.
dtr_search_spaces = {
    "model__criterion": Categorical(["squared_error", "friedman_mse", "absolute_error"]),
    "model__splitter": Categorical(["best", "random"]),
    "model__max_features": Categorical([None, "sqrt", "log2"]),
}

# Bayesian hyper-parameter search scored by 5-fold cross-validated R2.
# n_iter is not passed, so the library default number of iterations applies.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dtr_search_spaces,
    scoring="r2",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [35]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_dtr = model.best_estimator_
#
# y_train_pred = optimised_dtr.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_dtr, 'Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib')
# np.save("Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy", model.cv_results_)

In [36]:
# Load Model
# Restore the tuned decision-tree pipeline; the commented-out training cell
# dumps `model.best_estimator_` to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
optimised_dtr = load('Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib')

In [37]:
# Bootstrap evaluation of the tuned decision-tree pipeline on the training data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_dtr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.00 with a 95% confidence interval of [-0.00,0.00]
Median R2: 1.00 with a 95% confidence interval of [1.00,1.00]


In [38]:
# Display the full parameter dict of the restored pipeline, i.e. the
# hyper-parameters the search settled on.
optimised_dtr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeRegressor(criterion='friedman_mse', random_state=0))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeRegressor(criterion='friedman_mse', random_state=0),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'friedman_mse',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 0,
 'model__splitter': 'best'}

In [39]:
# Reload the search's cv_results_ (written with np.save by the commented-out
# training cell) and show the evaluated configurations, best rank first.
# NOTE(review): allow_pickle=True unpickles the stored object - only read files
# from a trusted source.
dtr_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy",
    allow_pickle=True,
).tolist()
dtr_grid_search_dataframe = pd.DataFrame(dtr_cv_results).sort_values(by="rank_test_score")
dtr_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_features,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,6.529446,0.125338,0.012341,0.001286,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
31,6.327717,0.077765,0.012804,0.002484,squared_error,,best,"{'model__criterion': 'squared_error', 'model__...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
30,6.011005,0.131041,0.013201,0.002486,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
28,6.147258,0.074005,0.014719,0.002345,squared_error,,best,"{'model__criterion': 'squared_error', 'model__...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
35,6.070777,0.131383,0.014197,0.003121,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
48,5.967787,0.091202,0.017413,0.004867,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
39,6.330829,0.141226,0.013504,0.002862,squared_error,,best,"{'model__criterion': 'squared_error', 'model__...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
20,6.752951,0.104479,0.013069,0.001747,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
40,5.642217,0.149316,0.012978,0.001959,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1
41,6.155995,0.260092,0.011953,0.001889,friedman_mse,,best,"{'model__criterion': 'friedman_mse', 'model__m...",0.31374,0.293883,0.401373,0.300528,0.34195,0.330295,0.039176,1


### Testing

In [40]:
# Bootstrap evaluation of the tuned decision-tree pipeline on the held-out test
# data (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_dtr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -3.11 with a 95% confidence interval of [-3.99,-2.36]
Median R2: -2.28 with a 95% confidence interval of [-6.37,-0.90]


## Random Forest Regressor (RFR)

In [41]:
# Base pipeline for the random-forest search: standardise the features, then a
# RandomForestRegressor with a fixed seed. Tuned in the next cell.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)
# Last expression of the cell, so the notebook renders the tunable parameter names.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestRegressor(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestRegressor(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__maxBins': 256,
 'model__max_depth': None,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__minBinSize': 1,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [42]:
# Search space for the RandomForestRegressor step of `pipe`.
# NOTE(review): the top rows of the saved cv results all show one identical
# configuration, each with a mean fit time of roughly 180-240 s, so repeated
# evaluations of a known point are expensive for this model.
rfr_search_spaces = {
    "model__n_estimators": Integer(10, 100),
    "model__criterion": Categorical(["squared_error", "absolute_error"]),
    "model__max_features": Categorical([None, "sqrt", "log2"]),
}

# Bayesian hyper-parameter search scored by 5-fold cross-validated R2.
# n_iter is not passed, so the library default number of iterations applies.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=rfr_search_spaces,
    scoring="r2",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [43]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_rfr = model.best_estimator_
#
# y_train_pred = optimised_rfr.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_rfr, 'Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib')
# np.save("Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy", model.cv_results_)

In [44]:
# Load Model
# Restore the tuned random-forest pipeline; the commented-out training cell
# dumps `model.best_estimator_` to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
optimised_rfr = load('Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib')

In [45]:
# Bootstrap evaluation of the tuned random-forest pipeline on the training data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_rfr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.33 with a 95% confidence interval of [-0.36,-0.30]
Median R2: 0.95 with a 95% confidence interval of [0.94,0.96]


In [46]:
# Display the full parameter dict of the restored pipeline, i.e. the
# hyper-parameters the search settled on.
optimised_rfr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestRegressor(max_features=None, random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestRegressor(max_features=None, random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [47]:
# Reload the search's cv_results_ (written with np.save by the commented-out
# training cell) and show the evaluated configurations, best rank first.
# NOTE(review): allow_pickle=True unpickles the stored object - only read files
# from a trusted source.
rfr_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy",
    allow_pickle=True,
).tolist()
rfr_grid_search_dataframe = pd.DataFrame(rfr_cv_results).sort_values(by="rank_test_score")
rfr_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_features,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,181.662234,2.022586,0.034746,0.000421,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
48,181.35827,1.502864,0.034704,0.000302,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
31,239.278484,2.37042,0.054824,0.021809,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
33,239.610449,2.411718,0.042771,0.005187,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
35,238.164795,1.830128,0.044101,0.009625,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
16,215.20983,1.502497,0.039862,0.002446,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
13,213.660321,1.639433,0.051973,0.029282,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
40,239.577166,3.449644,0.039574,0.000406,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
41,239.814864,2.140951,0.041372,0.002512,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1
39,238.539791,2.81393,0.044449,0.006467,squared_error,,100,"{'model__criterion': 'squared_error', 'model__...",0.663731,0.641327,0.643616,0.64131,0.625677,0.643132,0.012131,1


### Testing

In [48]:
# Bootstrap evaluation of the tuned random-forest pipeline on the held-out test
# data (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_rfr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -2.50 with a 95% confidence interval of [-2.83,-2.18]
Median R2: -0.46 with a 95% confidence interval of [-1.73,-0.01]


## Stochastic Gradient Descent Regressor (SGDR)

In [49]:
# Base pipeline for the SGD search: standardise the features, then an
# SGDRegressor with a fixed seed. Tuned in the next cell.
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("model", SGDRegressor(random_state=42)),
    ]
)
# Last expression of the cell, so the notebook renders the tunable parameter names.
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDRegressor(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDRegressor(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.01,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'invscaling',
 'model__loss': 'squared_error',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__penalty': 'l2',
 'model__power_t': 0.25,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [50]:
# Search space for the SGDRegressor step of `pipe`; alpha is sampled on a log scale.
sgdr_search_spaces = {
    "model__loss": Categorical(
        ["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]
    ),
    "model__penalty": Categorical(["l2", "l1", "elasticnet"]),
    "model__alpha": Real(1e-6, 1e-1, prior="log-uniform"),
    "model__learning_rate": Categorical(["constant", "optimal", "invscaling", "adaptive"]),
}

# Bayesian hyper-parameter search scored by 5-fold cross-validated R2.
# n_iter is not passed, so the library default number of iterations applies.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=sgdr_search_spaces,
    scoring="r2",
    cv=5,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    random_state=42,
)

### Training

In [51]:
# index = 1
# model.fit(X_train, y_train, callback=on_step)
#
# optimised_sgdr = model.best_estimator_
#
# y_train_pred = optimised_sgdr.predict(X_train)
# calculate_metrics_regression(y_train, y_train_pred)
#
# # Save Model & CV Results
# dump(optimised_sgdr, 'Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib')
# np.save("Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy", model.cv_results_)

In [52]:
# Load Model
# Restore the tuned SGD-regressor pipeline; the commented-out training cell
# dumps `model.best_estimator_` to this same path.
# NOTE(review): `load` comes from the wildcard import and presumably is
# joblib.load, which unpickles the file - only load artefacts from a trusted source.
optimised_sgdr = load('Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib')

In [53]:
# Bootstrap evaluation of the tuned SGD-regressor pipeline on the training data
# (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_sgdr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.88 with a 95% confidence interval of [-0.98,-0.78]
Median R2: 0.54 with a 95% confidence interval of [0.46,0.60]


In [54]:
# Display the full parameter dict of the restored pipeline, i.e. the
# hyper-parameters the search settled on.
optimised_sgdr.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   SGDRegressor(alpha=0.0046788604247112444, learning_rate='optimal',
                loss='epsilon_insensitive', penalty='elasticnet', random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDRegressor(alpha=0.0046788604247112444, learning_rate='optimal',
              loss='epsilon_insensitive', penalty='elasticnet', random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0046788604247112444,
 'model__average': False,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.01,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'optimal',
 'model__loss': 'epsilon_insensitive',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__penalty': 'elasticnet',
 'model__power_t': 0.25,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [55]:
# Reload the search's cv_results_ (written with np.save by the commented-out
# training cell) and show the evaluated configurations, best rank first.
# NOTE(review): allow_pickle=True unpickles the stored object - only read files
# from a trusted source.
sgdr_cv_results = np.load(
    "Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy",
    allow_pickle=True,
).tolist()
sgdr_grid_search_dataframe = pd.DataFrame(sgdr_cv_results).sort_values(by="rank_test_score")
sgdr_grid_search_dataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__learning_rate,param_model__loss,param_model__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,7.988788,0.873798,0.019589,0.004331,0.004679,optimal,epsilon_insensitive,elasticnet,"{'model__alpha': 0.0046788604247112444, 'model...",0.4929208,0.4833207,0.4836285,0.4816389,0.4435113,0.4770041,0.0172068,1
7,1.091688,0.051659,0.021595,0.003604,0.000521,optimal,huber,l2,"{'model__alpha': 0.0005212131190318165, 'model...",0.4691575,0.4812977,0.4947266,0.4676726,0.433978,0.4693665,0.02020271,2
10,0.53128,0.017666,0.024101,0.001111,0.005212,optimal,huber,l2,"{'model__alpha': 0.005212004771028612, 'model_...",0.4788611,0.4714036,0.4595471,0.4657803,0.4219899,0.4595164,0.01981412,3
6,0.602592,0.022102,0.026664,0.002197,0.001217,invscaling,huber,l2,"{'model__alpha': 0.0012172976749510152, 'model...",0.4842929,0.4674415,0.4602012,0.4661645,0.4163706,0.4588942,0.02271993,4
11,0.510977,0.004297,0.032802,0.001941,0.005657,optimal,huber,l2,"{'model__alpha': 0.005657063629115877, 'model_...",0.4768305,0.4662248,0.4564094,0.4642834,0.4171835,0.4561863,0.02056105,5
12,0.563295,0.018415,0.038264,0.001573,0.005945,optimal,huber,l2,"{'model__alpha': 0.005944698214965892, 'model_...",0.4756105,0.4654055,0.4556711,0.462194,0.4154837,0.4548729,0.02072068,6
13,0.586123,0.013636,0.030581,0.004125,0.006148,optimal,huber,l2,"{'model__alpha': 0.0061483292235691815, 'model...",0.4751354,0.4644367,0.4545819,0.4616289,0.4139672,0.45395,0.02105603,7
14,0.596204,0.011068,0.022741,0.001873,0.006302,optimal,huber,l2,"{'model__alpha': 0.0063022291343780516, 'model...",0.4741176,0.4635313,0.4543972,0.4604908,0.4137593,0.4532592,0.02075777,8
15,0.519688,0.017809,0.018598,0.002156,0.006403,optimal,huber,l2,"{'model__alpha': 0.006402514059565198, 'model_...",0.4739197,0.4634077,0.4531348,0.4602791,0.4132703,0.4528023,0.02086725,9
16,0.497133,0.00816,0.027202,0.00172,0.006475,optimal,huber,l2,"{'model__alpha': 0.006474710761227967, 'model_...",0.4733808,0.4632228,0.4533969,0.4598861,0.4126287,0.452503,0.02095741,10


### Testing

In [56]:
# Bootstrap evaluation of the tuned SGD-regressor pipeline on the held-out test
# data (prints median negated MAE and R2 with 95% confidence intervals).
get_confidence_intervals(optimised_sgdr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -2.50 with a 95% confidence interval of [-3.13,-1.95]
Median R2: -1.12 with a 95% confidence interval of [-2.85,-0.17]
