# Imports

In [1]:
# General Imports
import os

from models_utils import *

# Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

# Training & Test Sets


In [2]:
# Column labels of the selected features: every column from "MolecularWeight"
# through to the last column of the training frame.
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [3]:
# Training features: remove the identifier columns, then convert to an ndarray.
X_train = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Training targets: keep the binary label aside (used for the class counts
# printed later) and flatten the remaining target column(s) to 1-D.
y_train = load_from_pickle("Training_Test_Sets/Regression/y_train")
y_train_binary = y_train.loc[:, "Activity_Binary"]
y_train = y_train.drop(columns=["Activity_Binary"]).to_numpy().flatten()

In [4]:
# Test features: remove the identifier columns, then convert to an ndarray.
X_test = (
    load_from_pickle("Training_Test_Sets/Regression/X_test_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Test targets: keep the binary label aside (used for the class counts
# printed later) and flatten the remaining target column(s) to 1-D.
y_test = load_from_pickle("Training_Test_Sets/Regression/y_test")
y_test_binary = y_test.loc[:, "Activity_Binary"]
y_test = y_test.drop(columns=["Activity_Binary"]).to_numpy().flatten()

In [5]:
# Useful Information & Sanity Checks
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape[0]} ", end="")
print(f"(Binding Count: {y_train_binary[y_train_binary == 1].shape[0]}, ", end="")
print(f"Non-Binding Count: {y_train_binary[y_train_binary == 0].shape[0]})")

print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape[0]} ", end="")
print(f"(Binding Count: {y_test_binary[y_test_binary == 1].shape[0]}, ", end="")
print(f"Non-Binding Count: {y_test_binary[y_test_binary == 0].shape[0]})")

X_train shape: (10956, 693)
y_train shape: 10956 (Binding Count: 3796, Non-Binding Count: 7160)
X_test shape: (102, 693)
y_test shape: 102 (Binding Count: 27, Non-Binding Count: 75)


# Model Training & Testing

In [6]:
def on_step(optim_result):
    """Progress callback for ``model.fit(..., callback=on_step)``.

    Prints ``Iteration Completed: <n>`` and advances the module-level counter
    ``index``. Callers may still set ``index = 1`` before a search to restart
    the numbering; if ``index`` has never been defined the count starts at 1
    instead of raising ``NameError``.

    Parameters
    ----------
    optim_result
        Object handed over by the search after each step. It is not inspected.

    Returns
    -------
    None
    """
    global index
    # Fall back to 1 when no cell has initialised the counter yet.
    current = globals().get("index", 1)
    print(f"Iteration Completed: {current}")
    index = current + 1

## Dummy Regressor (DR)

In [35]:
# Baseline pipeline: standardise the features, then a DummyRegressor.
dummy_regressor = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DummyRegressor()),
])
dummy_regressor.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', DummyRegressor())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyRegressor(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__quantile': None,
 'model__strategy': 'mean'}

### Training

In [36]:
# Fit the baseline only when no saved copy exists, so a fresh
# "Restart & Run All" can rebuild the artefact instead of relying on a
# training step that was commented out. Delete the file to force a refit.
dr_model_path = 'Dataset_Files/Baseline_Models/Regression/dr.joblib'

if os.path.exists(dr_model_path):
    dummy_regressor = load(dr_model_path)
else:
    dummy_regressor.fit(X_train, y_train)

    # Save Model
    os.makedirs(os.path.dirname(dr_model_path), exist_ok=True)
    dump(dummy_regressor, dr_model_path)

# Training-set metrics for the model that is now bound.
y_train_pred = dummy_regressor.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

R2 Score: 0.0
Negated Mean Absolute Error: -2.103515625


['Dataset_Files/Baseline_Models/Regression/dr.joblib']

In [37]:
# Load Model: rebind dummy_regressor to the pipeline persisted in dr.joblib,
# so the evaluation cells that follow use the saved artefact.
dummy_regressor = load('Dataset_Files/Baseline_Models/Regression/dr.joblib')

In [38]:
# Bootstrapped metrics on the training set. The 4th argument (1000) is
# presumably the size of each resample -- the printed header reports
# "samples of size 1000".
get_confidence_intervals(dummy_regressor, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -2.10 with a 95% confidence interval of [-2.21,-2.01]
Median R2: -0.00 with a 95% confidence interval of [-0.01,-0.00]


### Testing

In [39]:
# Bootstrapped metrics on the held-out test set. The 4th argument (50) is
# presumably the size of each resample -- the printed header reports
# "samples of size 50".
get_confidence_intervals(dummy_regressor, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -1.98 with a 95% confidence interval of [-2.34,-1.69]
Median R2: -0.03 with a 95% confidence interval of [-0.33,-0.00]


## Linear Regression (LR)

In [40]:
# Ordinary least-squares pipeline: standardise the features, then
# LinearRegression with n_jobs=-1.
linear_regression = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LinearRegression(n_jobs=-1)),
])
linear_regression.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LinearRegression(n_jobs=-1))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearRegression(n_jobs=-1),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__n_jobs': -1,
 'model__normalize': 'deprecated',
 'model__positive': False}

### Training

In [41]:
# Fit the linear model only when no saved copy exists, so a fresh
# "Restart & Run All" can rebuild the artefact instead of relying on a
# training step that was commented out. Delete the file to force a refit.
lr_model_path = 'Dataset_Files/Baseline_Models/Regression/lr.joblib'

if os.path.exists(lr_model_path):
    linear_regression = load(lr_model_path)
else:
    linear_regression.fit(X_train, y_train)

    # Save Model
    os.makedirs(os.path.dirname(lr_model_path), exist_ok=True)
    dump(linear_regression, lr_model_path)

# Training-set metrics for the model that is now bound.
y_train_pred = linear_regression.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

R2 Score: 0.6277162841191646
Negated Mean Absolute Error: -1.0437039136886597


['Dataset_Files/Baseline_Models/Regression/lr.joblib']

In [42]:
# Load Model: rebind linear_regression to the pipeline persisted in lr.joblib,
# so the evaluation cells that follow use the saved artefact.
linear_regression = load('Dataset_Files/Baseline_Models/Regression/lr.joblib')

In [43]:
# Bootstrapped metrics on the training set (4th argument presumably the
# resample size, matching the "samples of size 1000" header it prints).
get_confidence_intervals(linear_regression, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -1.04 with a 95% confidence interval of [-1.12,-0.97]
Median R2: 0.63 with a 95% confidence interval of [0.57,0.68]


### Testing

In [44]:
# Bootstrapped metrics on the held-out test set (4th argument presumably the
# resample size, matching the "samples of size 50" header it prints).
get_confidence_intervals(linear_regression, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -3.63 with a 95% confidence interval of [-4.45,-2.85]
Median R2: -3.17 with a 95% confidence interval of [-7.08,-1.36]


## Linear Support Vector Regression (LSVR)

In [45]:
# LSVR pipeline used as the base estimator of the search below:
# standardise the features, then a seeded LinearSVR.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LinearSVR(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', LinearSVR(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LinearSVR(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__dual': True,
 'model__epsilon': 0.0,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1.0,
 'model__loss': 'epsilon_insensitive',
 'model__max_iter': 1000,
 'model__random_state': 42,
 'model__tol': 0.0001,
 'model__verbose': 0}

In [46]:
# Bayesian hyper-parameter search over the LSVR pipeline, scored by R2 with
# cv=10. The ``model__`` prefix routes each parameter to the 'model' step.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__epsilon=Real(1e-6, 1e+2, prior='log-uniform'),
        model__C=Real(1e-6, 1e+2, prior='log-uniform'),
        model__loss=Categorical(['epsilon_insensitive', 'squared_epsilon_insensitive']),
        model__max_iter=Integer(500, 5000),
    ),
    scoring='r2',
    cv=10,
    error_score=np.nan,  # a failing fit is scored NaN rather than raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [47]:
# Run the (expensive) Bayesian search only when its artefacts are missing, so
# a fresh "Restart & Run All" can rebuild them instead of relying on a
# training step that was commented out. Delete the files to force a new
# search (e.g. after changing the search space).
lsvr_model_path = 'Dataset_Files/Baseline_Models/Regression/optimised_lsvr.joblib'
lsvr_cv_results_path = "Dataset_Files/Baseline_Models/Regression/optimised_lsvr_cv_results.npy"

if os.path.exists(lsvr_model_path) and os.path.exists(lsvr_cv_results_path):
    optimised_lsvr = load(lsvr_model_path)
else:
    index = 1  # iteration counter printed by on_step
    model.fit(X_train, y_train, callback=on_step)
    optimised_lsvr = model.best_estimator_

    # Save Model & CV Results
    os.makedirs(os.path.dirname(lsvr_model_path), exist_ok=True)
    dump(optimised_lsvr, lsvr_model_path)
    np.save(lsvr_cv_results_path, model.cv_results_)

# Training-set metrics for the model that is now bound.
y_train_pred = optimised_lsvr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33
Iteration Completed: 34
Iteration Completed: 35
Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42
I

In [52]:
# Load Model: bind optimised_lsvr to the pipeline persisted in
# optimised_lsvr.joblib, so the evaluation cells use the saved artefact.
optimised_lsvr = load('Dataset_Files/Baseline_Models/Regression/optimised_lsvr.joblib')

In [54]:
# Bootstrapped metrics on the training set (4th argument presumably the
# resample size, matching the "samples of size 1000" header it prints).
get_confidence_intervals(optimised_lsvr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -1.07 with a 95% confidence interval of [-1.14,-0.99]
Median R2: 0.62 with a 95% confidence interval of [0.57,0.67]


In [None]:
# Display the parameters of the selected LSVR pipeline.
optimised_lsvr.get_params()

In [55]:
# Rebuild the search history from the saved ``cv_results_`` dict and order it
# best-first. The .npy file holds a pickled Python object, hence
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code, so
# only load artefacts produced by this project.
lsvr_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Regression/optimised_lsvr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")

# Show only the ten best-ranked configurations rather than the whole table;
# the full history stays available in the variable.
lsvr_grid_search_dataframe.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__epsilon,param_model__loss,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
40,5.733414,0.102186,0.017663,0.005012,0.002885,4.9e-05,squared_epsilon_insensitive,5000,"{'model__C': 0.002885405558519924, 'model__eps...",0.614209,0.546284,0.573572,0.53513,0.575398,0.523122,0.571672,0.524606,0.53277,0.493583,0.549035,0.032963,1
41,4.239263,0.063931,0.018279,0.003125,0.002044,2.1e-05,squared_epsilon_insensitive,5000,"{'model__C': 0.002043966407929869, 'model__eps...",0.613314,0.549749,0.572533,0.533623,0.57304,0.522517,0.57169,0.524677,0.533332,0.494524,0.5489,0.032444,2
49,9.245845,0.089831,0.015926,0.003097,0.004974,0.080372,squared_epsilon_insensitive,5000,"{'model__C': 0.004974197388548909, 'model__eps...",0.613259,0.539382,0.573764,0.535391,0.577069,0.522452,0.568998,0.522868,0.53122,0.492682,0.547708,0.033267,3
30,10.402666,0.138166,0.018531,0.005813,0.006618,2.6e-05,squared_epsilon_insensitive,500,"{'model__C': 0.006617515391390669, 'model__eps...",0.614161,0.535229,0.573827,0.537223,0.578651,0.523456,0.567093,0.523218,0.530873,0.490199,0.547393,0.033874,4
42,10.961124,0.11142,0.017702,0.002064,0.006664,0.000164,squared_epsilon_insensitive,5000,"{'model__C': 0.006663895374404001, 'model__eps...",0.614152,0.535126,0.573811,0.537261,0.578669,0.523454,0.567022,0.523201,0.530841,0.49018,0.547372,0.033878,5
38,15.68717,0.189066,0.020932,0.004362,0.009809,4e-06,squared_epsilon_insensitive,5000,"{'model__C': 0.009808628650870165, 'model__eps...",0.613329,0.529397,0.572999,0.538012,0.579356,0.523009,0.563026,0.522319,0.529999,0.4879,0.545935,0.034242,6
15,20.896913,0.351137,0.020249,0.006604,0.012549,1.1e-05,squared_epsilon_insensitive,4257,"{'model__C': 0.012549284613691784, 'model__eps...",0.612609,0.525778,0.572172,0.538507,0.579609,0.522514,0.560044,0.521751,0.529506,0.48611,0.54486,0.034495,7
47,29.709077,0.324466,0.014729,0.003889,0.019401,0.000793,squared_epsilon_insensitive,5000,"{'model__C': 0.019401398754580638, 'model__eps...",0.610956,0.519557,0.570217,0.539432,0.579704,0.521245,0.554311,0.52078,0.528656,0.482258,0.542712,0.035011,8
33,37.690242,0.267864,0.015699,0.00427,0.024915,0.000103,squared_epsilon_insensitive,5000,"{'model__C': 0.0249149079277501, 'model__epsil...",0.609884,0.51614,0.568807,0.53997,0.579608,0.520367,0.550916,0.520264,0.528211,0.479571,0.541374,0.03538,9
43,42.633819,0.47405,0.014206,0.003699,0.027768,2e-06,squared_epsilon_insensitive,5000,"{'model__C': 0.02776832535627547, 'model__epsi...",0.609396,0.514701,0.568142,0.540192,0.579518,0.519952,0.54945,0.520015,0.527975,0.478339,0.540768,0.035554,10


### Testing

In [60]:
# Bootstrapped metrics on the held-out test set (4th argument presumably the
# resample size, matching the "samples of size 50" header it prints).
get_confidence_intervals(optimised_lsvr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -2.44 with a 95% confidence interval of [-3.01,-1.92]
Median R2: -0.94 with a 95% confidence interval of [-2.82,-0.13]


## K-Nearest Neighbors Regressor (KNNR)

In [61]:
# KNN pipeline used as the base estimator of the search below:
# standardise the features, then a KNeighborsRegressor with defaults.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', KNeighborsRegressor()),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsRegressor())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsRegressor(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [62]:
# Bayesian hyper-parameter search over the KNN pipeline, scored by R2 with
# cv=10. The ``model__`` prefix routes each parameter to the 'model' step.
# NOTE(review): 'algorithm' presumably only changes how neighbours are looked
# up, not which ones are found -- confirm whether it is worth searching over.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__n_neighbors=Integer(4, 20),
        model__weights=Categorical(['uniform', 'distance']),
        model__algorithm=Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
    ),
    scoring='r2',
    cv=10,
    error_score=np.nan,  # a failing fit is scored NaN rather than raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [63]:
# Run the (expensive) Bayesian search only when its artefacts are missing, so
# a fresh "Restart & Run All" can rebuild them instead of relying on a
# training step that was commented out. Delete the files to force a new
# search (e.g. after changing the search space).
knnr_model_path = 'Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib'
knnr_cv_results_path = "Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy"

if os.path.exists(knnr_model_path) and os.path.exists(knnr_cv_results_path):
    optimised_knnr = load(knnr_model_path)
else:
    index = 1  # iteration counter printed by on_step
    model.fit(X_train, y_train, callback=on_step)
    optimised_knnr = model.best_estimator_

    # Save Model & CV Results
    os.makedirs(os.path.dirname(knnr_model_path), exist_ok=True)
    dump(optimised_knnr, knnr_model_path)
    np.save(knnr_cv_results_path, model.cv_results_)

# Training-set metrics for the model that is now bound.
y_train_pred = optimised_knnr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30




Iteration Completed: 31




Iteration Completed: 32
Iteration Completed: 33




Iteration Completed: 34
Iteration Completed: 35




Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39




Iteration Completed: 40




Iteration Completed: 41




Iteration Completed: 42
Iteration Completed: 43




Iteration Completed: 44
Iteration Completed: 45
Iteration Completed: 46




Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
R2 Score: 1.0
Negated Mean Absolute Error: -0.0


In [64]:
# Load Model: bind optimised_knnr to the pipeline persisted in
# optimised_knnr.joblib, so the evaluation cells use the saved artefact.
optimised_knnr = load('Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib')

In [65]:
# Point metrics on the full training set for the KNN pipeline
# (no resampling, unlike get_confidence_intervals).
y_train_pred = optimised_knnr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

R2 Score: 1.0
Negated Mean Absolute Error: -0.0


In [70]:
# Print the stored bootstrap report when one exists; otherwise compute the
# confidence intervals now.
knnr_train_metrics_file = "Dataset_Files/Baseline_Models/Regression/optimised_knnr_train_metrics.txt"
if not os.path.exists(knnr_train_metrics_file):
    get_confidence_intervals(optimised_knnr, X_train, y_train, 1000, "Regression", print_iterator=True)
else:
    with open(knnr_train_metrics_file, "r") as report:
        print(report.read())

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.00 with a 95% confidence interval of [-0.00,0.00]
Median R2: 1.00 with a 95% confidence interval of [1.00,1.00]


In [None]:
# Display the parameters of the selected KNN pipeline.
optimised_knnr.get_params()

In [None]:
# Rebuild the search history from the saved ``cv_results_`` dict and order it
# best-first. The .npy file holds a pickled Python object, hence
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code, so
# only load artefacts produced by this project.
knnr_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")

# Show only the ten best-ranked configurations rather than the whole table;
# the full history stays available in the variable.
knnr_grid_search_dataframe.head(10)

### Testing

In [71]:
# Point metrics on the full held-out test set for the KNN pipeline
# (no resampling, unlike get_confidence_intervals).
y_test_pred = optimised_knnr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

R2 Score: -0.1380421442413382
Negated Mean Absolute Error: -1.5444833534434703


In [73]:
# Print the stored bootstrap report when one exists; otherwise compute the
# confidence intervals now.
knnr_test_metrics_file = "Dataset_Files/Baseline_Models/Regression/optimised_knnr_test_metrics.txt"
if not os.path.exists(knnr_test_metrics_file):
    get_confidence_intervals(optimised_knnr, X_test, y_test, 50, "Regression", print_iterator=True)
else:
    with open(knnr_test_metrics_file, "r") as report:
        print(report.read())

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -1.54 with a 95% confidence interval of [-2.09,-1.06]
Median R2: -0.17 with a 95% confidence interval of [-0.52,0.17]


## Decision Tree Regressor (DTR)

In [77]:
# Decision-tree pipeline used as the base estimator of the search below:
# standardise the features, then a seeded DecisionTreeRegressor.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DecisionTreeRegressor(random_state=0)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DecisionTreeRegressor(random_state=0))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DecisionTreeRegressor(random_state=0),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__random_state': 0,
 'model__splitter': 'best'}

In [78]:
# Bayesian hyper-parameter search over the decision-tree pipeline, scored by
# R2 with cv=10. The ``model__`` prefix routes each parameter to the 'model'
# step.
# NOTE(review): this space is purely categorical with 3 * 2 * 3 = 18 distinct
# combinations, while the recorded run completed 50 iterations -- an
# exhaustive grid may be a better fit; confirm.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__criterion=Categorical(['squared_error', 'friedman_mse', 'absolute_error']),
        model__splitter=Categorical(['best', 'random']),
        model__max_features=Categorical([None, 'sqrt', 'log2']),
    ),
    scoring='r2',
    cv=10,
    error_score=np.nan,  # a failing fit is scored NaN rather than raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [79]:
# Run the (expensive) Bayesian search only when its artefacts are missing, so
# "Restart & Run All" reuses the saved model instead of re-searching every
# time. Delete the files to force a new search (e.g. after changing the
# search space).
dtr_model_path = 'Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib'
dtr_cv_results_path = "Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy"

if os.path.exists(dtr_model_path) and os.path.exists(dtr_cv_results_path):
    optimised_dtr = load(dtr_model_path)
else:
    index = 1  # iteration counter printed by on_step
    model.fit(X_train, y_train, callback=on_step)
    optimised_dtr = model.best_estimator_

    # Save Model & CV Results
    os.makedirs(os.path.dirname(dtr_model_path), exist_ok=True)
    dump(optimised_dtr, dtr_model_path)
    np.save(dtr_cv_results_path, model.cv_results_)

# Training-set metrics for the model that is now bound.
y_train_pred = optimised_dtr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12




Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20




Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24




Iteration Completed: 25




Iteration Completed: 26




Iteration Completed: 27




Iteration Completed: 28




Iteration Completed: 29




Iteration Completed: 30




Iteration Completed: 31




Iteration Completed: 32




Iteration Completed: 33




Iteration Completed: 34




Iteration Completed: 35




Iteration Completed: 36




Iteration Completed: 37




Iteration Completed: 38




Iteration Completed: 39




Iteration Completed: 40




Iteration Completed: 41




Iteration Completed: 42




Iteration Completed: 43




Iteration Completed: 44




Iteration Completed: 45




Iteration Completed: 46




Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
R2 Score: 1.0
Negated Mean Absolute Error: -0.0


In [None]:
# Load Model: bind optimised_dtr to the pipeline persisted in
# optimised_dtr.joblib, so the evaluation cells use the saved artefact.
optimised_dtr = load('Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib')

In [83]:
# Bootstrapped metrics on the training set (4th argument presumably the
# resample size, matching the "samples of size 1000" header it prints).
get_confidence_intervals(optimised_dtr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.00 with a 95% confidence interval of [-0.00,0.00]
Median R2: 1.00 with a 95% confidence interval of [1.00,1.00]


In [None]:
# Display the parameters of the selected decision-tree pipeline.
optimised_dtr.get_params()

In [None]:
# Rebuild the search history from the saved ``cv_results_`` dict and order it
# best-first. The .npy file holds a pickled Python object, hence
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code, so
# only load artefacts produced by this project.
dtr_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")

# Show only the ten best-ranked configurations rather than the whole table;
# the full history stays available in the variable.
dtr_grid_search_dataframe.head(10)

### Testing

In [85]:
# Bootstrapped metrics on the held-out test set (4th argument presumably the
# resample size, matching the "samples of size 50" header it prints).
get_confidence_intervals(optimised_dtr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -3.12 with a 95% confidence interval of [-3.89,-2.35]
Median R2: -2.33 with a 95% confidence interval of [-5.80,-0.83]


## Random Forest Regressor (RFR)

In [86]:
# Random-forest pipeline used as the base estimator of the search below:
# standardise the features, then a seeded RandomForestRegressor.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestRegressor(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestRegressor(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': 1.0,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [87]:
# Bayesian hyper-parameter search over the random-forest pipeline, scored by
# R2 with cv=10. The ``model__`` prefix routes each parameter to the 'model'
# step.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__n_estimators=Integer(10, 100),
        model__criterion=Categorical(['squared_error', 'absolute_error']),
        model__max_features=Categorical([None, 'sqrt', 'log2']),
    ),
    scoring='r2',
    cv=10,
    error_score=np.nan,  # a failing fit is scored NaN rather than raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [None]:
# Run the (expensive) Bayesian search only when its artefacts are missing, so
# "Restart & Run All" reuses the saved model instead of re-searching every
# time. Delete the files to force a new search (e.g. after changing the
# search space).
rfr_model_path = 'Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib'
rfr_cv_results_path = "Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy"

if os.path.exists(rfr_model_path) and os.path.exists(rfr_cv_results_path):
    optimised_rfr = load(rfr_model_path)
else:
    index = 1  # iteration counter printed by on_step
    model.fit(X_train, y_train, callback=on_step)
    optimised_rfr = model.best_estimator_

    # Save Model & CV Results
    os.makedirs(os.path.dirname(rfr_model_path), exist_ok=True)
    dump(optimised_rfr, rfr_model_path)
    np.save(rfr_cv_results_path, model.cv_results_)

# Training-set metrics for the model that is now bound.
y_train_pred = optimised_rfr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Load Model: bind optimised_rfr to the pipeline persisted in
# optimised_rfr.joblib, so the evaluation cells use the saved artefact.
optimised_rfr = load('Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib')

In [None]:
# Point metrics on the full training set for the random-forest pipeline.
# NOTE(review): most other models in this notebook report bootstrapped
# intervals via get_confidence_intervals -- consider doing the same here.
y_train_pred = optimised_rfr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Display the parameters of the selected random-forest pipeline.
optimised_rfr.get_params()

In [None]:
# Rebuild the search history from the saved ``cv_results_`` dict and order it
# best-first. The .npy file holds a pickled Python object, hence
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code, so
# only load artefacts produced by this project.
rfr_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")

# Show only the ten best-ranked configurations rather than the whole table;
# the full history stays available in the variable.
rfr_grid_search_dataframe.head(10)

### Testing

In [None]:
# Point metrics on the full held-out test set for the random-forest pipeline.
# NOTE(review): most other models in this notebook report bootstrapped
# intervals via get_confidence_intervals -- consider doing the same here.
y_test_pred = optimised_rfr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Stochastic Gradient Descent Regressor (SGDR)

In [7]:
# SGD pipeline used as the base estimator of the search below:
# standardise the features, then a seeded SGDRegressor.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', SGDRegressor(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', SGDRegressor(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': SGDRegressor(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.01,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'invscaling',
 'model__loss': 'squared_error',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__penalty': 'l2',
 'model__power_t': 0.25,
 'model__random_state': 42,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [8]:
# Bayesian hyper-parameter search over the SGD pipeline, scored by R2 with
# cv=10. The ``model__`` prefix routes each parameter to the 'model' step.
model = BayesSearchCV(
    estimator=pipe,
    search_spaces=dict(
        model__loss=Categorical(
            ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
        ),
        model__penalty=Categorical(['l2', 'l1', 'elasticnet']),
        model__alpha=Real(1e-6, 1e-1, prior='log-uniform'),
        model__learning_rate=Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
    ),
    scoring='r2',
    cv=10,
    error_score=np.nan,  # a failing fit is scored NaN rather than raising
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    random_state=42,
)

### Training

In [9]:
# Run the (expensive) Bayesian search only when its artefacts are missing, so
# "Restart & Run All" reuses the saved model instead of re-searching every
# time. Delete the files to force a new search (e.g. after changing the
# search space).
sgdr_model_path = 'Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib'
sgdr_cv_results_path = "Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy"

if os.path.exists(sgdr_model_path) and os.path.exists(sgdr_cv_results_path):
    optimised_sgdr = load(sgdr_model_path)
else:
    index = 1  # iteration counter printed by on_step
    model.fit(X_train, y_train, callback=on_step)
    optimised_sgdr = model.best_estimator_

    # Save Model & CV Results
    os.makedirs(os.path.dirname(sgdr_model_path), exist_ok=True)
    dump(optimised_sgdr, sgdr_model_path)
    np.save(sgdr_cv_results_path, model.cv_results_)

# Training-set metrics for the model that is now bound.
y_train_pred = optimised_sgdr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

Iteration Completed: 1
Iteration Completed: 2
Iteration Completed: 3
Iteration Completed: 4
Iteration Completed: 5
Iteration Completed: 6
Iteration Completed: 7
Iteration Completed: 8
Iteration Completed: 9
Iteration Completed: 10
Iteration Completed: 11
Iteration Completed: 12
Iteration Completed: 13
Iteration Completed: 14
Iteration Completed: 15
Iteration Completed: 16
Iteration Completed: 17
Iteration Completed: 18
Iteration Completed: 19
Iteration Completed: 20
Iteration Completed: 21
Iteration Completed: 22
Iteration Completed: 23
Iteration Completed: 24
Iteration Completed: 25
Iteration Completed: 26
Iteration Completed: 27
Iteration Completed: 28
Iteration Completed: 29
Iteration Completed: 30
Iteration Completed: 31
Iteration Completed: 32
Iteration Completed: 33
Iteration Completed: 34
Iteration Completed: 35
Iteration Completed: 36
Iteration Completed: 37
Iteration Completed: 38
Iteration Completed: 39
Iteration Completed: 40
Iteration Completed: 41
Iteration Completed: 42




Iteration Completed: 43




Iteration Completed: 44




Iteration Completed: 45




Iteration Completed: 46




Iteration Completed: 47




Iteration Completed: 48




Iteration Completed: 49




Iteration Completed: 50
R2 Score: 0.5435649115986829
Negated Mean Absolute Error: -0.9537931845981886


In [10]:
# Load Model: bind optimised_sgdr to the pipeline persisted in
# optimised_sgdr.joblib, so the evaluation cells use the saved artefact.
optimised_sgdr = load('Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib')

In [12]:
# Bootstrapped metrics on the training set (4th argument presumably the
# resample size, matching the "samples of size 1000" header it prints).
get_confidence_intervals(optimised_sgdr, X_train, y_train, 1000, "Regression")

Metrics after 1000 bootstrapped samples of size 1000
--------------------------------------------------------
Median Negated-MAE: -0.95 with a 95% confidence interval of [-1.06,-0.86]
Median R2: 0.55 with a 95% confidence interval of [0.47,0.61]


In [None]:
# Display the parameters of the selected SGD pipeline.
optimised_sgdr.get_params()

In [None]:
# Rebuild the search history from the saved ``cv_results_`` dict and order it
# best-first. The .npy file holds a pickled Python object, hence
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code, so
# only load artefacts produced by this project.
sgdr_grid_search_dataframe = pd.DataFrame(
    np.load(
        "Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy",
        allow_pickle=True,
    ).tolist()
).sort_values(by="rank_test_score")

# Show only the ten best-ranked configurations rather than the whole table;
# the full history stays available in the variable.
sgdr_grid_search_dataframe.head(10)

### Testing

In [14]:
# Bootstrapped metrics on the held-out test set (4th argument presumably the
# resample size, matching the "samples of size 50" header it prints).
get_confidence_intervals(optimised_sgdr, X_test, y_test, 50, "Regression")

Metrics after 1000 bootstrapped samples of size 50
--------------------------------------------------------
Median Negated-MAE: -2.58 with a 95% confidence interval of [-3.23,-2.03]
Median R2: -1.23 with a 95% confidence interval of [-3.39,-0.29]
