# Imports

In [None]:
# General Imports
from models_utils import *

# Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Training & Test Sets


In [None]:
# Names of the feature columns: every column from "MolecularWeight" onwards
# (label-based slice, so "MolecularWeight" itself is included).
feature_selection_columns = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .loc[:, "MolecularWeight":]
    .columns
)

In [None]:
# Training features: load the frame, strip the identifier columns and
# hand the remaining values over as a NumPy array.
X_train = (
    load_from_pickle("Training_Test_Sets/Regression/X_train_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Training targets: the binary label is kept aside (used only for the
# sanity-check counts), everything else becomes the regression target array.
# NOTE(review): if a single column remains, y_train is 2-D (n, 1) — confirm
# the estimators are meant to receive a column vector.
y_train = load_from_pickle("Training_Test_Sets/Regression/y_train")
y_train_binary = y_train.loc[:, "Activity_Binary"]
y_train = y_train.drop(columns=["Activity_Binary"]).to_numpy()

In [None]:
# Test features: load the frame, strip the identifier columns and
# hand the remaining values over as a NumPy array.
X_test = (
    load_from_pickle("Training_Test_Sets/Regression/X_test_feature_selection")
    .drop(columns=["Protein_Accession", "Drug_CID", "Activity_Name"])
    .to_numpy()
)

# Test targets: the binary label is kept aside (used only for the
# sanity-check counts), everything else becomes the regression target array.
y_test = load_from_pickle("Training_Test_Sets/Regression/y_test")
y_test_binary = y_test.loc[:, "Activity_Binary"]
y_test = y_test.drop(columns=["Activity_Binary"]).to_numpy()

In [None]:
# Useful Information & Sanity Checks
# Prints, for each split, the feature-matrix shape and the number of target
# rows, followed by how many rows carry binary label 1 ("Binding") and
# label 0 ("Non-Binding"). Each summary is assembled on a single output line
# by suppressing the newline (end="") on the first two prints.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape[0]} ", end="")
print(f"(Binding Count: {y_train_binary[y_train_binary == 1].shape[0]}, ", end="")
print(f"Non-Binding Count: {y_train_binary[y_train_binary == 0].shape[0]})")

# Same summary for the test split.
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape[0]} ", end="")
print(f"(Binding Count: {y_test_binary[y_test_binary == 1].shape[0]}, ", end="")
print(f"Non-Binding Count: {y_test_binary[y_test_binary == 0].shape[0]})")

# Model Training & Testing

In [None]:
def on_step(optim_result):
    """Progress callback: report the finished iteration and advance the counter.

    Relies on a module-level ``index`` that the caller sets (to 1) before
    starting a search. ``optim_result`` is accepted because the callback
    protocol passes it, but it is not inspected. Returns ``None``.
    """
    global index
    finished = index
    print(f"Iteration Completed: {finished}")
    index = finished + 1

## Dummy Regressor (DR)

In [None]:
# Baseline pipeline: standardise the features, then a DummyRegressor with
# default settings. The final expression shows the pipeline's parameters.
dummy_regressor = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DummyRegressor()),
])
dummy_regressor.get_params()

### Training

In [None]:
# Fit the baseline pipeline on the full training set.
dummy_regressor.fit(X_train, y_train)

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = dummy_regressor.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model: persist the fitted pipeline so later cells can reload it
# without refitting.
dump(dummy_regressor, 'Dataset_Files/Baseline_Models/Regression/dr.joblib')

In [None]:
# Load Model: restore the fitted Dummy Regressor pipeline from disk.
dummy_regressor = load('Dataset_Files/Baseline_Models/Regression/dr.joblib')

In [None]:
# Training-set predictions of the Dummy Regressor, passed with the true
# targets to the project's regression-metrics helper.
y_train_pred = dummy_regressor.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

### Testing

In [None]:
# Held-out test-set predictions of the Dummy Regressor, passed with the true
# targets to the project's regression-metrics helper.
y_test_pred = dummy_regressor.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Linear Regression (LR)

In [None]:
# Linear baseline: standardise the features, then ordinary least squares
# with n_jobs=-1. The final expression shows the pipeline's parameters.
linear_regression = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LinearRegression(n_jobs=-1)),
])
linear_regression.get_params()

### Training

In [None]:
# Fit the linear-regression pipeline on the full training set.
linear_regression.fit(X_train, y_train)

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = linear_regression.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model: persist the fitted pipeline so later cells can reload it
# without refitting.
dump(linear_regression, 'Dataset_Files/Baseline_Models/Regression/lr.joblib')

In [None]:
# Load Model: restore the fitted Linear Regression pipeline from disk.
linear_regression = load('Dataset_Files/Baseline_Models/Regression/lr.joblib')

In [None]:
# Training-set predictions of the Linear Regression pipeline, passed with the
# true targets to the project's regression-metrics helper.
y_train_pred = linear_regression.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

### Testing

In [None]:
# Held-out test-set predictions of the Linear Regression pipeline, passed with
# the true targets to the project's regression-metrics helper.
y_test_pred = linear_regression.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Support Vector Regression (SVR)

In [None]:
# Base pipeline for the SVR search: standardise the features, then an SVR
# with default settings. The final expression lists the tunable parameters.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', SVR()),
])
pipe.get_params()

In [None]:
# Bayesian hyper-parameter search over the SVR pipeline, scored by R^2 with
# 10-fold cross-validation. Search-space keys use the pipeline convention
# "<step name>__<parameter>" (double underscore).
model = BayesSearchCV(estimator=pipe,
                      search_spaces={
                          # 'precomputed' is deliberately not offered: it expects X to be a
                          # square kernel (Gram) matrix, not the feature matrix used here.
                          'model__kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
                          # Only used by the 'poly' kernel.
                          'model__degree': Integer(2, 8),
                          # Fixed key: was 'model_gamma' (single underscore), which is not a
                          # valid parameter of the pipeline.
                          'model__gamma': Categorical(['scale', 'auto']),
                      },
                      scoring='r2',
                      cv=10,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [None]:
# Reset the global iteration counter that on_step prints and increments.
index = 1
# Run the search; on_step is passed as the callback for progress reporting.
model.fit(X_train, y_train, callback=on_step)

# Keep the best pipeline found by the search.
optimised_svr = model.best_estimator_

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = optimised_svr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model & CV Results
# cv_results_ is written with np.save; it is read back later with
# allow_pickle=True.
dump(optimised_svr, 'Dataset_Files/Baseline_Models/Regression/optimised_svr.joblib')
np.save("Dataset_Files/Baseline_Models/Regression/optimised_svr_cv_results.npy", model.cv_results_)

In [None]:
# Load Model: restore the persisted optimised SVR pipeline from disk.
optimised_svr = load('Dataset_Files/Baseline_Models/Regression/optimised_svr.joblib')

In [None]:
# Training-set predictions of the optimised SVR pipeline, passed with the
# true targets to the project's regression-metrics helper.
y_train_pred = optimised_svr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Show the parameters of the optimised SVR pipeline (cell output).
optimised_svr.get_params()

In [None]:
# Reload the saved SVR search results and tabulate them.
# allow_pickle=True is needed to read the stored Python object back, and
# .tolist() unwraps it from the NumPy array before building the DataFrame.
svr_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Baseline_Models/Regression/optimised_svr_cv_results.npy", allow_pickle=True).tolist())
# Order candidates by rank_test_score (ascending), then display the table.
svr_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
svr_grid_search_dataframe

### Testing

In [None]:
# Held-out test-set predictions of the optimised SVR pipeline, passed with
# the true targets to the project's regression-metrics helper.
y_test_pred = optimised_svr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## K-Nearest Neighbors Regressor (KNNR)

In [None]:
# Base pipeline for the KNN search: standardise the features, then a
# KNeighborsRegressor with default settings. The final expression lists the
# tunable parameters.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', KNeighborsRegressor()),
])
pipe.get_params()

In [None]:
# Bayesian hyper-parameter search over the KNN pipeline: neighbour count,
# vote weighting and neighbour-lookup algorithm, scored by R^2 with 10-fold
# cross-validation.
model = BayesSearchCV(
    pipe,
    {
        'model__n_neighbors': Integer(4, 20),
        'model__weights': Categorical(['uniform', 'distance']),
        'model__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
    },
    cv=10,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    error_score=np.nan,
)

### Training

In [None]:
# Reset the global iteration counter that on_step prints and increments.
index = 1
# Run the search; on_step is passed as the callback for progress reporting.
model.fit(X_train, y_train, callback=on_step)

# Keep the best pipeline found by the search.
optimised_knnr = model.best_estimator_

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = optimised_knnr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model & CV Results
# Fixed: the KNN pipeline is persisted here. The previous code dumped
# optimised_svr into the KNN file, so reloading it returned the SVR model.
dump(optimised_knnr, 'Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib')
np.save("Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy", model.cv_results_)

In [None]:
# Load Model: restore the pipeline stored in the optimised KNN file.
optimised_knnr = load('Dataset_Files/Baseline_Models/Regression/optimised_knnr.joblib')

In [None]:
# Training-set predictions of the optimised KNN pipeline, passed with the
# true targets to the project's regression-metrics helper.
y_train_pred = optimised_knnr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Show the parameters of the optimised KNN pipeline (cell output).
optimised_knnr.get_params()

In [None]:
# Reload the saved KNN search results and tabulate them.
# allow_pickle=True is needed to read the stored Python object back, and
# .tolist() unwraps it from the NumPy array before building the DataFrame.
knnr_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Baseline_Models/Regression/optimised_knnr_cv_results.npy", allow_pickle=True).tolist())
# Order candidates by rank_test_score (ascending), then display the table.
knnr_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
knnr_grid_search_dataframe

### Testing

In [None]:
# Held-out test-set predictions of the optimised KNN pipeline, passed with
# the true targets to the project's regression-metrics helper.
y_test_pred = optimised_knnr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Decision Tree Regressor (DTR)

In [None]:
# Base pipeline for the decision-tree search: standardise the features, then
# a DecisionTreeRegressor seeded with random_state=0. The final expression
# lists the tunable parameters.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DecisionTreeRegressor(random_state=0)),
])
pipe.get_params()

In [None]:
# Bayesian hyper-parameter search over the decision-tree pipeline: split
# criterion, splitter strategy and max_features, scored by R^2 with 10-fold
# cross-validation. Failing candidates are scored as NaN (error_score).
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3 — confirm the installed version still accepts it.
# NOTE(review): the 'poisson' criterion presumably requires non-negative
# targets — verify against the y_train value range.
model = BayesSearchCV(estimator=pipe,
                      search_spaces={
                          'model__criterion': Categorical(
                              ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']),
                          'model__splitter': Categorical(['best', 'random']),
                          'model__max_features': Categorical(['auto', 'sqrt', 'log2']),
                      },
                      scoring='r2',
                      cv=10,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [None]:
# Reset the global iteration counter that on_step prints and increments.
index = 1
# Run the search; on_step is passed as the callback for progress reporting.
model.fit(X_train, y_train, callback=on_step)

# Keep the best pipeline found by the search.
optimised_dtr = model.best_estimator_

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = optimised_dtr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model & CV Results
# Fixed: the decision-tree pipeline is persisted here. The previous code
# dumped optimised_svr into the DTR file, so reloading it returned the SVR model.
dump(optimised_dtr, 'Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib')
np.save("Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy", model.cv_results_)

In [None]:
# Load Model: restore the pipeline stored in the optimised decision-tree file.
optimised_dtr = load('Dataset_Files/Baseline_Models/Regression/optimised_dtr.joblib')

In [None]:
# Training-set predictions of the optimised decision-tree pipeline, passed
# with the true targets to the project's regression-metrics helper.
y_train_pred = optimised_dtr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Show the parameters of the optimised decision-tree pipeline (cell output).
optimised_dtr.get_params()

In [None]:
# Reload the saved decision-tree search results and tabulate them.
# allow_pickle=True is needed to read the stored Python object back, and
# .tolist() unwraps it from the NumPy array before building the DataFrame.
dtr_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Baseline_Models/Regression/optimised_dtr_cv_results.npy", allow_pickle=True).tolist())
# Order candidates by rank_test_score (ascending), then display the table.
dtr_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
dtr_grid_search_dataframe

### Testing

In [None]:
# Held-out test-set predictions of the optimised decision-tree pipeline,
# passed with the true targets to the project's regression-metrics helper.
y_test_pred = optimised_dtr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Random Forest Regressor (RFR)

In [None]:
# Base pipeline for the random-forest search: standardise the features, then
# a RandomForestRegressor seeded with random_state=42. The final expression
# lists the tunable parameters.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42)),
])
pipe.get_params()

In [None]:
# Bayesian hyper-parameter search over the random-forest pipeline: number of
# trees, split criterion and max_features, scored by R^2 with 10-fold
# cross-validation. Failing candidates are scored as NaN (error_score).
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3 — confirm the installed version still accepts it.
# NOTE(review): the 'poisson' criterion presumably requires non-negative
# targets — verify against the y_train value range.
model = BayesSearchCV(estimator=pipe,
                      search_spaces={
                          'model__n_estimators': Integer(10, 600),
                          'model__criterion': Categorical(['squared_error', 'absolute_error', 'poisson']),
                          'model__max_features': Categorical(['auto', 'sqrt', 'log2']),
                      },
                      scoring='r2',
                      cv=10,
                      error_score=np.nan,
                      n_jobs=-1,
                      pre_dispatch='2*n_jobs',
                      random_state=42)

### Training

In [None]:
# Reset the global iteration counter that on_step prints and increments.
index = 1
# Run the search; on_step is passed as the callback for progress reporting.
model.fit(X_train, y_train, callback=on_step)

# Keep the best pipeline found by the search.
optimised_rfr = model.best_estimator_

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = optimised_rfr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model & CV Results
# Fixed: the random-forest pipeline is persisted here. The previous code
# dumped optimised_svr into the RFR file, so reloading it returned the SVR model.
dump(optimised_rfr, 'Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib')
np.save("Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy", model.cv_results_)

In [None]:
# Load Model: restore the pipeline stored in the optimised random-forest file.
optimised_rfr = load('Dataset_Files/Baseline_Models/Regression/optimised_rfr.joblib')

In [None]:
# Training-set predictions of the optimised random-forest pipeline, passed
# with the true targets to the project's regression-metrics helper.
y_train_pred = optimised_rfr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Show the parameters of the optimised random-forest pipeline (cell output).
optimised_rfr.get_params()

In [None]:
# Reload the saved random-forest search results and tabulate them.
# allow_pickle=True is needed to read the stored Python object back, and
# .tolist() unwraps it from the NumPy array before building the DataFrame.
rfr_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Baseline_Models/Regression/optimised_rfr_cv_results.npy", allow_pickle=True).tolist())
# Order candidates by rank_test_score (ascending), then display the table.
rfr_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
rfr_grid_search_dataframe

### Testing

In [None]:
# Held-out test-set predictions of the optimised random-forest pipeline,
# passed with the true targets to the project's regression-metrics helper.
y_test_pred = optimised_rfr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)

## Stochastic Gradient Descent Regressor (SGDR)

In [None]:
# Base pipeline for the SGD search: standardise the features, then an
# SGDRegressor seeded with random_state=42. The final expression lists the
# tunable parameters.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', SGDRegressor(random_state=42)),
])
pipe.get_params()

In [None]:
# Bayesian hyper-parameter search over the SGD pipeline: loss, penalty,
# regularisation strength (sampled log-uniformly) and learning-rate schedule,
# scored by R^2 with 10-fold cross-validation.
model = BayesSearchCV(
    pipe,
    {
        'model__loss': Categorical(['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']),
        'model__penalty': Categorical(['l2', 'l1', 'elasticnet']),
        'model__alpha': Real(1e-6, 1e-1, prior='log-uniform'),
        'model__learning_rate': Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
    },
    cv=10,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    error_score=np.nan,
)

### Training

In [None]:
# Reset the global iteration counter that on_step prints and increments.
index = 1
# Run the search; on_step is passed as the callback for progress reporting.
model.fit(X_train, y_train, callback=on_step)

# Keep the best pipeline found by the search.
optimised_sgdr = model.best_estimator_

# Evaluate on the same data it was fitted on (in-sample predictions).
y_train_pred = optimised_sgdr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

# Save Model & CV Results
# Fixed: the SGD pipeline is persisted here. The previous code dumped
# optimised_svr into the SGDR file, so reloading it returned the SVR model.
dump(optimised_sgdr, 'Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib')
np.save("Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy", model.cv_results_)

In [None]:
# Load Model: restore the pipeline stored in the optimised SGD file.
optimised_sgdr = load('Dataset_Files/Baseline_Models/Regression/optimised_sgdr.joblib')

In [None]:
# Training-set predictions of the optimised SGD pipeline, passed with the
# true targets to the project's regression-metrics helper.
y_train_pred = optimised_sgdr.predict(X_train)
calculate_metrics_regression(y_train, y_train_pred)

In [None]:
# Show the parameters of the optimised SGD pipeline (cell output).
optimised_sgdr.get_params()

In [None]:
# Reload the saved SGD search results and tabulate them.
# allow_pickle=True is needed to read the stored Python object back, and
# .tolist() unwraps it from the NumPy array before building the DataFrame.
sgdr_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Baseline_Models/Regression/optimised_sgdr_cv_results.npy", allow_pickle=True).tolist())
# Order candidates by rank_test_score (ascending), then display the table.
sgdr_grid_search_dataframe.sort_values(by=["rank_test_score"], inplace=True)
sgdr_grid_search_dataframe

### Testing

In [None]:
# Held-out test-set predictions of the optimised SGD pipeline, passed with
# the true targets to the project's regression-metrics helper.
y_test_pred = optimised_sgdr.predict(X_test)
calculate_metrics_regression(y_test, y_test_pred)