In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

Hyperparameter tuning: a grid search is run once per model on the held-out `X_cv` / `y_cv` subset.

Evaluation: `X_rest` and `y_rest` are randomly split into training and test sets; each model is retrained on the training part and scored on the test part.

In [None]:
# Property to model; it selects both the input pickle and the output directory.
prop = 'bulk'

# NOTE(security): pickle.load can execute arbitrary code -- only load dataset
# files that come from a trusted source.
with open('../data/ds_hea_{}.pkl'.format(prop), 'rb') as f:
    data = pickle.load(f)

# The evaluation cells write their CSV files into this directory. Create it up
# front so the first to_csv call does not fail when no results exist yet for
# the selected `prop`.
result_path = 'conventional_results/{}/'.format(prop)
os.makedirs(result_path, exist_ok=True)

# Each record `dp` is read as dp[0] (a sequence of arrays, concatenated into a
# single feature vector) and dp[1] (the regression target).
predictors = np.stack([np.concatenate(dp[0]) for dp in data])
response = np.array([dp[1] for dp in data])

# 60% of the samples (`*_rest`) are kept for train/test evaluation; the other
# 40% (`*_cv`) are used for the hyperparameter grid searches.
X_rest, X_cv, y_rest, y_cv = train_test_split(predictors, response, test_size=0.4, random_state=42)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Search space for the random forest; the grid search evaluates every
# combination of the values below.
parameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 30, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Seed the forest: without a fixed random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the selected best_params_ could
# change between otherwise identical runs.
rf = RandomForestRegressor(random_state=42)

# Exhaustive search on the tuning subset; scoring and CV splitting are left at
# the GridSearchCV defaults, n_jobs=-1 uses all available cores.
rf_cv = GridSearchCV(rf, parameters, n_jobs=-1, verbose=2)
rf_cv.fit(X_cv, y_cv)

# Last expression of the cell: displayed as the cell output.
rf_cv.best_params_

In [None]:
# Per-split metrics, plus the test targets and predictions pooled over all splits.
spearman_rs = []
mae_scores = []
r2_scores = []
target_results = []
predicted_results = []

# 30 random splits of the evaluation subset (2/3 train, 1/3 test), seeded with
# the loop index.
for i in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=1/3, random_state=i)

    # Hyperparameters are hard-coded here rather than read from rf_cv.
    # NOTE(review): presumably copied from an earlier grid-search run -- confirm
    # they still match rf_cv.best_params_.
    # random_state ties the forest's own randomness to the split seed, so
    # re-running the cell reproduces the same metrics and CSV files.
    rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=30,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        random_state=i,
    ).fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    rs = spearmanr(y_test, y_pred).correlation
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    spearman_rs.append(rs)
    mae_scores.append(mae)
    r2_scores.append(r2)
    target_results.extend(y_test)
    predicted_results.extend(y_pred)

print("Average Spearman correlation:", np.mean(spearman_rs))
print("Average Mean absolute error:", np.mean(mae_scores))
print("Average R2 score:", np.mean(r2_scores))

# One row per split; written with the default index, which equals the split seed.
results_df = pd.DataFrame({
    'Spearman': spearman_rs,
    'MAE': mae_scores,
    'R2': r2_scores
})

# One row per test sample, concatenated over all 30 splits.
results_tp = pd.DataFrame({'Target': target_results, 'Predicted': predicted_results})

results_tp.to_csv(result_path + 'Random_forest_target_predicted.csv', index=False)

results_df.to_csv(result_path + 'Random_forest.csv')

Gradient Boosting Tree

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Search space for gradient boosting; the grid search evaluates every
# combination of the values below.
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Seed the estimator: with max_features below the feature count the candidate
# features are drawn at random, so an unseeded search could select different
# best_params_ on each run.
gb = GradientBoostingRegressor(random_state=42)

# Exhaustive search on the tuning subset; scoring and CV splitting are left at
# the GridSearchCV defaults, n_jobs=-1 uses all available cores.
gb_cv = GridSearchCV(gb, parameters, n_jobs=-1, verbose=2)
gb_cv.fit(X_cv, y_cv)
best_params = gb_cv.best_params_
print(best_params)

In [None]:
# Per-split metrics, plus the test targets and predictions pooled over all splits.
spearman_rs = []
mae_scores = []
r2_scores = []
target_results = []
predicted_results = []

# 30 random splits of the evaluation subset (2/3 train, 1/3 test), seeded with
# the loop index.
for i in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=1/3, random_state=i)

    # Hyperparameters are hard-coded here rather than read from gb_cv.
    # NOTE(review): presumably copied from an earlier grid-search run -- confirm
    # they still match gb_cv.best_params_.
    # random_state fixes the random feature selection implied by
    # max_features='sqrt', so re-running the cell reproduces the same metrics.
    gb = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=4,
        max_features='sqrt',
        random_state=i,
    ).fit(X_train, y_train)

    y_pred = gb.predict(X_test)
    rs = spearmanr(y_test, y_pred).correlation
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    spearman_rs.append(rs)
    mae_scores.append(mae)
    r2_scores.append(r2)
    target_results.extend(y_test)
    predicted_results.extend(y_pred)

print("Average Spearman correlation:", np.mean(spearman_rs))
print("Average Mean absolute error:", np.mean(mae_scores))
print("Average R2 score:", np.mean(r2_scores))

# One row per test sample, concatenated over all 30 splits.
results_tp = pd.DataFrame({'Target': target_results, 'Predicted': predicted_results})

# One row per split; written with the default index, which equals the split seed.
results_df = pd.DataFrame({
    'Spearman': spearman_rs,
    'MAE': mae_scores,
    'R2': r2_scores
})

results_tp.to_csv(result_path + 'Gradient_boost_target_predicted.csv', index=False)
results_df.to_csv(result_path + 'Gradient_boosting.csv')

Other models: linear/lasso/ridge regression, SVM, KNN, Gaussian process, ...

Lasso Regression 

In [None]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths and iteration caps for the lasso; the
# grid search below evaluates every combination.
parameters = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    'max_iter': [1000, 5000, 10000],
}

lasso = Lasso()

# Exhaustive search on the tuning subset with all cores; fit() returns the
# fitted search object, so construction and fitting are chained.
lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
).fit(X_cv, y_cv)

best_params = lasso_cv.best_params_
print(best_params)

In [None]:
# Metric accumulators: one entry is appended per train/test split.
spearman_rs, mae_scores, r2_scores = [], [], []

for i in range(30):
    # Seeded with the loop index so each of the 30 splits is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=1/3, random_state=i
    )

    # NOTE(review): hyperparameters are hard-coded; presumably taken from the
    # lasso grid search -- confirm.
    lasso = Lasso(alpha=0.0001, max_iter=1000)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)

    # Rank correlation between true and predicted targets.
    rs = spearmanr(y_test, y_pred).correlation
    spearman_rs.append(rs)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("Average Spearman correlation:", np.mean(spearman_rs))
print("Average Mean absolute error:", np.mean(mae_scores))
print("Average R2 score:", np.mean(r2_scores))

# One row per split; saved with the default index column.
lasso_results_df = pd.DataFrame(
    {'Spearman': spearman_rs, 'MAE': mae_scores, 'R2': r2_scores}
)
lasso_results_df.to_csv(result_path + 'Lasso.csv')


Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths and iteration caps for ridge regression.
# NOTE(review): depending on the solver chosen by Ridge's default settings,
# max_iter may have no effect on the fit -- confirm it belongs in the grid.
parameters = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'max_iter': [1000, 5000, 10000, 20000, 30000],
}

ridge = Ridge()

# Exhaustive search on the tuning subset with all cores; fit() returns the
# fitted search object, so construction and fitting are chained.
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
).fit(X_cv, y_cv)

best_params = ridge_cv.best_params_
print(best_params)

In [None]:
# Metric accumulators: one entry is appended per train/test split.
spearman_rs, mae_scores, r2_scores = [], [], []

for i in range(30):
    # Seeded with the loop index so each of the 30 splits is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=1/3, random_state=i
    )

    # NOTE(review): hyperparameters are hard-coded; presumably taken from the
    # ridge grid search -- confirm.
    ridge = Ridge(alpha=0.01, max_iter=1000)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)

    # Rank correlation between true and predicted targets.
    rs = spearmanr(y_test, y_pred).correlation
    spearman_rs.append(rs)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("Average Spearman correlation (Ridge):", np.mean(spearman_rs))
print("Average Mean absolute error (Ridge):", np.mean(mae_scores))
print("Average R2 score (Ridge):", np.mean(r2_scores))

# One row per split; saved with the default index column.
ridge_results_df = pd.DataFrame(
    {'Spearman': spearman_rs, 'MAE': mae_scores, 'R2': r2_scores}
)
ridge_results_df.to_csv(result_path + 'Ridge.csv')


Support Vector Regression (SVR)

In [None]:
from sklearn.svm import SVR

# Search space for support vector regression; every combination is evaluated.
# NOTE(review): per the scikit-learn docs gamma is a coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' candidates are likely
# fitted twice with identical results -- confirm.
parameters = {
    'gamma': ['scale', 'auto'],
    'epsilon': [0.01, 0.1, 0.2, 0.3],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'max_iter': [5000, 10000],
}

svr = SVR()

# Exhaustive search on the tuning subset with all cores; fit() returns the
# fitted search object, so construction and fitting are chained.
svr_cv = GridSearchCV(
    estimator=svr,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
).fit(X_cv, y_cv)

best_params = svr_cv.best_params_

print(best_params)

In [None]:
# Metric accumulators: one entry is appended per train/test split.
spearman_rs, mae_scores, r2_scores = [], [], []

for i in range(30):
    # Seeded with the loop index so each of the 30 splits is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=1/3, random_state=i
    )

    # NOTE(review): hyperparameters are hard-coded; presumably taken from the
    # SVR grid search -- confirm. max_iter caps the solver's iterations.
    svr = SVR(gamma='scale', epsilon=0.3, kernel='linear', max_iter=5000)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)

    # Rank correlation between true and predicted targets.
    rs = spearmanr(y_test, y_pred).correlation
    spearman_rs.append(rs)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("Average Spearman correlation (SVR):", np.mean(spearman_rs))
print("Average Mean absolute error (SVR):", np.mean(mae_scores))
print("Average R2 score (SVR):", np.mean(r2_scores))

# One row per split; saved with the default index column.
svr_results_df = pd.DataFrame(
    {'Spearman': spearman_rs, 'MAE': mae_scores, 'R2': r2_scores}
)
svr_results_df.to_csv(result_path + 'SVM.csv')

k-nearest neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Search space for k-nearest-neighbours regression; every combination is evaluated.
# NOTE(review): `algorithm` and `leaf_size` choose the neighbour-search data
# structure; they are expected to affect speed rather than predictions --
# confirm before keeping them in the grid.
parameters = {
    'n_neighbors': [3, 5, 7, 10, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50],
}

knn = KNeighborsRegressor()

# Exhaustive search on the tuning subset with all cores; fit() returns the
# fitted search object, so construction and fitting are chained.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
).fit(X_cv, y_cv)

best_params = knn_cv.best_params_

print(best_params)

In [None]:
# Metric accumulators: one entry is appended per train/test split.
spearman_rs, mae_scores, r2_scores = [], [], []

for i in range(30):
    # Seeded with the loop index so each of the 30 splits is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=1/3, random_state=i
    )

    # NOTE(review): hyperparameters are hard-coded; presumably taken from the
    # KNN grid search -- confirm.
    knn = KNeighborsRegressor(
        n_neighbors=5, weights='distance', algorithm='auto', leaf_size=20
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Rank correlation between true and predicted targets.
    rs = spearmanr(y_test, y_pred).correlation
    spearman_rs.append(rs)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("Average Spearman correlation (KNN):", np.mean(spearman_rs))
print("Average Mean absolute error (KNN):", np.mean(mae_scores))
print("Average R2 score (KNN):", np.mean(r2_scores))

# One row per split; saved with the default index column.
knn_results_df = pd.DataFrame(
    {'Spearman': spearman_rs, 'MAE': mae_scores, 'R2': r2_scores}
)
knn_results_df.to_csv(result_path + 'KNN.csv')

Gaussian Process

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct

# The grid holds a single candidate (RBF kernel, alpha=1e-10), so the search
# only cross-validates this one configuration.
parameters = {
    'kernel': [RBF()],
    'alpha': [1e-10],
}

gpr = GaussianProcessRegressor()

# fit() returns the fitted search object, so construction and fitting are chained.
gpr_cv = GridSearchCV(
    estimator=gpr,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
).fit(X_cv, y_cv)

best_params = gpr_cv.best_params_

print(best_params)

In [None]:
# Metric lists, reset for this model in the same way as in the other sections.
spearman_rs = []
mae_scores = []
r2_scores = []

# Unlike the other models, the Gaussian process is scored on a single fixed
# split (seed 42) and no CSV file is written.
X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=1/3, random_state=42)

gpr = GaussianProcessRegressor(
    kernel=RBF(),
    alpha=1e-10,
).fit(X_train, y_train)

y_pred = gpr.predict(X_test)
rs = spearmanr(y_test, y_pred).correlation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Record the metrics so the lists reset above hold this model's result instead
# of being left empty.
spearman_rs.append(rs)
mae_scores.append(mae)
r2_scores.append(r2)

# Printed in the order: MAE, Spearman correlation, R2.
print(mae, rs, r2)