In [8]:
#Import standard packages for model training

import sklearn as SK
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import *
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
import scipy.stats
from math import sqrt
import os 
import io
from tensorboard.plugins.hparams import api as hp
from utils import utils

commons = utils.Commons()
ts_helper = utils.TS_Helper()
shap_helper = utils.Shap_Helper()

ts_helper.model_type = ts_helper.Regression
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "-1"

In [9]:
#Inport train, validation and test sets

training = './data/Classification/Tb.brucei/scaffold_split/train_fold_4.csv'
validation = './data/Classification/Tb.brucei/scaffold_split/valid_fold_4.csv'
test = './data/Classification/Tb.brucei/scaffold_split/test_fold_4.csv'
TASK_START = 2
NUM_TASKS = 1
SMILES = 'SMILES'
train_dataset,y_train,train_smiles = commons.load_dataset(training,SMILES,TASK_START,NUM_TASKS)
valid_dataset,y_val,val_smiles = commons.load_dataset(validation,SMILES,TASK_START,NUM_TASKS)
test_dataset,y_test,test_smiles = commons.load_dataset(test,SMILES,TASK_START,NUM_TASKS)

train_dataset.head()

Loaded dataset ./data/Classification/Tb.brucei/scaffold_split/train_fold_4.csv with shape: (1051, 3)
Loaded dataset ./data/Classification/Tb.brucei/scaffold_split/valid_fold_4.csv with shape: (132, 3)
Loaded dataset ./data/Classification/Tb.brucei/scaffold_split/test_fold_4.csv with shape: (131, 3)


Unnamed: 0,ID,SMILES,Tb.brucei
0,1049,CCN1N=C(C=C1C)C(=O)NC1=NN=C(CCSC2=CC=CC=C2)O1,0
1,798,CC1=CC=CC(NC2=NC(NC3=CC=C4OCOC4=C3)=NC(=N2)N2C...,1
2,1155,COC1=CC=C(CCNC(=O)NCC2CN(C(=O)C2)C2=CC=C(Cl)C=...,0
3,204,CC1OC(C(O)C(O)C1O)N1C=C(CNC2=CC3=C4C(=CC=C5C(=...,1
4,206,CN(C)CCCN1C(=O)C2=CC(NCC3=CN(N=N3)C3OC(CO)C(O)...,1


In [10]:
# calculate ECFP (defaut) fingerprints using RDKit
RADIUS = 2 #diameter 4
FP_SIZE = 2048  #bit string size
FEAT = False #used when you consider pharmacophoric features
X_train = commons.assing_fp(train_smiles,FP_SIZE,RADIUS,FEAT)
X_test = commons.assing_fp(test_smiles,FP_SIZE,RADIUS,FEAT)

y_train = y_train.ravel()
y_train = np.array(y_train).astype(int)
y_test = y_test.ravel()
y_test = np.array(y_test).astype(int)

In [11]:
# parameters for train model

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scorer = make_scorer(mean_squared_error)

# log-uniform: understand as search over p = exp(x) by varying x

model = BayesSearchCV(RandomForestRegressor(),
    {
    'max_features': ['auto', 'sqrt'],
    'n_estimators': [2, 150],
    "max_depth": [2, 10],
    },
    n_iter=5, # Number of parameter settings that are sampled
    cv=cv,
    scoring = scorer,
    refit = True, # Refit the best estimator with the entire dataset.
    random_state=42,
    n_jobs = -1
)

model.fit(X_train, y_train)

print("Best parameters: %s" % model.best_params_)

Best parameters: OrderedDict([('max_depth', 5), ('max_features', 'sqrt'), ('n_estimators', 140)])


In [12]:
#Fit model using best hyperparameters

model = RandomForestRegressor(**model.best_params_)
model.fit(X_train, y_train)

In [13]:
#Statistical characteristico of model without 3-sigma rule

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


r2, p = scipy.stats.pearsonr(y_train,y_pred_train)
print(("Train set results"))
print ("r2\t%.2f"  % r2)
print ("RMSE\t%.2f"  % sqrt(mean_squared_error(y_pred_train, y_train)))
print ("MSE\t%.2f"  % (mean_squared_error(y_pred_train, y_train)))
print ("MAE\t%.2f"  % mean_absolute_error(y_pred_train, y_train))

r2, p_ext = scipy.stats.pearsonr(y_test, y_pred_test)
print(("Test set results"))
print ("r2\t%.2f"  % r2)
print ("RMSE\t%.2f"  % sqrt(mean_squared_error(y_pred_test, y_test)))
print ("MSE\t%.2f"  % (mean_squared_error(y_pred_test, y_test)))
print ("MAE\t%.2f"  % mean_absolute_error(y_pred_test, y_test))

Train set results
r2	0.59
RMSE	0.44
MSE	0.20
MAE	0.44
Test set results
r2	0.55
RMSE	0.47
MSE	0.22
MAE	0.46


In [14]:
#Statistical characteristics of model using 3-sigma rule

train_pred = pd.DataFrame(y_train,y_pred_train) 
train_pred['y_pred'] = train_pred.index
train_pred = train_pred.rename(columns = {0: 'y_obs'})
train_pred2 = train_pred.reset_index(drop=True)
train_pred2['Folds'] = 'Train'
train_pred2 = train_pred2.assign(Folds_error = abs(train_pred2['y_pred'] - train_pred2['y_obs']))
train_pred2['Folds error Mean'] = train_pred2['Folds_error'].mean() 
train_pred2['Folds error 3*sigma'] = train_pred2['Folds_error'].std()
train_pred2['Folds error 3*sigma'] = train_pred2['Folds error 3*sigma']*3
train_pred2=train_pred2[train_pred2['Folds_error']<=(train_pred2['Folds error 3*sigma'])] #keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.

r2 = (train_pred2["y_obs"].corr(train_pred2["y_pred"]))    
print(("Train set results"))
print("r^2\t%.2f" % r2)
print ("RMSE\t%.2f" % sqrt(mean_squared_error(train_pred2["y_obs"],train_pred2["y_pred"])))
print ("MSE\t%.2f" % (mean_squared_error(train_pred2["y_obs"],train_pred2["y_pred"])))
print ("MAE\t%.2f"  %mean_absolute_error(train_pred2["y_obs"],train_pred2["y_pred"]))   

test_pred = pd.DataFrame(y_test,y_pred_test) 
test_pred['y_pred'] = test_pred.index
test_pred = test_pred.rename(columns = {0: 'y_obs'})
test_pred2 = test_pred.reset_index(drop=True)
test_pred2['Folds'] = 'Train'
test_pred2 = test_pred2.assign(Folds_error = abs(test_pred2['y_pred'] - test_pred2['y_obs']))
test_pred2['Folds error Mean'] = test_pred2['Folds_error'].mean() 
test_pred2['Folds error 3*sigma'] = test_pred2['Folds_error'].std()
test_pred2['Folds error 3*sigma'] = test_pred2['Folds error 3*sigma']*3
test_pred2 = test_pred2[test_pred2['Folds_error']<=(test_pred2['Folds error 3*sigma'])] #keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.

r2 = (test_pred2["y_obs"].corr(test_pred2["y_pred"]))    
print(("Train set results"))
print("r^2\t%.2f" % r2)
print ("RMSE\t%.2f" % sqrt(mean_squared_error(test_pred2["y_obs"],test_pred2["y_pred"])))
print ("MSE\t%.2f" % (mean_squared_error(test_pred2["y_obs"],test_pred2["y_pred"])))
print ("MAE\t%.2f"  %mean_absolute_error(test_pred2["y_obs"],test_pred2["y_pred"])) 

Train set results
r^2	0.99
RMSE	0.24
MSE	0.06
MAE	0.24
Train set results
r^2	nan


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.