In [1]:
#Import standard packages for model training

import sklearn as SK
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import *
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
import scipy.stats
from math import sqrt
import os 
import io
from tensorboard.plugins.hparams import api as hp

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "-1"

In [214]:
#Inport train, validation and test sets

training = './data/Regression/Tb.brucei/scaffold_split/train_fold_4.csv'
validation = './data/Regression/Tb.brucei/scaffold_split/valid_fold_4.csv'
test = './data/Regression/Tb.brucei/scaffold_split/test_fold_4.csv'

train_dataset = pd.read_csv(training, delimiter=',', low_memory=False)
valid_dataset = pd.read_csv(validation, delimiter=',', low_memory=False)
test_dataset = pd.read_csv(test, delimiter=',', low_memory=False)

train_dataset.head()

Unnamed: 0,ID,SMILES,Tb.brucei
0,4317,CN(C1C=CC(C=CC=NNC(=S)N)=CC=1)C,4.751781
1,3961,CCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.69897
2,4169,OCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,5.387216
3,3988,CCCCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.39794
4,4002,CCCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.30103


In [215]:
task_start=2
task_index = 3

# load training dataset

train_dataset = pd.read_csv(training, delimiter=',', low_memory=False)
y_train = np.array(train_dataset.iloc[:,2:task_index].values)
print(f"loaded y_train data: {y_train.shape}")

# load test dataset

test_dataset2 = pd.concat([valid_dataset, test_dataset], axis=0).reset_index(drop=True)
y_test = np.array(test_dataset2.iloc[:,2:task_index].values)
print(f"loaded y_test data: {y_test.shape}")

loaded y_train data: (500, 1)
loaded y_test data: (125, 1)


In [216]:
# calculate ECFP (defaut) fingerprints using RDKit

from utils.fingerprints import *

train_smiles=train_dataset["SMILES"].values
test_smiles=test_dataset2["SMILES"].values
X_train = assing_fp(train_smiles,FP_SIZE,RADIUS)
X_test = assing_fp(test_smiles,FP_SIZE,RADIUS)

y_train = y_train.ravel()
y_train = np.array(y_train).astype(int)
y_test = y_test.ravel()
y_test = np.array(y_test).astype(int)

In [217]:
# parameters for train model

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scorer = make_scorer(mean_squared_error)

# log-uniform: understand as search over p = exp(x) by varying x

model = BayesSearchCV(lgb.LGBMRegressor(),                   
    {
    'learning_rate': (0.01, 0.1, 'uniform'), 
    'num_leaves': (1, 15),
    'n_estimators': (2, 50), 
    'max_depth': (1, 10),
    'subsample': (0.1, 0.3), 
    }, 
    n_iter=5, # Number of parameter settings that are sampled
    cv=cv,
    scoring = scorer,
    refit = True, # Refit the best estimator with the entire dataset.
    random_state=42,
    n_jobs = -1
)

model.fit(X_train, y_train)

print("Best parameters: %s" % model.best_params_)



Best parameters: OrderedDict([('learning_rate', 0.050034926107103674), ('max_depth', 9), ('n_estimators', 7), ('num_leaves', 7), ('subsample', 0.13759103727346972)])


In [218]:
#Fit model using best hyperparameters

model = lgb.LGBMRegressor(**model.best_params_)
model.fit(X_train, y_train)

In [219]:
#Statistical characteristico of model without 3-sigma rule

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

r2, p = scipy.stats.pearsonr(y_train,y_pred_train)
print(("Train set results"))
print ("r2\t%.2f"  % r2)
print ("RMSE\t%.2f"  % sqrt(mean_squared_error(y_pred_train, y_train)))
print ("MSE\t%.2f"  % (mean_squared_error(y_pred_train, y_train)))
print ("MAE\t%.2f"  % mean_absolute_error(y_pred_train, y_train))

r2, p_ext = scipy.stats.pearsonr(y_test, y_pred_test)
print(("Test set results"))
print ("r2\t%.2f"  % r2)
print ("RMSE\t%.2f"  % sqrt(mean_squared_error(y_pred_test, y_test)))
print ("MSE\t%.2f"  % (mean_squared_error(y_pred_test, y_test)))
print ("MAE\t%.2f"  % mean_absolute_error(y_pred_test, y_test))

Train set results
r2	0.61
RMSE	0.91
MSE	0.83
MAE	0.66
Test set results
r2	0.45
RMSE	1.09
MSE	1.19
MAE	0.75


In [220]:
#Statistical characteristics of model using 3-sigma rule

train_pred = pd.DataFrame(y_train,y_pred_train) 
train_pred['y_pred'] = train_pred.index
train_pred = train_pred.rename(columns = {0: 'y_obs'})
train_pred2 = train_pred.reset_index(drop=True)
train_pred2['Folds'] = 'Train'
train_pred2 = train_pred2.assign(Folds_error = abs(train_pred2['y_pred'] - train_pred2['y_obs']))
train_pred2['Folds error Mean'] = train_pred2['Folds_error'].mean() 
train_pred2['Folds error 3*sigma'] = train_pred2['Folds_error'].std()
train_pred2['Folds error 3*sigma'] = train_pred2['Folds error 3*sigma']*3
train_pred2=train_pred2[train_pred2['Folds_error']<=(train_pred2['Folds error 3*sigma'])] #keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.

r2 = (train_pred2["y_obs"].corr(train_pred2["y_pred"]))    
print(("Train set results"))
print("r^2\t%.2f" % r2)
print ("RMSE\t%.2f" % sqrt(mean_squared_error(train_pred2["y_obs"],train_pred2["y_pred"])))
print ("MSE\t%.2f" % (mean_squared_error(train_pred2["y_obs"],train_pred2["y_pred"])))
print ("MAE\t%.2f"  %mean_absolute_error(train_pred2["y_obs"],train_pred2["y_pred"]))   

test_pred = pd.DataFrame(y_test,y_pred_test) 
test_pred['y_pred'] = test_pred.index
test_pred = test_pred.rename(columns = {0: 'y_obs'})
test_pred2 = test_pred.reset_index(drop=True)
test_pred2['Folds'] = 'Train'
test_pred2 = test_pred2.assign(Folds_error = abs(test_pred2['y_pred'] - test_pred2['y_obs']))
test_pred2['Folds error Mean'] = test_pred2['Folds_error'].mean() 
test_pred2['Folds error 3*sigma'] = test_pred2['Folds_error'].std()
test_pred2['Folds error 3*sigma'] = test_pred2['Folds error 3*sigma']*3
test_pred2 = test_pred2[test_pred2['Folds_error']<=(test_pred2['Folds error 3*sigma'])] #keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.

r2 = (test_pred2["y_obs"].corr(test_pred2["y_pred"]))    
print(("Train set results"))
print("r^2\t%.2f" % r2)
print ("RMSE\t%.2f" % sqrt(mean_squared_error(test_pred2["y_obs"],test_pred2["y_pred"])))
print ("MSE\t%.2f" % (mean_squared_error(test_pred2["y_obs"],test_pred2["y_pred"])))
print ("MAE\t%.2f"  %mean_absolute_error(test_pred2["y_obs"],test_pred2["y_pred"])) 

Train set results
r^2	0.66
RMSE	0.79
MSE	0.62
MAE	0.59
Train set results
r^2	0.38
RMSE	0.81
MSE	0.66
MAE	0.59
