In [20]:

import sklearn as SK
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import *
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
import scipy.stats
from math import sqrt
import os 
import io
from tensorboard.plugins.hparams import api as hp
from utils import utils

# Instantiate the helper objects provided by the project-local `utils` module;
# the later cells call methods on `commons`, `ml_helper` and `model_generator`.
commons = utils.Commons()
shap_helper = utils.Shap_Helper()
ml_helper = utils.ML_Helper()
model_generator = utils.Model_Generator()
ml_helper.model_type = ml_helper.Regression # Always set this variable to the type of model you are building
# NOTE(review): presumably intended to tune TensorFlow's native log verbosity.
# This assignment runs after the import lines of this cell, and "-1" should be
# confirmed as an accepted value for this variable.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "-1"

In [21]:
# Import train, validation and test sets
FOLD = 4
task = 0
# Built with os.path.join so the literal contains no backslash escape sequences.
# On Windows this produces the same ".\data\Eufrasia\splits\Random" value as before,
# and on other platforms it uses the native separator.
path = os.path.join(".", "data", "Eufrasia", "splits", "Random")


def _split_files(tag):
    """Return the sorted CSV paths in `path` whose file name contains `tag` for fold FOLD.

    The split files seen in this directory are named ``<dataset>_<split>_<fold>.csv``.
    Matching on ``_<fold>.csv`` (rather than ``<fold>.csv``) keeps fold 4 from also
    picking up fold 14, 24, ...

    The result is sorted because os.listdir returns entries in arbitrary order; sorting
    makes index `task` deterministic and, for consistently named datasets, makes it
    refer to the same dataset in the train, validation and test lists.

    Raises:
        FileNotFoundError: if no file in `path` matches `tag` and the fold suffix.
    """
    suffix = f"_{FOLD}.csv"
    matches = sorted(
        path + "/" + name
        for name in os.listdir(path)
        if tag in name and name.endswith(suffix)
    )
    if not matches:
        raise FileNotFoundError(f"No '{tag}' split ending in '{suffix}' found in {path}")
    return matches


training = _split_files("train")
validation = _split_files("val")  # validation files are named "..._valid_<fold>.csv"
test = _split_files("test")

TASK_START = 2
NUM_TASKS = 1
SMILES = 'SMILES'
# Available Models are (LGBM,SVM,RF)
SELECTED_MODEL = model_generator.SVM

# Each call returns the loaded table, the target values and the SMILES strings.
train_dataset, y_train, train_smiles = commons.load_dataset(training[task], SMILES, TASK_START, NUM_TASKS)
valid_dataset, y_val, val_smiles = commons.load_dataset(validation[task], SMILES, TASK_START, NUM_TASKS)
test_dataset, y_test, test_smiles = commons.load_dataset(test[task], SMILES, TASK_START, NUM_TASKS)

train_dataset.head()

Loaded dataset .\data\Eufrasia\splits\Random/final-220-NEW-1_train_4.csv with shape: (3754, 3)
Loaded dataset .\data\Eufrasia\splits\Random/final-220-NEW-1_valid_4.csv with shape: (469, 3)
Loaded dataset .\data\Eufrasia\splits\Random/final-220-NEW-1_test_4.csv with shape: (470, 3)


Unnamed: 0,ID,SMILES,VALUE_p
0,1,NC1=C2CCCCC2=NC2=CC=CC=C12,6.318759
1,2,COC1(C2=CN(O)C(=O)C(C(=O)C3C(C)C=CC4CC(C)CCC43...,6.091515
2,3,O=S(=O)(N=C(NCCCCNC1=C2CCCCC2=NC2=CC=CC=C12)N1...,5.599999
3,4,O=C1C2=CC=CC=C2C(=O)N1CCCCCCNCC1=CC=CC=C1,5.676748
4,5,O=C1C2=CC=CC=C2C(=O)N1CCCCCCNCC1=CC=CC=C1F,6.02641


In [22]:
# Compute ECFP (default) fingerprints with RDKit for the train and test molecules.
RADIUS = 2       # fingerprint radius (i.e. diameter 4)
FP_SIZE = 2048   # length of the bit string
FEAT = False     # set to True when pharmacophoric features should be considered
X_train, X_test = (
    commons.assing_fp(smiles, FP_SIZE, RADIUS, FEAT)
    for smiles in (train_smiles, test_smiles)
)

# Flatten the targets to 1-D and cast them to integers.
# NOTE(review): the cast truncates the fractional part of VALUE_p; it looks deliberate
# (the stratified CV used later needs discrete labels) - confirm this is intended.
y_train = np.asarray(y_train.ravel()).astype(int)
y_test = np.asarray(y_test.ravel()).astype(int)


In [23]:
# Repeated, stratified 5-fold cross-validation (3 repeats) with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# Mean squared error is a loss: greater_is_better=False makes the scorer return the
# negated MSE, so the search (which maximises the score) picks the configuration with
# the LOWEST error. With the default (True) it would select the highest-error model.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Look up the estimator template and its search space for the selected model type.
build_model = model_generator.Models[SELECTED_MODEL]
CLASSIFIER = build_model["classifier"]
PARAMS = build_model["params"]

# log-uniform: understand as search over p = exp(x) by varying x

best_model = BayesSearchCV(CLASSIFIER,
    PARAMS,
    n_iter=1, # Number of parameter settings that are sampled (1 = a single configuration is evaluated)
    cv=cv,
    scoring = scorer,
    refit = True, # Refit the best estimator with the entire dataset.
    random_state=42,
    n_jobs = -1
)

best_model.fit(X_train, y_train)

print("Best parameters: %s" % best_model.best_params_)



Best parameters: OrderedDict([('C', 0.0410104548749355), ('degree', 6), ('kernel', 'sigmoid')])


In [24]:
# The search was built with refit=True, so best_estimator_ is already an estimator with
# the best parameters fitted on (X_train, y_train). Reusing it avoids a second, redundant
# fit and leaves the shared CLASSIFIER template from model_generator.Models unmodified
# (set_params would have mutated that template in place).
model = best_model.best_estimator_
#model.probability = True
model


In [25]:
# Collect the statistics text for the fitted model, passing both the training
# and the test data; `text` is appended to the results file in the last cell.
text = ml_helper.get_ML_StatsForNSplits(
    model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Before 3 Sigma:
 Statistic         Value
-----------  ----------
MSE           3.01332
MAE           1.23388
R2           -0.0779146

After 3 Sigma:

Mean error:  1.2338838572189665

Average std error:  1.2210035844172458

Drop list size:  1161

Statistic         Value
-----------  ----------
MSE           2.89549
MAE           1.54879
R2           -0.0963466

Before 3 Sigma:
 Statistic        Value
-----------  ---------
MSE           2.93404
MAE           1.22766
R2           -0.115448

After 3 Sigma:

Mean error:  1.2276595744680852

Average std error:  1.1945269031747796

Drop list size:  152

Statistic        Value
-----------  ---------
MSE           2.59119
MAE           1.4717
R2           -0.137548



In [26]:
# Derive the dataset label from the training file name only, e.g.
# "final-220-NEW-1_train_4.csv" -> "final-220-NEW-1". Working on the base name keeps
# the result correct even if a directory in the path happens to contain "train".
file_stem = os.path.basename(training[task]).replace(".csv", "")
in_this_task = file_stem.replace("train", "").replace(f"__{FOLD}", "")

# Append this fold's statistics to the per-model results file. Each record now ends with
# a newline instead of a bare carriage return, so consecutive entries land on separate lines.
with open(f"results_for {SELECTED_MODEL}.txt", "a") as f:
    f.write(f"{in_this_task} Fold {FOLD}:\n {text}\n")