In [1]:

import sklearn as SK
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import *
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
import scipy.stats
from math import sqrt
import os 
import io
from tensorboard.plugins.hparams import api as hp
from utils import utils

# Helper objects supplied by the project-local `utils` module.
commons = utils.Commons()
shap_helper = utils.Shap_Helper()
ml_helper = utils.ML_Helper()
model_generator = utils.Model_Generator()

# Declare the kind of model being built. The value is set on the ML helper
# first and then mirrored onto the model generator so both always agree.
# Always update this when switching between task types.
ml_helper.model_type = ml_helper.Regression
model_generator.model_type = ml_helper.model_type

# NOTE(review): presumably controls TensorFlow's native log verbosity;
# confirm that "-1" is the intended level.
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "-1"})

In [2]:
# Import the train, validation and test sets for one cross-validation fold.
FOLD = 4

# Root folders for each task type; the task and split sub-folders are appended below.
classification_path = "./data/Classification/"
regression_path = "./data/Regression/"

task_name = "Tb.brucei/"

# Available split strategies (each is a sub-folder name).
scaffold_split = "scaffold_split/"
random_split = "random_split/"

# Split strategy used by this run.
split_type = scaffold_split

classification_path += task_name + split_type
regression_path += task_name + split_type

# Per-fold CSV file names.
train_path = f"train_fold_{FOLD}.csv"
valid_path = f"valid_fold_{FOLD}.csv"
test_path = f"test_fold_{FOLD}.csv"

# Full file locations; this notebook reads from the regression folder.
training = f"{regression_path}{train_path}"
validation = f"{regression_path}{valid_path}"
test = f"{regression_path}{test_path}"

# Arguments forwarded to commons.load_dataset.
# NOTE(review): TASK_START / NUM_TASKS presumably select the target column(s) - confirm in utils.
TASK_START = 2
NUM_TASKS = 1
SMILES = 'SMILES'

# Available Models are (LGBM,SVM,RF)
SELECTED_MODEL = model_generator.RF

# Each load returns a (dataset, targets, smiles) triple.
train_dataset, y_train, train_smiles = commons.load_dataset(
    training, SMILES, TASK_START, NUM_TASKS
)
valid_dataset, y_val, val_smiles = commons.load_dataset(
    validation, SMILES, TASK_START, NUM_TASKS
)
test_dataset, y_test, test_smiles = commons.load_dataset(
    test, SMILES, TASK_START, NUM_TASKS
)

# Preview the first rows of the training set.
train_dataset.head()

Loaded dataset ./data/Regression/Tb.brucei/scaffold_split/train_fold_4.csv with shape: (500, 3)
Loaded dataset ./data/Regression/Tb.brucei/scaffold_split/valid_fold_4.csv with shape: (63, 3)
Loaded dataset ./data/Regression/Tb.brucei/scaffold_split/test_fold_4.csv with shape: (62, 3)


Unnamed: 0,ID,SMILES,Tb.brucei
0,4317,CN(C1C=CC(C=CC=NNC(=S)N)=CC=1)C,4.751781
1,3961,CCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.69897
2,4169,OCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,5.387216
3,3988,CCCCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.39794
4,4002,CCCCCCCCOC(OCCC1C=C(O)C(O)=CC=1)=O,6.30103


In [3]:
# Fingerprint settings (per the original author's notes: ECFP by default, computed with RDKit).
RADIUS = 2       # radius 2, i.e. diameter 4
FP_SIZE = 2048   # length of the bit string
FEAT = False     # enable when pharmacophoric features should be considered

# Featurize the train and test SMILES with identical settings.
X_train = commons.assing_fp(train_smiles, FP_SIZE, RADIUS, FEAT)
X_test = commons.assing_fp(test_smiles, FP_SIZE, RADIUS, FEAT)

# Flatten the targets to 1-D and cast them to integers.
# NOTE(review): the int cast truncates the continuous target values shown in the
# data preview; the stratified CV used later needs discrete labels, so confirm
# this is intended before changing it.
y_train = np.array(y_train.ravel()).astype(int)
y_test = np.array(y_test.ravel()).astype(int)


In [4]:
# Cross-validation scheme: 5 stratified folds, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# The search always MAXIMISES the score. Mean squared error is a loss (lower is
# better), so it must be wrapped with greater_is_better=False; the scorer then
# reports the negated MSE and the search picks the lowest-error configuration.
# Without this flag the search would favour the settings with the HIGHEST error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Estimator and search space registered for the selected model type.
build_model = model_generator.Models[SELECTED_MODEL]
CLASSIFIER = build_model["classifier"]
PARAMS = build_model["params"]

# log-uniform: understand as search over p = exp(x) by varying x

best_model = BayesSearchCV(
    CLASSIFIER,
    PARAMS,
    n_iter=1,  # Number of parameter settings that are sampled
    cv=cv,
    scoring=scorer,
    refit=True,  # Refit the best estimator with the entire dataset.
    random_state=42,
    n_jobs=-1,
)

best_model.fit(X_train, y_train)

print("Best parameters: %s" % best_model.best_params_)



Best parameters: OrderedDict([('max_depth', 5), ('max_features', 'sqrt'), ('n_estimators', 140)])


In [5]:
# Apply the best hyperparameters found by the search to the base estimator,
# then train it on the full training set.
# (`model.probability = True` was left disabled by the original author.)
tuned_params = best_model.best_params_
model = CLASSIFIER.set_params(**tuned_params)
model.fit(X_train, y_train)


In [6]:
# Report the fitted model's statistics on the training and test data.
ml_helper.get_ML_StatsForNSplits(
    model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Before 3 Sigma:
 Statistic       Value
-----------  --------
MSE          0.766
MAE          0.53
R2           0.224621

After 3 Sigma:

Mean error:  0.53

Average std error:  0.696491205974634

Drop list size:  290

Statistic         Value
-----------  ----------
MSE           1.41429
MAE           1.1381
R2           -0.0442164

Before 3 Sigma:
 Statistic        Value
-----------  ---------
MSE           1.37097
MAE           0.758065
R2           -0.230159

After 3 Sigma:

Mean error:  0.7580645161290323

Average std error:  0.892359754427294

Drop list size:  34

Statistic        Value
-----------  ---------
MSE           1.75
MAE           1.25
R2           -0.465812

