# 1. Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import MACCSkeys
from copy import deepcopy
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Data entry and curation work set

In [4]:
uploaded_file_ws="datasets/KRAS_work_from_insilico.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pIC50_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  454 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [7]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [9]:
len(y_tr)

454

In [299]:
y_tr=np.array(y_tr)

# Standardization SDF file for work set

In [12]:
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToSmiles(all_mols_ws[i])
    records.append(record)

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ws.append(m)
    
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  454 molecules


In [33]:
moldf_ws=pd.DataFrame(moldf_ws, columns=['Mol'])
moldf_ws

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
1,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
2,<rdkit.Chem.rdchem.Mol object at 0x000001DC567...
3,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
4,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
...,...
449,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
450,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
451,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
452,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...


# Data entry and curation test set

In [17]:
uploaded_file_ts="datasets/KRAS_test_from_insilico.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pIC50_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  114 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [20]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [22]:
len(y_ts)

114

In [289]:
y_ts=np.array(y_ts)

# Standardization SDF file for test set

In [25]:
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToSmiles(all_mols_ts[i])
    records.append(record)

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ts.append(m)
    
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  114 molecules


In [27]:
moldf_ts=pd.DataFrame(moldf_ts, columns=['Mol'])
moldf_ts

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
1,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
2,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
3,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
4,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
...,...
109,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
110,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
111,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...
112,<rdkit.Chem.rdchem.Mol object at 0x000001DC6A2...


# Calculation RDKit_2D descriptors for work set

In [29]:
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
header = calc.GetDescriptorNames()

In [35]:
descr_tr= []
for m in moldf_ws.Mol:
    descr_tr.append(calc.CalcDescriptors(m))
x_tr = np.asarray(descr_tr)

In [37]:
df_RDKit_2D = pd.DataFrame(x_tr,columns=header)

In [39]:
df_RDKit_2D.head(2)

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,16.578266,16.578266,0.130515,-0.52942,0.355629,25.463415,573.072,542.832,572.21028,210.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.476711,16.476711,0.059973,-0.56881,0.234786,18.095238,584.051,556.835,583.178646,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df_RDKit_2D.isna().mean().sort_values(ascending=False).head(15)

BCUT2D_MWHI            0.002203
MaxPartialCharge       0.002203
BCUT2D_MWLOW           0.002203
BCUT2D_LOGPHI          0.002203
BCUT2D_LOGPLOW         0.002203
BCUT2D_MRHI            0.002203
BCUT2D_MRLOW           0.002203
MinAbsPartialCharge    0.002203
MaxAbsPartialCharge    0.002203
MinPartialCharge       0.002203
BCUT2D_CHGHI           0.002203
BCUT2D_CHGLO           0.002203
fr_Ndealkylation2      0.000000
fr_Ndealkylation1      0.000000
fr_N_O                 0.000000
dtype: float64

In [43]:
df_RDKit_2D=df_RDKit_2D.dropna(axis=1)
df_RDKit_2D

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,16.578266,16.578266,0.130515,-0.529420,0.355629,25.463415,573.072,542.832,572.210280,210.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.476711,16.476711,0.059973,-0.568810,0.234786,18.095238,584.051,556.835,583.178646,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16.122576,16.122576,0.116511,-0.523820,0.449128,19.794118,478.959,454.767,478.168415,174.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16.447043,16.447043,0.178167,-0.456371,0.327768,18.666667,520.612,491.380,520.238688,196.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16.443350,16.443350,0.100681,-1.187623,0.477357,28.425000,550.654,514.366,550.286781,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,16.725593,16.725593,0.007777,-0.912746,0.319044,26.409091,600.645,569.397,600.246059,226.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
450,16.621743,16.621743,0.010826,-0.581066,0.332458,23.512195,575.088,542.832,574.225930,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
451,16.887009,16.887009,0.030230,-0.645247,0.269750,24.714286,667.761,628.449,667.308245,254.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
452,15.304572,15.304572,0.075967,-4.807739,0.306165,30.571429,592.613,559.349,592.258515,226.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
x_tr= df_RDKit_2D.to_numpy ()

In [47]:
# Data Standardization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)


In [49]:
x_tr.shape

(454, 198)

In [51]:
savetxt('models/RDKiT/x_tr_RDKiT.csv', x_tr, delimiter=',')

In [279]:
x_tr = np.array(x_tr, dtype=np.float32)
x_tr = np.array(x_tr, dtype=np.float32)

# Calculation RDKit_2D descriptors for test set

In [53]:
descr_ts = []
for m in moldf_ts.Mol:
    descr_ts.append(calc.CalcDescriptors(m))
x_ts = np.asarray(descr_ts)

In [55]:
x_ts.shape

(114, 210)

In [57]:
df_RDKit_2D_ts = pd.DataFrame(x_ts,columns=header)
df_RDKit_2D_ts=df_RDKit_2D_ts.dropna(axis=1)
df_RDKit_2D_ts

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,16.546704,16.546704,0.114958,-0.502508,0.313304,22.150000,556.045,528.829,555.194964,202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.265427,16.265427,0.124335,-0.521930,0.393832,20.277778,507.997,480.781,507.183731,186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16.372442,16.372442,0.105215,-0.848130,0.502232,28.538462,535.643,500.363,535.287115,206.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15.416565,15.416565,0.195899,-0.945490,0.512666,24.948718,536.627,502.355,536.271131,206.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16.356441,16.356441,0.136609,-0.519813,0.336462,18.947368,530.007,504.807,529.179314,192.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,16.981940,16.981940,0.048577,-0.563697,0.257306,24.897959,669.802,625.450,669.343881,258.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,16.775584,16.775584,0.004277,-0.635330,0.313365,23.000000,586.687,550.399,586.286781,224.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,16.628546,16.628546,0.017972,-0.701058,0.317308,23.261905,593.078,561.830,592.216508,218.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,16.470194,16.470194,0.005035,-0.593601,0.369208,23.256410,549.050,518.810,548.210280,202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
x_ts= df_RDKit_2D_ts.to_numpy ()

In [61]:
x_ts = scale.transform(x_ts)

In [63]:
x_ts.shape

(114, 198)

# CatBoostRegressor

In [66]:
cv=KFold(n_splits=5, random_state=42, shuffle=True)

In [68]:
%%time
model = CatBoostRegressor()
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [100,500, 1000]
              }

grid = GridSearchCV(estimator=model, param_grid = parameters, n_jobs=-1, cv = cv)
grid.fit(x_tr, y_tr, verbose=False)

CPU times: total: 2min 7s
Wall time: 39min 35s


0,1,2
,estimator,<catboost.cor...001DC6A2C72F0>
,param_grid,"{'depth': [6, 8, ...], 'iterations': [100, 500, ...], 'learning_rate': [0.01, 0.05, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False


In [69]:
best_CatBR = grid.best_estimator_

In [70]:
grid.best_params_

{'depth': 6, 'iterations': 1000, 'learning_rate': 0.1}

In [71]:
y_pred_ws_GBR = best_CatBR.predict(x_tr)

In [72]:
R2_WS = round(r2_score(y_tr, y_pred_ws_GBR), 2)
R2_WS

1.0

In [73]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_GBR)), 2)
RMSE_WS

0.01

In [74]:
params={'verbose': False}

In [105]:
%%time
y_pred_CV_CatBR = cross_val_predict(best_CatBR, x_tr, y_tr, cv=cv, params=params)

CPU times: total: 3min 22s
Wall time: 15.9 s


In [106]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_CatBR), 2)
Q2_CV

0.59

In [107]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_CatBR)), 2)
RMSE_CV

0.78

# save the model to disk

In [108]:
pickle.dump(best_CatBR, open('Models/RDKiT/CatBoost_RDKiT.pkl', 'wb'))

# load the model from disk

In [69]:
best_CatBR = pickle.load(open('Models/RDKiT/CatBoost_RDKiT.pkl', 'rb'))

# Prediction for test set's molecules

In [110]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [111]:
y_pred_GBR = best_CatBR.predict(x_ts)

In [112]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.66

In [113]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.72

# Estimating applicability domain. Method - Euclidian distances, K=1

In [123]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [125]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,444,445,446,447,448,449,450,451,452,453
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,7.109785,14.086133,4.286247,5.429664,6.402662,7.659821,4.547579,11.186853,1.203329,14.101406,...,4.765282,6.358703,9.202180,1.169768,4.921871,4.921871,4.395028,8.129132,3.738925,3.774373
2,7.790864,14.170631,7.869674,5.692342,7.143091,9.029150,5.692505,12.282191,7.356095,14.614542,...,5.203435,6.720472,10.357517,5.181025,5.427346,5.059224,5.440297,9.202180,4.007168,3.969518
3,8.609716,14.234864,8.125074,6.289952,7.525383,9.040010,5.939624,12.317505,12.442009,14.735225,...,5.435645,6.776452,10.450316,7.041709,5.969554,6.176168,5.542034,10.664644,5.425872,4.432050
4,8.723624,14.254034,8.736418,6.894726,7.591179,9.063340,7.240980,12.465255,13.421886,15.589043,...,6.948438,6.823228,11.932958,7.343806,6.352273,7.009497,5.745015,11.609781,6.395760,4.551218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,26.899402,29.488703,31.342591,28.688984,26.461467,27.776103,29.798038,35.634232,29.298387,32.407568,...,27.231975,26.625942,31.664988,28.518571,26.273328,26.769810,25.702813,28.763580,29.920557,27.514293
450,26.945686,29.771508,31.726254,28.938065,26.924542,28.176175,29.913582,36.120570,30.788403,32.813304,...,27.652883,27.070109,31.732357,29.251441,26.702223,27.742892,25.851647,29.379137,30.707710,27.746439
451,32.308029,33.585506,38.747117,34.945668,34.193879,32.314718,35.917757,41.487540,31.256874,38.442239,...,33.908285,33.259797,31.807279,34.962639,32.680726,32.649063,32.744000,30.716798,35.908859,34.733846
452,34.247246,36.314845,38.824608,36.745295,34.843246,36.428047,36.599580,41.825771,35.913452,38.687807,...,34.555717,34.556647,33.216681,35.841475,32.889616,32.784981,33.458058,32.817031,37.280184,35.336661


In [127]:
similarity= neighbors_k

In [129]:
Dmean=np.mean(similarity[1,:])

In [131]:
round(Dmean, 2)

5.7

In [133]:
std=np.std(similarity[1,:])

In [135]:
round(std, 2)

3.98

In [137]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

7.69


In [139]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [141]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,0.540480,6.266093,5.166173,10.016746,4.414391,6.151157,2.925961,7.790664,7.753076,4.418320,...,1.105312,3.858811,7.770877,6.908525,4.636582,5.787290,4.828034,4.410233,2.806059,4.113101
1,10.095708,6.611399,5.290582,10.026763,4.433574,6.316852,5.248201,8.380529,10.034478,5.616930,...,2.817697,5.803622,7.977025,7.551102,6.062179,8.391107,5.426740,4.909793,4.704615,4.727278
2,10.733137,6.685981,5.592552,10.168751,4.465312,6.410349,5.470187,8.460642,15.252159,6.339448,...,4.172054,7.588089,8.018930,11.036249,6.303319,8.540963,5.427137,5.272047,5.347308,6.399133
3,10.796118,6.823902,6.606743,10.324718,4.905893,8.128436,5.992155,8.540839,17.490673,7.141113,...,5.215362,8.643370,8.206103,11.619511,6.379526,8.714859,5.611925,6.016758,5.666342,6.509562
4,10.802985,6.865792,6.705949,10.343781,6.788551,8.131520,6.241532,8.589679,18.051153,7.293300,...,6.272814,8.659468,8.238439,12.037764,6.456661,10.194406,6.237067,6.341238,6.259065,6.541259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,28.284485,29.466147,27.302204,28.388773,28.359696,29.077018,29.046594,29.456841,32.824306,28.908651,...,30.105742,26.479649,30.371409,30.402452,26.867709,30.050027,26.095854,25.954432,26.857913,26.349289
450,28.336117,29.576844,27.452904,28.558381,28.894754,29.390569,29.125393,30.170856,32.842191,29.241185,...,30.624022,26.855710,31.172341,31.356780,27.037360,30.304590,26.941464,26.729097,27.096141,26.536508
451,34.630423,36.472079,35.095911,34.554527,34.768162,33.780198,36.243474,36.601657,36.229145,36.104114,...,35.864364,33.357380,32.263494,31.499284,32.119894,30.513825,32.213522,32.560316,33.508046,32.854756
452,36.527416,37.640167,35.843812,35.488548,36.971409,36.671323,37.524739,37.419429,38.599912,37.512646,...,37.100654,33.611691,32.421658,32.817179,33.406222,33.461141,32.900519,33.389720,35.047071,33.346265


In [143]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[ 0.54   6.266  5.166 10.017  4.414  6.151  2.926  7.791  7.753  4.418
  9.613  8.526  2.677  5.808  6.793  3.524  7.466  7.231  2.887  4.079
  6.969  6.976  9.574  4.947  4.524  5.779  9.807  2.998  6.808  1.204
  1.493  4.744  5.749 18.048  4.157  5.643  2.95  11.023  8.834  1.29
  2.811  5.996  3.668  3.6    3.087  3.946  4.115  4.049  5.496  6.922
  7.21   3.575  6.327  3.026  5.546  5.564  1.219  4.324  1.714  4.657
  5.384  6.748  2.055  1.233  2.638  3.628  2.514  2.295  2.264  4.58
  7.2    4.58   2.403 13.535  7.173  5.198  2.487  5.197  4.292  6.662
  4.799  4.17   3.405  1.471  6.114  6.515  6.738  7.531  4.643 11.999
  5.422  4.598 29.729  3.474  5.475  6.962  2.103  4.896  7.213 31.184
  4.763  5.488  9.976  4.595  1.105  3.859  7.771  6.909  4.637  5.787
  4.828  4.41   2.806  4.113]


In [145]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True False  True  True  True False False  True False False
  True  True  True  True  True  True  True  True  True  True False  True
  True  True False  True  True  True  True  True  True False  True  True
  True False False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True False  True  True  True
  True  True  True False  True  True False  True  True  True False  True
  True  True  True  True  True  True]


In [147]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.86


In [149]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   1   2   4   5   6   9  12  13  14  15  16  17  18  19  20  21  23
  24  25  27  28  29  30  31  32  34  35  36  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
  64  65  66  67  68  69  70  71  72  74  75  76  77  78  79  80  81  82
  83  84  85  86  87  88  90  91  93  94  95  96  97  98 100 101 103 104
 105 107 108 109 110 111 112 113]


In [151]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [156]:
y_pred_GBR_ad=list(y_pred_GBR)

In [158]:
y_pred_GBR_ad[:] = [x for i,x in enumerate(y_pred_GBR_ad) if i not in out_Ad]

In [160]:
len(y_pred_GBR_ad)

98

In [162]:
y_ts_ad=list(y_ts)

In [164]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [166]:
len(y_ts_ad)

98

In [168]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.65

In [170]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.7

# SVM model building and validation

In [173]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [175]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [177]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [179]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


0,1,2
,estimator,SVR(epsilon=0.2)
,param_grid,"{'C': [1, 10, ...], 'gamma': [1e-06, 1e-05, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.001
,coef0,0.0
,tol,0.001
,C,10
,epsilon,0.2
,shrinking,True
,cache_size,200
,verbose,False


In [180]:
svm.best_params_
best_svm = svm.best_estimator_

In [181]:
svm.best_params_

{'C': 10, 'gamma': 0.001}

In [182]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [183]:
R2_WS = round(r2_score(y_tr, y_pred_ws_svm), 2)
R2_WS

0.76

In [184]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_svm)), 2)
RMSE_WS

0.6

In [185]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [186]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.58

In [187]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.8

# Prediction for test set's molecules

In [207]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [209]:
y_pred_svm = best_svm.predict(x_ts)

In [211]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.57

In [192]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.81

save the model to disk

In [213]:
pickle.dump(best_svm, open('models/RDKiT/SVM_RDKiT.pkl', 'wb'))

load the model from disk

In [105]:
best_svm = pickle.load(open('models/RDKiT/SVM_RDKiT.pkl', 'rb'))

# Estimating applicability domain. Method - Euclidian distances, K=1

In [216]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [218]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,444,445,446,447,448,449,450,451,452,453
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,7.109785,14.086133,4.286247,5.429664,6.402662,7.659821,4.547579,11.186853,1.203329,14.101406,...,4.765282,6.358703,9.202180,1.169768,4.921871,4.921871,4.395028,8.129132,3.738925,3.774373
2,7.790864,14.170631,7.869674,5.692342,7.143091,9.029150,5.692505,12.282191,7.356095,14.614542,...,5.203435,6.720472,10.357517,5.181025,5.427346,5.059224,5.440297,9.202180,4.007168,3.969518
3,8.609716,14.234864,8.125074,6.289952,7.525383,9.040010,5.939624,12.317505,12.442009,14.735225,...,5.435645,6.776452,10.450316,7.041709,5.969554,6.176168,5.542034,10.664644,5.425872,4.432050
4,8.723624,14.254034,8.736418,6.894726,7.591179,9.063340,7.240980,12.465255,13.421886,15.589043,...,6.948438,6.823228,11.932958,7.343806,6.352273,7.009497,5.745015,11.609781,6.395760,4.551218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,26.899402,29.488703,31.342591,28.688984,26.461467,27.776103,29.798038,35.634232,29.298387,32.407568,...,27.231975,26.625942,31.664988,28.518571,26.273328,26.769810,25.702813,28.763580,29.920557,27.514293
450,26.945686,29.771508,31.726254,28.938065,26.924542,28.176175,29.913582,36.120570,30.788403,32.813304,...,27.652883,27.070109,31.732357,29.251441,26.702223,27.742892,25.851647,29.379137,30.707710,27.746439
451,32.308029,33.585506,38.747117,34.945668,34.193879,32.314718,35.917757,41.487540,31.256874,38.442239,...,33.908285,33.259797,31.807279,34.962639,32.680726,32.649063,32.744000,30.716798,35.908859,34.733846
452,34.247246,36.314845,38.824608,36.745295,34.843246,36.428047,36.599580,41.825771,35.913452,38.687807,...,34.555717,34.556647,33.216681,35.841475,32.889616,32.784981,33.458058,32.817031,37.280184,35.336661


In [220]:
similarity= neighbors_k

In [222]:
Dmean=np.mean(similarity[1,:])

In [224]:
round(Dmean, 2)

5.7

In [226]:
std=np.std(similarity[1,:])

In [228]:
round(std, 2)

3.98

In [230]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

7.69


In [232]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [234]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,0.540480,6.266093,5.166173,10.016746,4.414391,6.151157,2.925961,7.790664,7.753076,4.418320,...,1.105312,3.858811,7.770877,6.908525,4.636582,5.787290,4.828034,4.410233,2.806059,4.113101
1,10.095708,6.611399,5.290582,10.026763,4.433574,6.316852,5.248201,8.380529,10.034478,5.616930,...,2.817697,5.803622,7.977025,7.551102,6.062179,8.391107,5.426740,4.909793,4.704615,4.727278
2,10.733137,6.685981,5.592552,10.168751,4.465312,6.410349,5.470187,8.460642,15.252159,6.339448,...,4.172054,7.588089,8.018930,11.036249,6.303319,8.540963,5.427137,5.272047,5.347308,6.399133
3,10.796118,6.823902,6.606743,10.324718,4.905893,8.128436,5.992155,8.540839,17.490673,7.141113,...,5.215362,8.643370,8.206103,11.619511,6.379526,8.714859,5.611925,6.016758,5.666342,6.509562
4,10.802985,6.865792,6.705949,10.343781,6.788551,8.131520,6.241532,8.589679,18.051153,7.293300,...,6.272814,8.659468,8.238439,12.037764,6.456661,10.194406,6.237067,6.341238,6.259065,6.541259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,28.284485,29.466147,27.302204,28.388773,28.359696,29.077018,29.046594,29.456841,32.824306,28.908651,...,30.105742,26.479649,30.371409,30.402452,26.867709,30.050027,26.095854,25.954432,26.857913,26.349289
450,28.336117,29.576844,27.452904,28.558381,28.894754,29.390569,29.125393,30.170856,32.842191,29.241185,...,30.624022,26.855710,31.172341,31.356780,27.037360,30.304590,26.941464,26.729097,27.096141,26.536508
451,34.630423,36.472079,35.095911,34.554527,34.768162,33.780198,36.243474,36.601657,36.229145,36.104114,...,35.864364,33.357380,32.263494,31.499284,32.119894,30.513825,32.213522,32.560316,33.508046,32.854756
452,36.527416,37.640167,35.843812,35.488548,36.971409,36.671323,37.524739,37.419429,38.599912,37.512646,...,37.100654,33.611691,32.421658,32.817179,33.406222,33.461141,32.900519,33.389720,35.047071,33.346265


In [235]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[ 0.54   6.266  5.166 10.017  4.414  6.151  2.926  7.791  7.753  4.418
  9.613  8.526  2.677  5.808  6.793  3.524  7.466  7.231  2.887  4.079
  6.969  6.976  9.574  4.947  4.524  5.779  9.807  2.998  6.808  1.204
  1.493  4.744  5.749 18.048  4.157  5.643  2.95  11.023  8.834  1.29
  2.811  5.996  3.668  3.6    3.087  3.946  4.115  4.049  5.496  6.922
  7.21   3.575  6.327  3.026  5.546  5.564  1.219  4.324  1.714  4.657
  5.384  6.748  2.055  1.233  2.638  3.628  2.514  2.295  2.264  4.58
  7.2    4.58   2.403 13.535  7.173  5.198  2.487  5.197  4.292  6.662
  4.799  4.17   3.405  1.471  6.114  6.515  6.738  7.531  4.643 11.999
  5.422  4.598 29.729  3.474  5.475  6.962  2.103  4.896  7.213 31.184
  4.763  5.488  9.976  4.595  1.105  3.859  7.771  6.909  4.637  5.787
  4.828  4.41   2.806  4.113]


In [238]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True False  True  True  True False False  True False False
  True  True  True  True  True  True  True  True  True  True False  True
  True  True False  True  True  True  True  True  True False  True  True
  True False False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True False  True  True  True
  True  True  True False  True  True False  True  True  True False  True
  True  True  True  True  True  True]


In [240]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.86


In [242]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   1   2   4   5   6   9  12  13  14  15  16  17  18  19  20  21  23
  24  25  27  28  29  30  31  32  34  35  36  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
  64  65  66  67  68  69  70  71  72  74  75  76  77  78  79  80  81  82
  83  84  85  86  87  88  90  91  93  94  95  96  97  98 100 101 103 104
 105 107 108 109 110 111 112 113]


In [244]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [246]:
y_pred_svm_ad=list(y_pred_svm)

In [249]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [251]:
len(y_pred_svm_ad)

98

In [253]:
y_ts_ad=list(y_ts)

In [255]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [257]:
len(y_ts_ad)

98

In [259]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.6

In [261]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.75

# Multi-layer Perceptron regressor

In [264]:
from sklearn.neural_network import MLPRegressor

In [266]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [268]:
param_grid ={"hidden_layer_sizes": [(400, 300, 200, 100),(100, 100, 100), (10, 10, 10),(50,)], "activation": ["tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": [0.00005,0.0005], 'max_iter': [1000, 2000]}

In [275]:
m = GridSearchCV(MLPRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [301]:
x_tr=x_tr.astype(float)
y_tr=y_tr.astype(float)

In [303]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


0,1,2
,estimator,MLPRegressor()
,param_grid,"{'activation': ['tanh', 'relu'], 'alpha': [5e-05, 0.0005], 'hidden_layer_sizes': [(400, ...), (100, ...), ...], 'max_iter': [1000, 2000], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(10, ...)"
,activation,'tanh'
,solver,'sgd'
,alpha,5e-05
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,1000


In [304]:
best_MLPR = m.best_estimator_

In [305]:
m.best_params_

{'activation': 'tanh',
 'alpha': 5e-05,
 'hidden_layer_sizes': (10, 10, 10),
 'max_iter': 1000,
 'solver': 'sgd'}

In [306]:
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [307]:
R2_WS = round(r2_score(y_tr, y_pred_ws_MLPR), 2)
R2_WS

0.91

In [308]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_MLPR)), 2)
RMSE_WS

0.36

In [309]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [310]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_MLPR), 2)
Q2_CV

0.46

In [311]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_MLPR)), 2)
RMSE_CV

0.9

# Prediction for test set's molecules

In [313]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [314]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [315]:
Q2_TS = round(r2_score(y_ts, y_pred_MLPR), 2)
Q2_TS

0.49

In [316]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_MLPR)), 2)
RMSE_TS

0.88

# save the model to disk

In [330]:
pickle.dump(best_MLPR, open('models/RDKiT/MLPR_RDKiT.pkl', 'wb'))

# load the model from disk

In [232]:
best_MLPR = pickle.load(open('models/RDKiT/MLPR_RDKiT.pkl', 'rb'))

# Estimating applicability domain. Method - Euclidian distances, K=1

In [333]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [334]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,444,445,446,447,448,449,450,451,452,453
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,7.109785,14.086133,4.286247,5.429664,6.402662,7.659822,4.547579,11.186853,1.203329,14.101406,...,4.765282,6.358703,9.202180,1.169768,4.921871,4.921871,4.395028,8.129132,3.738925,3.774373
2,7.790864,14.170631,7.869674,5.692342,7.143091,9.029150,5.692505,12.282191,7.356095,14.614541,...,5.203435,6.720472,10.357517,5.181025,5.427346,5.059224,5.440297,9.202180,4.007168,3.969518
3,8.609716,14.234864,8.125074,6.289952,7.525383,9.040010,5.939624,12.317505,12.442008,14.735225,...,5.435645,6.776452,10.450315,7.041709,5.969554,6.176168,5.542034,10.664644,5.425872,4.432050
4,8.723624,14.254034,8.736418,6.894726,7.591179,9.063340,7.240980,12.465255,13.421886,15.589043,...,6.948438,6.823228,11.932959,7.343806,6.352273,7.009497,5.745015,11.609781,6.395760,4.551218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,26.899402,29.488703,31.342591,28.688984,26.461467,27.776103,29.798038,35.634232,29.298387,32.407568,...,27.231974,26.625942,31.664988,28.518571,26.273327,26.769809,25.702813,28.763580,29.920557,27.514293
450,26.945686,29.771508,31.726254,28.938064,26.924542,28.176176,29.913581,36.120570,30.788403,32.813304,...,27.652883,27.070109,31.732357,29.251441,26.702224,27.742893,25.851648,29.379137,30.707710,27.746439
451,32.308028,33.585506,38.747116,34.945668,34.193879,32.314718,35.917757,41.487540,31.256875,38.442238,...,33.908284,33.259797,31.807280,34.962638,32.680725,32.649062,32.744000,30.716798,35.908859,34.733845
452,34.247246,36.314845,38.824607,36.745295,34.843246,36.428046,36.599580,41.825771,35.913451,38.687807,...,34.555717,34.556646,33.216681,35.841475,32.889616,32.784981,33.458058,32.817031,37.280184,35.336661


In [336]:
similarity= neighbors_k

In [339]:
Dmean=np.mean(similarity[1,:])

In [340]:
round(Dmean, 2)

5.7

In [342]:
std=np.std(similarity[1,:])

In [344]:
round(std, 2)

3.98

In [345]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

7.69


In [347]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [350]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,0.540480,6.266093,5.166173,10.016746,4.414391,6.151157,2.925961,7.790663,7.753077,4.418320,...,1.105312,3.858811,7.770877,6.908525,4.636582,5.787290,4.828034,4.410233,2.806059,4.113101
1,10.095708,6.611399,5.290582,10.026763,4.433574,6.316852,5.248201,8.380529,10.034478,5.616930,...,2.817697,5.803622,7.977025,7.551102,6.062179,8.391107,5.426740,4.909793,4.704615,4.727278
2,10.733137,6.685981,5.592552,10.168751,4.465312,6.410349,5.470187,8.460642,15.252159,6.339448,...,4.172054,7.588089,8.018930,11.036249,6.303319,8.540963,5.427137,5.272047,5.347308,6.399133
3,10.796118,6.823902,6.606743,10.324718,4.905893,8.128436,5.992155,8.540839,17.490673,7.141113,...,5.215362,8.643370,8.206103,11.619511,6.379526,8.714859,5.611925,6.016758,5.666342,6.509562
4,10.802985,6.865792,6.705949,10.343781,6.788551,8.131520,6.241532,8.589679,18.051153,7.293300,...,6.272814,8.659468,8.238439,12.037764,6.456661,10.194406,6.237067,6.341238,6.259065,6.541259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,28.284485,29.466147,27.302204,28.388773,28.359696,29.077019,29.046594,29.456840,32.824306,28.908650,...,30.105741,26.479649,30.371409,30.402452,26.867709,30.050027,26.095854,25.954432,26.857913,26.349289
450,28.336117,29.576844,27.452903,28.558381,28.894754,29.390569,29.125393,30.170856,32.842191,29.241184,...,30.624022,26.855711,31.172341,31.356780,27.037361,30.304591,26.941464,26.729098,27.096140,26.536507
451,34.630423,36.472079,35.095911,34.554526,34.768162,33.780197,36.243474,36.601657,36.229145,36.104113,...,35.864364,33.357380,32.263494,31.499285,32.119893,30.513825,32.213522,32.560316,33.508046,32.854755
452,36.527416,37.640167,35.843811,35.488547,36.971409,36.671323,37.524739,37.419428,38.599912,37.512646,...,37.100654,33.611690,32.421657,32.817178,33.406222,33.461141,32.900519,33.389719,35.047071,33.346264


In [351]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[ 0.54   6.266  5.166 10.017  4.414  6.151  2.926  7.791  7.753  4.418
  9.613  8.526  2.677  5.808  6.793  3.524  7.466  7.231  2.887  4.079
  6.969  6.976  9.574  4.947  4.524  5.779  9.807  2.998  6.808  1.204
  1.493  4.744  5.749 18.048  4.157  5.643  2.95  11.023  8.834  1.29
  2.811  5.996  3.668  3.6    3.087  3.946  4.115  4.049  5.496  6.922
  7.21   3.575  6.327  3.026  5.546  5.564  1.219  4.324  1.714  4.657
  5.384  6.748  2.055  1.233  2.638  3.628  2.514  2.295  2.264  4.58
  7.2    4.58   2.403 13.535  7.173  5.198  2.487  5.197  4.292  6.662
  4.799  4.17   3.405  1.471  6.114  6.515  6.738  7.531  4.643 11.999
  5.422  4.598 29.729  3.474  5.475  6.962  2.103  4.896  7.213 31.184
  4.763  5.488  9.976  4.595  1.105  3.859  7.771  6.909  4.637  5.787
  4.828  4.41   2.806  4.113]


In [353]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True False  True  True  True False False  True False False
  True  True  True  True  True  True  True  True  True  True False  True
  True  True False  True  True  True  True  True  True False  True  True
  True False False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True False  True  True  True
  True  True  True False  True  True False  True  True  True False  True
  True  True  True  True  True  True]


In [354]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.86


In [359]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   1   2   4   5   6   9  12  13  14  15  16  17  18  19  20  21  23
  24  25  27  28  29  30  31  32  34  35  36  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
  64  65  66  67  68  69  70  71  72  74  75  76  77  78  79  80  81  82
  83  84  85  86  87  88  90  91  93  94  95  96  97  98 100 101 103 104
 105 107 108 109 110 111 112 113]


In [361]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [364]:
y_pred_MLPR_ad=list(y_pred_MLPR)

In [366]:
y_pred_MLPR_ad[:] = [x for i,x in enumerate(y_pred_MLPR_ad) if i not in out_Ad]

In [368]:
len(y_pred_MLPR_ad)

98

In [370]:
y_ts_ad=list(y_ts)

In [372]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [374]:
len(y_ts_ad)

98

In [376]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_MLPR_ad), 2)
Q2_TS

0.51

In [378]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_MLPR_ad)), 2)
RMSE_TS

0.83