In [1]:
# === Basic libraries ===
import numpy as np
import pandas as pd
import time

# === Feature generation ===
from rdkit import Chem
from mordred import Calculator, descriptors
from sklearn import preprocessing as pp
import seaborn as sns
from sklearn.decomposition import PCA

# === Classifiers ===
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, cv, DMatrix

# === Metrics and cross-validation ====
from sklearn import metrics as met
from sklearn.model_selection import cross_val_predict, LeaveOneOut, KFold

# === Neural networks ===
import tensorflow as tf
from keras import Model, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.data import Iterator

In [None]:
# === Read in files with X,Y ===
RS_XY = pd.read_csv('rsxy_v1.csv')
clist = list(RS_XY['SMILES'])
y = np.array(RS_XY['Sens'])
y = np.reshape(y,(-1,1))   # column vector; passed to qsar_ann_cv
yr = y.ravel()             # flat labels; passed to qsar_cv

# === Calculate descriptors ===
calc = Calculator(descriptors)
# NOTE(review): `_name_dict` is a private mordred attribute; kept because the
# column names written later are taken from it.
dlist = list(calc._name_dict.keys())

# Size the feature matrix from the calculator rather than hard-coding 1826, so
# a different descriptor set cannot cause a shape mismatch on assignment.
X = np.zeros(shape=(len(clist),len(calc.descriptors)))
for i, smi in enumerate(clist):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse; fail with the offending row.
        raise ValueError(f'Row {i}: RDKit could not parse SMILES {smi!r}')
    X[i,:] = calc(mol)
sh1 = np.shape(X)
print(f'Shape | raw: {sh1}')

In [None]:
# === Filter data and scale ===
# Keep a handle on the unfiltered matrix; it is exported to CSV further down.
X0 = X

# Drop every descriptor column that contains at least one NaN ...
has_nan = np.isnan(X).any(axis=0)
X = X[:, ~has_nan]

# ... then every column that is constant across all molecules.
is_varying = X.var(axis=0) != 0
X = X[:, is_varying]

# Rescale each remaining column with a min-max scaler fitted on the full matrix.
scaler = pp.MinMaxScaler()
scaler.fit(X)
Xs = scaler.transform(X)

sh2 = Xs.shape
print(f'Shape | filtered/scaled: {sh2}')

In [None]:
# === Export raw and filtered descriptor tables ===
# Raw descriptors, labelled with the descriptor names.
X2 = pd.DataFrame(X0, columns=dlist)
X2.to_csv('temp_out/X.csv', header=True)

# Same filtering as the NumPy path: drop columns with missing values, then
# columns with zero variance. The values written here are NOT scaled.
X2 = X2.dropna(axis=1)
keep = X2.var() != 0
X2 = X2.loc[:, keep]
X2.to_csv('temp_out/Xs.csv', header=True)

In [None]:
# === Heatmap of feature correlation ===
# Pairwise correlation between the scaled descriptor columns.
Xs_pd = pd.DataFrame(Xs)
corr_scaled = Xs_pd.corr()
hm1 = sns.heatmap(corr_scaled)
hm1.figure.savefig(
    'temp_out/Xs.tiff',
    dpi=300,
    pil_kwargs={"compression": "tiff_lzw"},
)

In [None]:
# === Conduct PCA and display updated heatmap ===
# Pass the seed itself: np.random.seed(0) returns None, so the previous code
# handed random_state=None to PCA and only reseeded NumPy's global generator
# as a side effect.
pca = PCA(n_components=50,random_state=0)
pca.fit(Xs_pd)
Xr = pca.transform(Xs_pd)
Xr_pd = pd.DataFrame(Xr)

# Correlation between the 50 retained components.
hm2 = sns.heatmap(Xr_pd.corr())
hm2.figure.savefig('temp_out/Xr.tiff',dpi=300,pil_kwargs={"compression": "tiff_lzw"})

In [None]:
# === Define creation of artificial neural network ===
def create_ann(d,l):
    """Build and compile the feed-forward network used for binary classification.

    Layers: Input(l) -> Dropout(d) -> Dense(50, relu) -> Dense(25, relu)
    -> Dense(1, linear).

    The output unit is linear and the loss is
    ``BinaryCrossentropy(from_logits=True)``, so ``model.predict`` returns raw
    logits, not probabilities. Callers must apply a sigmoid (or threshold at 0)
    before turning predictions into class labels.

    Parameters
    ----------
    d : float
        Dropout rate applied directly to the input features.
    l : int
        Number of input features.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with Adam (learning rate 1e-4).
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=(l,)))
    # The explicit Input layer already fixes the input width; repeating
    # `input_shape` on the Dropout layer was redundant.
    model.add(Dropout(d))
    # L1(0) is a zero-strength regulariser, kept as a tuning hook.
    model.add(Dense(50,activation='relu',name='hl_2',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.add(Dense(25,activation='relu',name='hl_3',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.add(Dense(1,activation='linear',name='l_o',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
    return model

In [None]:
# === Cross validation function ===
def qsar_cv(X,y,n,m,p1,p2,p3):
    """Evaluate one classical classifier with cross-validated predictions, repeated `n` times.

    Each repetition builds a fresh model seeded with the repetition index,
    obtains out-of-fold probabilities and labels via ``cross_val_predict``
    (called with its default ``cv``), and scores them against `y`.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary labels (1-D).
    n : int
        Number of repetitions; must be at least 1.
    m : {'LR', 'SVM', 'RF', 'GBT'}
        Model family.
    p1 : object
        Solver (LR), kernel (SVM), n_estimators (RF) or max_depth (GBT).
    p2 : object
        max_depth for RF; only echoed in the result for other models.
    p3 : object
        Not used by any model; only echoed in the result.

    Returns
    -------
    ret : dict
        Model name, parameters, mean accuracy/precision/recall/F1/ROC-AUC over
        the repetitions, and elapsed wall-clock seconds.
    prc : tuple
        Output of ``precision_recall_curve`` for the LAST repetition only.
    roc : tuple
        Output of ``roc_curve`` for the LAST repetition only.

    Raises
    ------
    ValueError
        If `n` < 1 or `m` is not a recognised model key.
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')

    acc, pre, rec, f1s, auc = (np.zeros(n) for _ in range(5))
    t_sta = time.perf_counter()

    for i in range(n):
        # Pass the integer seed itself. np.random.seed(i) returns None, so the
        # estimators previously received random_state=None and drew from the
        # global generator, which let the two cross_val_predict calls below
        # fit differently-seeded models.
        if m == 'LR':
            model = LogisticRegression(solver=p1,random_state=i,max_iter=200)
        elif m == 'SVM':
            model = svm.SVC(kernel=p1,random_state=i,probability=True)
        elif m == 'RF':
            model = RandomForestClassifier(n_estimators=p1,max_depth=p2,random_state=i)
        elif m == 'GBT':
            model = XGBClassifier(max_depth=p1,random_state=i)
        else:
            raise ValueError(f"Unknown model {m!r}; expected 'LR', 'SVM', 'RF' or 'GBT'")

        prob = cross_val_predict(model, X, y, method='predict_proba')[:,1]
        pred = cross_val_predict(model, X, y, method='predict')

        acc[i] = met.accuracy_score(y,pred)
        pre[i] = met.precision_score(y,pred)
        rec[i] = met.recall_score(y,pred)
        f1s[i] = met.f1_score(y,pred)

        auc[i] = met.roc_auc_score(y,prob,average='micro')
        prc = met.precision_recall_curve(y,prob)
        roc = met.roc_curve(y,prob)

    t_end = time.perf_counter()
    t_ela = t_end-t_sta

    m_acc = np.mean(acc)
    m_pre = np.mean(pre)
    m_rec = np.mean(rec)
    m_f1s = np.mean(f1s)
    # Average the per-repetition AUC values (previously averaged the `roc`
    # curve tuple, which is not an AUC at all).
    m_auc = np.mean(auc)

    ret = {'Model':m, 'Param_1':p1, 'Param_2':p2, 'Param_3':p3, 'Accuracy':m_acc, 'Precision':m_pre, 'Recall':m_rec, 'F1_Score':m_f1s, 'ROC_AUC':m_auc, 'Time':t_ela}
    return ret, prc, roc

In [None]:
# === Artificial neural network cross validation ===
def qsar_ann_cv(X, y, nspl, nsed, p1, p2, p3):
    """K-fold cross-validation of the neural network, repeated over several seeds.

    For each seed in ``range(nsed)`` the TensorFlow/Keras seed is set and a new
    network from ``create_ann`` is trained on every fold of a shuffled
    ``KFold`` (fixed ``random_state=42``, so the fold assignment is the same
    for every seed). Metrics are computed per fold and averaged over all
    ``nspl * nsed`` fits.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row.
    y : numpy.ndarray
        Binary labels, indexed by row.
    nspl : int
        Number of folds.
    nsed : int
        Number of seeds (repetitions).
    p1 : int
        Training epochs.
    p2 : float
        Dropout rate passed to ``create_ann``.
    p3 : object
        Not used; only echoed in the result.

    Returns
    -------
    ret : dict
        Parameters, mean accuracy/precision/recall/F1/ROC-AUC and elapsed seconds.
    prc : tuple
        Output of ``precision_recall_curve`` for the LAST fold of the last seed only.
    roc : tuple
        Output of ``roc_curve`` for the LAST fold of the last seed only.
    """

    # === Accumulate accuracy ===
    acc, pre, rec, f1s, auc = (np.zeros(nspl*nsed) for _ in range(5))
    i = 0

    # === K-fold cross validation ===
    t_sta = time.perf_counter()
    kfold = KFold(n_splits=nspl, shuffle=True, random_state=42)
    for rs in range(nsed):
        tf.keras.utils.set_random_seed(rs)
        for trn_i, tst_i in kfold.split(X):

            # === Split data ===
            X_trn, X_tst = X[trn_i], X[tst_i]
            y_trn, y_tst = y[trn_i], y[tst_i]

            # === Create and train model ===
            model = create_ann(p2,np.shape(X)[1])
            model.fit(X_trn, y_trn, epochs=p1, verbose=0)

            # === Make predictions ===
            # create_ann ends in a linear unit trained with from_logits=True,
            # so predict() yields logits; map them to probabilities before
            # scoring and thresholding.
            logits = np.ravel(model.predict(X_tst))
            prob = tf.math.sigmoid(logits).numpy()
            y_true = np.ravel(y_tst)

            # Ranking metrics use the continuous scores.
            auc[i] = met.roc_auc_score(y_true,prob,average='micro')
            prc = met.precision_recall_curve(y_true,prob)
            roc = met.roc_curve(y_true,prob)

            # Hard labels: probability >= 0.5 (equivalently logit >= 0).
            # Previously the raw logits were compared with 0.5, and auc[i] was
            # then overwritten with an AUC computed on these hard labels.
            yh_tst = (prob >= 0.5).astype(int)

            acc[i] = met.accuracy_score(y_true,yh_tst)
            pre[i] = met.precision_score(y_true,yh_tst)
            rec[i] = met.recall_score(y_true,yh_tst)
            f1s[i] = met.f1_score(y_true,yh_tst)

            i += 1
        print(f'Completed seed {rs}.')

    t_end = time.perf_counter()
    t_ela = t_end-t_sta

    m_acc = np.mean(acc)
    m_pre = np.mean(pre)
    m_rec = np.mean(rec)
    m_f1s = np.mean(f1s)
    m_auc = np.mean(auc)

    ret = {'Model':'ANN', 'Param_1':p1, 'Param_2':p2, 'Param_3':p3, 'Accuracy':m_acc, 'Precision':m_pre, 'Recall':m_rec, 'F1_Score':m_f1s, 'ROC_AUC':m_auc, 'Time':t_ela}
    return ret,prc,roc

In [None]:
# === Initialize metrics dataframe ===
# One row per evaluated model configuration.
metrics = pd.DataFrame(
    columns=[
        'Model', 'Param_1', 'Param_2', 'Param_3',
        'Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC',
        'Time', 'Data',
    ]
)
# Long-format curve points: precision/recall and FPR/TPR, each tagged with a
# model label in column 'M'.
prc_m = pd.DataFrame(columns=['P', 'R', 'M'])
roc_m = pd.DataFrame(columns=['F', 'T', 'M'])

# === PRC and ROC processing function ===
def prc_roc(prc,roc,lab):
    """Convert precision-recall and ROC curve tuples into labelled long-format frames.

    Only the first two arrays of each tuple are kept (any further elements,
    such as thresholds, are dropped).

    Parameters
    ----------
    prc : sequence of array-like
        First two entries become columns 'P' and 'R'.
    roc : sequence of array-like
        First two entries become columns 'F' and 'T'.
    lab : str
        Label repeated in column 'M' of both frames.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The precision-recall frame and the ROC frame.
    """
    def _labelled(curve, names):
        # Two arrays stacked as rows, then flipped so each array is a column.
        frame = pd.DataFrame(curve[0:2]).transpose()
        frame.columns = names
        frame['M'] = [lab] * len(curve[0])
        return frame

    return _labelled(prc, ['P','R']), _labelled(roc, ['F','T'])

In [None]:
# === Logistic regression and support vector machines ===
# The CSV is opened in append mode, so remember how many rows `metrics` already
# holds and export only the rows produced by this cell. Writing the whole frame
# would duplicate earlier rows whenever the cell is re-run or run after others.
n_met0 = len(metrics)

# --- Logistic regression: every solver on scaled, then on PCA-reduced data ---
lr_solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
for Xd, dname, dtag in [(Xs, 'Scaled', 'S'), (Xr, 'Reduced', 'R')]:
    for solver in lr_solvers:
        res,prc,roc = qsar_cv(Xd,yr,100,'LR',solver,0,0)
        res['Data'] = dname
        metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
        prc2,roc2 = prc_roc(prc,roc,f'LR.{dtag}.{solver}')
        prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
        roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)
        print(solver)
print('LR complete')

# --- Support vector machines ---
# NOTE(review): the 'sigmoid' kernel is only evaluated on the reduced data,
# exactly as before -- confirm this asymmetry is intentional.
svm_runs = [
    (Xs, 'Scaled', 'S', ['linear', 'poly', 'rbf']),
    (Xr, 'Reduced', 'R', ['linear', 'poly', 'rbf', 'sigmoid']),
]
for Xd, dname, dtag, kernels in svm_runs:
    for kernel in kernels:
        res,prc,roc = qsar_cv(Xd,yr,100,'SVM',kernel,0,0)
        res['Data'] = dname
        metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
        prc2,roc2 = prc_roc(prc,roc,f'SVM.{dtag}.{kernel}')
        prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
        roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)
        print(kernel)
print('SVM complete')

# Append only this cell's rows. The PRC/ROC curves are exported once, by the
# final ANN cell.
metrics.iloc[n_met0:].to_csv('RS_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Random forest ===
# The CSV is opened in append mode; export only the rows added by this cell so
# rows written by earlier cells are not duplicated in the file.
n_met0 = len(metrics)

rf_n_estimators = [50, 100, 300]
rf_max_depth = [5, 10]
rf_grid = [(n_est, depth) for n_est in rf_n_estimators for depth in rf_max_depth]

for Xd, dname, dtag in [(Xs, 'Scaled', 'S'), (Xr, 'Reduced', 'R')]:
    for k, (n_est, depth) in enumerate(rf_grid):
        res,prc,roc = qsar_cv(Xd,yr,100,'RF',n_est,depth,0)
        res['Data'] = dname
        metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
        prc2,roc2 = prc_roc(prc,roc,f'RF.{dtag}.{n_est}.{depth}')
        prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
        roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)
        print(k)   # running index of the grid point within this dataset
print('RF complete')

# Append only this cell's rows. The PRC/ROC curves are exported once, by the
# final ANN cell.
metrics.iloc[n_met0:].to_csv('RS_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Gradient boosted trees ===
# The CSV is opened in append mode; export only the rows added by this cell so
# rows written by earlier cells are not duplicated in the file.
n_met0 = len(metrics)

gbt_max_depth = [3, 5, 10]
for Xd, dname, dtag in [(Xs, 'Scaled', 'S'), (Xr, 'Reduced', 'R')]:
    for depth in gbt_max_depth:
        res,prc,roc = qsar_cv(Xd,yr,100,'GBT',depth,0,0)
        res['Data'] = dname
        metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
        prc2,roc2 = prc_roc(prc,roc,f'GBT.{dtag}.{depth}')
        prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
        roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)
        print(depth)
print('GBT complete')

# Append only this cell's rows. The PRC/ROC curves are exported once, by the
# final ANN cell.
metrics.iloc[n_met0:].to_csv('RS_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Artificial neural networks (scaled descriptors) ===
# The CSV is opened in append mode; export only the rows added by this cell so
# rows written by earlier cells are not duplicated in the file.
n_met0 = len(metrics)

# --- Sweep the number of epochs at a fixed dropout of 0.2 ---
for epochs in [100, 200, 400]:
    res,prc,roc = qsar_ann_cv(Xs,y,5,10,epochs,0.2,0)
    res['Data'] = 'Scaled'
    metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
    prc2,roc2 = prc_roc(prc,roc,f'ANN.S.0.2.{epochs}')
    prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
    roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)

# --- Sweep the dropout rate at a fixed 400 epochs ---
# NOTE(review): dropout 0.2 / 400 epochs repeats the last configuration of the
# epoch sweep above; kept so the set of evaluated runs is unchanged.
for drop in [0, 0.1, 0.2]:
    res,prc,roc = qsar_ann_cv(Xs,y,5,10,400,drop,0)
    res['Data'] = 'Scaled'
    metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
    # These runs use the scaled matrix, so the curve label carries 'S'
    # (it was mislabelled 'ANN.R.', colliding with the reduced-data runs).
    prc2,roc2 = prc_roc(prc,roc,f'ANN.S.{drop}.400')
    prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
    roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)

# Append only this cell's rows. The PRC/ROC curves are exported once, by the
# final ANN cell.
metrics.iloc[n_met0:].to_csv('RS_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Artificial neural networks (PCA-reduced descriptors) ===
# The CSV is opened in append mode; export only the metric rows added by this
# cell so rows written by earlier cells are not duplicated in the file.
n_met0 = len(metrics)

# --- Sweep the number of epochs at a fixed dropout of 0.2 ---
for epochs in [100, 200, 400]:
    res,prc,roc = qsar_ann_cv(Xr,y,5,10,epochs,0.2,0)
    res['Data'] = 'Reduced'
    metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
    prc2,roc2 = prc_roc(prc,roc,f'ANN.R.0.2.{epochs}')
    prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
    roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)

# --- Sweep the dropout rate at a fixed 400 epochs ---
# NOTE(review): dropout 0.2 / 400 epochs repeats the last configuration of the
# epoch sweep above; kept so the set of evaluated runs is unchanged.
for drop in [0, 0.1, 0.2]:
    res,prc,roc = qsar_ann_cv(Xr,y,5,10,400,drop,0)
    res['Data'] = 'Reduced'
    metrics = pd.concat([metrics,pd.DataFrame(res,index=[0])],axis=0,ignore_index=True)
    prc2,roc2 = prc_roc(prc,roc,f'ANN.R.{drop}.400')
    prc_m = pd.concat([prc_m,prc2],axis=0,ignore_index=True)
    roc_m = pd.concat([roc_m,roc2],axis=0,ignore_index=True)

# Metrics: only this cell's rows. Curves: the full accumulated frames, since
# this is the only place they are written.
metrics.iloc[n_met0:].to_csv('RS_MULTI_MET.csv', mode='a', index=False, header=False)
prc_m.to_csv('RS_MULTI_PRC.csv', mode='a', index=False, header=False)
roc_m.to_csv('RS_MULTI_ROC.csv', mode='a', index=False, header=False)