In [1]:
# === Basic libraries ===
import numpy as np
import pandas as pd
import time

# === Feature generation ===
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn import preprocessing as pp

# === Classifiers ===
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, cv, DMatrix

# === Metrics and cross-validation ====
from sklearn import metrics as met
from sklearn.model_selection import cross_val_predict, LeaveOneOut, KFold

# === Neural networks ===
import tensorflow as tf
from keras import Model, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.data import Iterator

In [2]:
# === Load 200 descriptors and calculator ===
# Names of the RDKit descriptors used as model features. The order of this
# list is the column order of the descriptor vector returned by the calculator
# built from it below. (The feature matrix in the next cell is allocated with
# 200 columns, so this list is expected to hold exactly 200 names.)
chosen_descriptors = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 
'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed']
# Calculator that evaluates the descriptors named above on an RDKit molecule.
mol_descriptor_calculator = MolecularDescriptorCalculator(chosen_descriptors)

In [3]:
# === Read in files with X,Y ===
# LSI_XY.csv is read for two columns: 'SMILES' (molecule strings) and
# 'Inhib' (the label that the classifiers below are trained on).
LSI_XY = pd.read_csv('LSI_XY.csv')
clist_trn = list(LSI_XY['SMILES'])
# Flat 1-D label vector (the previous reshape-to-column-then-ravel was a no-op).
y = np.array(LSI_XY['Inhib']).ravel()
# Size the feature matrix from the descriptor list rather than a literal 200,
# so editing chosen_descriptors cannot silently desynchronise the two.
X = np.zeros(shape=(len(clist_trn), len(chosen_descriptors)))

# === Calculate descriptors ===
for i, smiles in enumerate(clist_trn):
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending row instead of deep inside the descriptor calculator.
    if mol is None:
        raise ValueError(f'Row {i}: could not parse SMILES {smiles!r}')
    X[i, :] = mol_descriptor_calculator.CalcDescriptors(mol)

# Rescale every descriptor column to [0, 1]; the models below are fitted on Xs.
scaler = pp.MinMaxScaler().fit(X)
Xs = scaler.transform(X)

In [4]:
# === Define creation of artificial neural network ===
def create_ann(n_features=200, dropout_rate=0.2, l1=0.0, learning_rate=1e-4):
    """Build and compile a small feed-forward binary classifier.

    Architecture: input -> dropout -> Dense(50, relu) -> Dense(25, relu)
    -> Dense(1, linear). The output layer is linear, so the network emits
    raw logits; the loss is configured with ``from_logits=True`` to match.
    Callers must therefore apply a sigmoid (or threshold at 0) themselves
    before interpreting outputs as class probabilities / labels.

    Parameters
    ----------
    n_features : int, default 200
        Width of the input vector.
    dropout_rate : float, default 0.2
        Dropout rate applied directly to the inputs.
    l1 : float, default 0.0
        L1 penalty on every Dense kernel (0 disables the penalty).
    learning_rate : float, default 1e-4
        Learning rate of the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=(n_features,)))
    # The explicit Input layer above already fixes the input shape, so the
    # Dropout layer no longer repeats it through `input_shape`.
    model.add(Dropout(dropout_rate))
    model.add(Dense(50, activation='relu', name='hl_2',
                    kernel_regularizer=tf.keras.regularizers.L1(l1)))
    model.add(Dense(25, activation='relu', name='hl_3',
                    kernel_regularizer=tf.keras.regularizers.L1(l1)))
    model.add(Dense(1, activation='linear', name='l_o',
                    kernel_regularizer=tf.keras.regularizers.L1(l1)))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
    return model

In [5]:
# === Cross validation function ===
def qsar_cv(X,y,n,m,p1,p2,p3):
    """Repeat cross-validated prediction ``n`` times and average the scores.

    Parameters
    ----------
    X, y : feature matrix and label vector passed to ``cross_val_predict``.
    n : int
        Number of repetitions; repetition ``i`` seeds the estimator with ``i``.
    m : str
        Model family: ``'LogReg'``, ``'SVM'``, ``'RF'`` or ``'GBT'``.
    p1 : first hyper-parameter (LogReg solver, SVM kernel, RF number of
        trees, GBT maximum depth).
    p2 : second hyper-parameter (RF maximum depth; unused otherwise).
    p3 : not used by any model; only echoed back in the result.

    Returns
    -------
    dict with keys 'Model', 'Param_1', 'Param_2', 'Param_3', 'Accuracy',
    'Precision', 'Recall', 'F1_Score', 'ROC_AUC' (means over the ``n``
    repetitions) and 'Time' (elapsed seconds for all repetitions).

    Raises
    ------
    ValueError
        If ``m`` is not one of the supported model names.
    """
    # One factory per model family. The repetition index is passed as the
    # integer seed: the earlier `random_state=np.random.seed(i)` always passed
    # None (np.random.seed returns None) and reseeded the global NumPy RNG.
    builders = {
        'LogReg': lambda seed: LogisticRegression(solver=p1, random_state=seed),
        'SVM': lambda seed: svm.SVC(kernel=p1, random_state=seed),
        'RF': lambda seed: RandomForestClassifier(n_estimators=p1, max_depth=p2,
                                                  random_state=seed),
        'GBT': lambda seed: XGBClassifier(max_depth=p1, seed=seed),
    }
    if m not in builders:
        # Previously an unknown name fell through every `if` and crashed on an
        # unbound `model` variable.
        raise ValueError(f'Unknown model {m!r}; expected one of {sorted(builders)}')
    build_model = builders[m]

    acc, pre, rec, f1s, roc = (np.zeros(n) for _ in range(5))
    t_sta = time.perf_counter()

    for i in range(n):
        model = build_model(i)
        # NOTE(review): no `cv` argument is passed, so fold assignment comes
        # from scikit-learn's default splitter; only the estimator seed
        # changes between repetitions. Confirm this is the intended source of
        # variation.
        pred = cross_val_predict(model, X, y, method='predict')

        acc[i] = met.accuracy_score(y, pred)
        pre[i] = met.precision_score(y, pred)
        rec[i] = met.recall_score(y, pred)
        f1s[i] = met.f1_score(y, pred)
        # NOTE(review): ROC AUC is computed from hard class predictions, not
        # from continuous scores, so it reflects a single operating point.
        roc[i] = met.roc_auc_score(y, pred)

    t_ela = time.perf_counter() - t_sta

    ret = {'Model':m, 'Param_1':p1, 'Param_2':p2, 'Param_3':p3,
           'Accuracy':np.mean(acc), 'Precision':np.mean(pre),
           'Recall':np.mean(rec), 'F1_Score':np.mean(f1s),
           'ROC_AUC':np.mean(roc), 'Time':t_ela}
    return ret

In [6]:
# Empty results table: one row per (model, hyper-parameter) setting is
# appended to it by the sweep cells that follow.
metrics = pd.DataFrame(
    columns=[
        'Model',
        'Param_1',
        'Param_2',
        'Param_3',
        'Accuracy',
        'Precision',
        'Recall',
        'F1_Score',
        'ROC_AUC',
        'Time',
        'Data',
    ]
)

In [None]:
# === Logistic-regression and SVM sweeps on the scaled descriptors ===
# Rows are collected in a list and turned into a frame once, instead of
# growing `metrics` with a concat per iteration.
lr_svm_rows = []

for solver in ['lbfgs', 'liblinear', 'newton-cg']:
    row = qsar_cv(Xs, y, 100, 'LogReg', solver, 0, 0)
    row['Data'] = 'Scaled'
    lr_svm_rows.append(row)
print('LR complete')

for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    row = qsar_cv(Xs, y, 100, 'SVM', kernel, 0, 0)
    row['Data'] = 'Scaled'
    lr_svm_rows.append(row)
print('SVM complete')

lr_svm_metrics = pd.DataFrame(lr_svm_rows, columns=metrics.columns)
metrics = pd.concat([metrics, lr_svm_metrics], axis=0, ignore_index=True)

# Append only the rows produced by this cell. Writing the whole cumulative
# `metrics` frame in append mode re-wrote every earlier row each time a sweep
# cell ran, duplicating results in the CSV.
lr_svm_metrics.to_csv('LSI_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Random-forest sweep: number of trees x maximum depth ===
rf_rows = []
run = 0
for n_trees in [50, 100, 300]:
    for depth in [5, 10]:
        row = qsar_cv(Xs, y, 100, 'RF', n_trees, depth, 0)
        row['Data'] = 'Scaled'
        rf_rows.append(row)
        # Progress indicator: index of the grid point just finished.
        print(run)
        run += 1
print('RF complete')

rf_metrics = pd.DataFrame(rf_rows, columns=metrics.columns)
metrics = pd.concat([metrics, rf_metrics], axis=0, ignore_index=True)

# Append only this cell's rows; appending the full cumulative `metrics` frame
# would write the rows of earlier sweeps to the CSV a second time.
rf_metrics.to_csv('LSI_MULTI_MET.csv', mode='a', index=False, header=False)

In [None]:
# === Gradient-boosted-trees sweep over maximum depth ===
gbt_rows = []
for depth in [3, 5, 10]:
    row = qsar_cv(Xs, y, 100, 'GBT', depth, 0, 0)
    row['Data'] = 'Scaled'
    gbt_rows.append(row)
print('GBT complete')

gbt_metrics = pd.DataFrame(gbt_rows, columns=metrics.columns)
metrics = pd.concat([metrics, gbt_metrics], axis=0, ignore_index=True)

# Append only this cell's rows; appending the full cumulative `metrics` frame
# would write the rows of earlier sweeps to the CSV a second time.
gbt_metrics.to_csv('LSI_MULTI_MET.csv', mode='a', index=False, header=False)

In [7]:
def qsar_ann_cv(X, y, nspl, nsed, p1, p2, p3):
    """K-fold cross-validate the ANN from ``create_ann`` over several seeds.

    For each seed ``rs`` in ``range(nsed)`` the Keras/NumPy/Python RNGs are
    seeded with ``rs`` and a fresh network is trained and scored on each of
    the ``nspl`` folds, giving ``nspl * nsed`` scored runs in total.

    Parameters
    ----------
    X, y : NumPy feature matrix and label vector (indexed with fold indices).
    nspl : int
        Number of folds.
    nsed : int
        Number of random seeds.
    p1 : int
        Number of training epochs.
    p2, p3 : not used by the network; only echoed back in the result.

    Returns
    -------
    dict with keys 'Model' (always 'ANN'), 'Param_1', 'Param_2', 'Param_3',
    'Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC' (means over all
    runs) and 'Time' (elapsed seconds).
    """
    # === Accumulate scores, one slot per (seed, fold) run ===
    n_runs = nspl * nsed
    acc, pre, rec, f1s, roc = (np.zeros(n_runs) for _ in range(5))
    run = 0

    # === K-fold cross validation ===
    t_sta = time.perf_counter()
    # The splitter is built once with a fixed random_state, independent of the
    # per-seed RNG state set below.
    kfold = KFold(n_splits=nspl, shuffle=True, random_state=42)
    for rs in range(nsed):
        tf.keras.utils.set_random_seed(rs)
        for trn_i, tst_i in kfold.split(X):

            # === Split data ===
            X_trn, X_tst = X[trn_i], X[tst_i]
            y_trn, y_tst = y[trn_i], y[tst_i]

            # === Create and train model ===
            model = create_ann()
            model.fit(X_trn, y_trn, epochs=p1, verbose=0)

            # === Make predictions ===
            # The network's output layer is linear and it is trained with
            # from_logits=True, so predict() returns logits. A probability of
            # 0.5 corresponds to a logit of 0; thresholding the logit at 0.5
            # (as before) demanded roughly 62% confidence and under-predicted
            # the positive class.
            logits = np.asarray(model.predict(X_tst, verbose=0)).ravel()
            yh_tst = (logits >= 0.0).astype(int)

            acc[run] = met.accuracy_score(y_tst, yh_tst)
            pre[run] = met.precision_score(y_tst, yh_tst)
            rec[run] = met.recall_score(y_tst, yh_tst)
            f1s[run] = met.f1_score(y_tst, yh_tst)
            # NOTE(review): ROC AUC is computed from hard labels here, matching
            # how the other models in this notebook are scored.
            roc[run] = met.roc_auc_score(y_tst, yh_tst)

            run += 1
        print(f'Completed seed {rs}.')

    t_ela = time.perf_counter() - t_sta

    ret = {'Model':'ANN', 'Param_1':p1, 'Param_2':p2, 'Param_3':p3,
           'Accuracy':np.mean(acc), 'Precision':np.mean(pre),
           'Recall':np.mean(rec), 'F1_Score':np.mean(f1s),
           'ROC_AUC':np.mean(roc), 'Time':t_ela}
    return ret

In [9]:
# === ANN sweep over the number of training epochs ===
ann_rows = []
for n_epochs in [100, 200, 400]:
    row = qsar_ann_cv(Xs, y, 5, 10, n_epochs, 0, 0)
    row['Data'] = 'Scaled'
    # Keep every setting: previously the concat sat outside the loop, so only
    # the last epoch setting was ever added to `metrics`.
    ann_rows.append(row)

ann_metrics = pd.DataFrame(ann_rows, columns=metrics.columns)
# Update the in-memory table before touching the file, so the results survive
# a failed write (e.g. the PermissionError raised when the CSV cannot be
# opened for appending).
metrics = pd.concat([metrics, ann_metrics], axis=0, ignore_index=True)

# Append only this cell's rows; appending the full cumulative `metrics` frame
# would write the rows of earlier sweeps to the CSV a second time.
ann_metrics.to_csv('LSI_MULTI_MET.csv', mode='a', index=False, header=False)

Completed seed 0.
Completed seed 1.
Completed seed 2.


  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 3.


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 4.


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 5.


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 6.


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 7.


  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 8.


  _warn_prf(average, modifier, msg_start, len(result))


Completed seed 9.
Completed seed 0.
Completed seed 1.
Completed seed 2.
Completed seed 3.
Completed seed 4.
Completed seed 5.
Completed seed 6.
Completed seed 7.
Completed seed 8.
Completed seed 9.
Completed seed 0.
Completed seed 1.
Completed seed 2.
Completed seed 3.
Completed seed 4.
Completed seed 5.
Completed seed 6.
Completed seed 7.
Completed seed 8.
Completed seed 9.


PermissionError: [Errno 13] Permission denied: 'LSI_MULTI_MET.csv'