In [1]:
# === Basic libraries ===
import numpy as np
import pandas as pd

# === Molecular descriptor calculation ===
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# === Cross validation and model evaluation ===
from sklearn import preprocessing as pp
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import accuracy_score

# === Neural network construction and training ===
import tensorflow as tf
import keras
from keras import Model, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.data import Iterator

In [2]:
# === Load 200 descriptors and calculator ===
chosen_descriptors = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed']
mol_descriptor_calculator = MolecularDescriptorCalculator(chosen_descriptors)

In [3]:
# === Read in files with X,Y ===
CDS1_XY = pd.read_csv('Documents/CDS1_XY2.csv')
clist = list(CDS1_XY['SMILES'])
y = np.array(CDS1_XY['Inhib'])
y = np.reshape(y,(-1,1))
X = np.zeros(shape=(len(clist),200))

# === Calculate descriptors ===
for i in range(len(clist)):
    mol = Chem.MolFromSmiles(clist[i])
    X[i,:] = mol_descriptor_calculator.CalcDescriptors(mol)
scaler = pp.StandardScaler().fit(X)
Xs = scaler.transform(X)

In [8]:
# === Set up cross validation with training and testing datasets ===
def make_dataset(Xdat,ydat,n_spl):
    def gen():
        for trn_i, tst_i in KFold(n_spl).split(Xdat):
            X_trn, X_tst = Xdat[trn_i], Xdat[tst_i]
            y_trn, y_tst = ydat[trn_i], ydat[tst_i]
            yield X_trn, y_trn, X_tst, y_tst
            
    return tf.data.Dataset.from_generator(gen, (tf.float64,tf.float64,tf.float64,tf.float64))
dataset=make_dataset(Xs,y,5)

# === Define model ===
def create_model():
    model = Sequential()
    model.add(tf.keras.Input(shape=(200,)))
    model.add(Dropout(.2,input_shape=(200,)))
    model.add(Dense(50,activation='relu',name='hl_2',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.add(Dense(25,activation='relu',name='hl_3',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.add(Dense(1,activation='linear',name='l_o',kernel_regularizer=tf.keras.regularizers.L1(0)))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
    return model

In [9]:
# === Set up number of splits and random seeds ===
nspl = 5
nsed = 10

# === Accumulate accuracy ===
acc_trn = np.zeros(nspl*nsed)
acc_tst = np.zeros(nspl*nsed)
act = 0

# === K-fold cross validation ===
kfold = KFold(n_splits=nspl, shuffle=True, random_state=42)
for rs in range(nsed):
    tf.keras.utils.set_random_seed(rs)
    for trn_i, tst_i in kfold.split(Xs):
    
        # === Split data ===
        X_trn, X_tst = X[trn_i], X[tst_i]
        y_trn, y_tst = y[trn_i], y[tst_i]

        # === Create and train model ===
        model = create_model()
        model.fit(X_trn, y_trn, epochs=800, verbose=0)

        # === Make predictions ===
        yh_trn = model.predict(X_trn)
        yh_tst = model.predict(X_tst)
        yh_trn = (yh_trn >= 0.5).astype(int)
        yh_tst = (yh_tst >= 0.5).astype(int)
        yc_trn = tf.constant(y_trn)
        yc_tst = tf.constant(y_tst)

        # === Calculate accuracy ===
        acc_trn[act] = accuracy_score(y_trn, yh_trn)
        acc_tst[act] = accuracy_score(y_tst, yh_tst)
        act += 1
    print(f'Completed seed {rs}.')

# === Report results ===
m_acc_trn = np.mean(acc_trn)
m_acc_tst = np.mean(acc_tst)
print(f'Average Training Accuracy: {m_acc_trn}')
print(f'Average Cross-Val Accuracy: {m_acc_tst}')

Completed seed 0.
Completed seed 1.
Completed seed 2.
Completed seed 3.
Completed seed 4.
Completed seed 5.
Completed seed 6.
Completed seed 7.
Completed seed 8.
Completed seed 9.
Average Training Accuracy: 0.5665546218487395
Average Cross-Val Accuracy: 0.5563888888888888
