In [1]:
from mol2vec import features
from mol2vec import helpers
import os
import pickle
import numpy as np
import pandas as pd
import sys

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec



In [5]:
# Pre-trained Mol2vec (Word2Vec) embedding model.
# NOTE(review): the model file is a pickle; only load it from a trusted source.
model = word2vec.Word2Vec.load('model_300dim.pkl')

# Read the structures into a DataFrame: molecule objects go in the 'ROMol'
# column and the record identifier in 'ID'. Records RDKit cannot sanitize
# are reported in the cell output.
df = PandasTools.LoadSDF(
    'structures.sdf',
    idName='ID',
    molColName='ROMol',
    includeFingerprints=False,
    isomericSmiles=True,
    smilesName=None,
    embedProps=False,
    removeHs=True,
    strictParsing=True,
)

RDKit ERROR: [21:44:28] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [21:44:28] ERROR: Could not sanitize molecule ending on line 21598
RDKit ERROR: [21:44:28] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [21:44:29] Explicit valence for atom # 13 Cl, 5, is greater than permitted
RDKit ERROR: [21:44:29] ERROR: Could not sanitize molecule ending on line 287346
RDKit ERROR: [21:44:29] ERROR: Explicit valence for atom # 13 Cl, 5, is greater than permitted
RDKit ERROR: [21:44:29] Explicit valence for atom # 39 N, 5, is greater than permitted
RDKit ERROR: [21:44:29] ERROR: Could not sanitize molecule ending on line 326226
RDKit ERROR: [21:44:29] ERROR: Explicit valence for atom # 39 N, 5, is greater than permitted
RDKit ERROR: [21:44:29] Explicit valence for atom # 70 O, 3, is greater than permitted
RDKit ERROR: [21:44:29] ERROR: Could not sanitize molecule ending on line 397138
RDKit ERROR: [21:44:29] ERROR: Explicit valence

In [6]:
def _row_to_sentence(row):
    """Wrap the substructure 'sentence' of one DataFrame row's molecule."""
    # Second argument (1) is passed through to mol2alt_sentence unchanged.
    return MolSentence(mol2alt_sentence(row['ROMol'], 1))


# One MolSentence per molecule, stored alongside the structure.
df['sentence'] = df.apply(_row_to_sentence, axis=1)

# Embed each sentence with the loaded model; unseen words map to 'UNK'.
sentence_vectors = sentences2vec(df['sentence'], model, unseen='UNK')
df['mol2vec'] = [DfVec(vector) for vector in sentence_vectors]

In [20]:
# Unwrap the DfVec objects and stack the raw vectors into one array,
# one row per molecule in the same order as `df`.
molecule_vectors = [wrapped.vec for wrapped in df['mol2vec']]
X_pre = np.array(molecule_vectors)

In [13]:
# Directory that holds the DDI input files. Defaults to the original location;
# set the DDI_DATA_DIR environment variable to run on another machine.
DATA_DIR = os.environ.get("DDI_DATA_DIR", "/Users/varun/MEGA/Python/Data")
# The relative file names below are resolved against DATA_DIR.
os.chdir(DATA_DIR)

#list of desired DDI Types
desired_DDI = [0, 1, 2, 3, 4, 5, 6, 7, 15, 16, 17, 18, 19, 20, 21, 22, 26, 28, 30, 31, 32, 38, 40, 41, 43, 44, 45,
               49, 50, 51, 52, 54, 55, 62, 67, 68, 72, 74, 76, 78, 79, 80, 81]

ddidata = pd.read_excel("DrugBank_known_ddi.xlsx")
interactiondict = pd.read_csv("interaction_information.csv")
safe_drugs = pd.read_csv("safe_drug_combos.csv")
drug_similarity_feature = pd.read_csv("drug_similarity.csv")
# Drop the first column and keep every remaining one. The upper bound used to
# be derived from the row count, which only covers all columns when the table
# is square.
drug_similarity = drug_similarity_feature.iloc[:, 1:]

#filter ddidata for desired DDI types
up_ddidata = ddidata[ddidata.Label.isin(desired_DDI)]

# Work on a copy so the filtered view of `ddidata` is not modified.
new_ddidata = up_ddidata.copy()
# Strip the first two characters of each drug id and store the rest as int
# (presumably a "DB" prefix -- confirm against the spreadsheet). Label is
# carried over unchanged by the copy.
new_ddidata["drug1"] = up_ddidata["drug1"].str[2:].astype(int)
new_ddidata["drug2"] = up_ddidata["drug2"].str[2:].astype(int)

In [14]:
# Mark every safe drug combination as DDI type 0 and stack those rows on top
# of the filtered interaction rows.
# NOTE(review): assumes safe_drugs uses the same integer drug1/drug2 ids as
# new_ddidata -- confirm against the CSV.
safe_drugs["Label"] = 0

frames = [safe_drugs, new_ddidata]
ddi_df = pd.concat(frames)

# Map the integer parsed from characters 2-6 of each similarity column name
# to that column's position.
DB_to_index = {
    int(column_name[2:7]): position
    for position, column_name in enumerate(drug_similarity.columns)
}

# Keep only pairs whose two drugs both have a similarity column.
drug1_known = ddi_df.drug1.isin(DB_to_index)
ddi_df_output = ddi_df[drug1_known]
drug2_known = ddi_df_output.drug2.isin(DB_to_index)
ddi_output = ddi_df_output[drug2_known]
ddi_df_output.tail()

Unnamed: 0,drug1,drug2,Label
190256,655,461,81
190257,1041,367,81
190258,1395,302,81
190259,302,1357,81
190260,480,655,81


In [15]:
# Number of drug pairs per DDI type 0..81; types that are not in
# desired_DDI keep a count of 0.
count = {ddi_type: 0 for ddi_type in range(82)}

for ddi_type in range(82):
    if ddi_type not in desired_DDI:
        continue
    label_matches = ddi_output.Label == ddi_type
    count[ddi_type] = int(label_matches.sum())

In [21]:
# Build the pair-level feature matrix: row k describes the k-th drug pair in
# ddi_output (same order as ddi_output.Label). Columns 0-299 hold drug1's
# vector and columns 300-599 hold drug2's.
#
# The previous loop wrote into the row numbered by the *drug* index instead
# of the pair's own row, and its 0:299 / 300:599 slices dropped the last
# component of each half.
#
# NOTE(review): X_pre rows follow the molecule order of the loaded SDF, while
# DB_to_index holds column positions of the similarity table. The lookup below
# is only right if both list the same drugs in the same order -- confirm,
# since the SDF load reported unparseable records.
EMBEDDING_DIM = 300

# A missing id raises KeyError rather than silently yielding a zero row.
drug1_rows = np.asarray([DB_to_index[drug_id] for drug_id in ddi_output["drug1"]], dtype=np.intp)
drug2_rows = np.asarray([DB_to_index[drug_id] for drug_id in ddi_output["drug2"]], dtype=np.intp)

X = np.zeros((len(ddi_output), 2 * EMBEDDING_DIM))
X[:, :EMBEDDING_DIM] = X_pre[drug1_rows, :EMBEDDING_DIM]
X[:, EMBEDDING_DIM:] = X_pre[drug2_rows, :EMBEDDING_DIM]

In [25]:
import tensorflow as tf
import sklearn.model_selection

# Labels, in the same row order as the feature matrix X.
y_data = np.array(ddi_output.Label)

# 60 % train; the remaining 40 % is split evenly into test and validation.
# Fixed random_state keeps the partition reproducible.
X_train, X_holdout, y_train, y_holdout = sklearn.model_selection.train_test_split(
    X, y_data, test_size=0.4, random_state=1
)
X_test, X_val, y_test, y_val = sklearn.model_selection.train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

BATCH_SIZE = 256
SHUFFLE_BUFFER_SIZE = 1024

# Only the training set is shuffled. All three sets are batched so each can
# be passed to the model; the validation set was previously left unbatched.
train_set = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
validation_set = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE)
test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE)

In [36]:
# Fully connected classifier over the 600-dimensional pair features, with one
# output unit per DDI type id (0..81).
# NOTE(review): this rebinds `model`, which earlier held the Word2Vec
# embedding model; re-running the mol2vec featurisation cell after this one
# will use the wrong object.
model = tf.keras.Sequential([
    # Only the first layer needs the input size; later layers infer theirs.
    tf.keras.layers.Dense(5000, activation='relu', input_dim=600),
    tf.keras.layers.Dense(5000, activation='relu'),
    tf.keras.layers.Dense(5000, activation='relu'),
    tf.keras.layers.Dense(5000, activation='relu'),
    # Each pair has exactly one label, so the output is a probability
    # distribution over classes: softmax, which is what
    # SparseCategoricalCrossentropy (from_logits=False) expects.
    tf.keras.layers.Dense(82, activation='softmax')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [None]:
# Train on the shuffled, batched training set for 50 epochs.
model.fit(x=train_set, epochs=50)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100