## Single-task DNN keras model training on RTECS

This notebook creates a single-task DNN classification model to predict toxicity (toxic / nontoxic) on the RTECS dataset, using 
 - same splits as pytorch MTDNN and STDNN 
 - same architecture as the pytorch STDNN on RTECS
 - FP (Morgan fingerprints) as input
 
This is the trained model used within the CEM explanations scripts (cem/cem_explanations). Explanations of this model's predictions are computed by the CEM. 

The model is trained on seed 122, matching the seed used for CEM explanations. 

With respect to model accuracy, our goal here is simply to construct a Keras model with the same architecture as the PyTorch STDNN, so that it can be explained by the CEM. 

#### Import statements

In [None]:
import os
import sys
from keras.models import model_from_json
from matplotlib import pyplot as plt
import numpy as np

from keras.callbacks import ModelCheckpoint
from rdkit.Chem.Draw import IPythonConsole

In [None]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is replaced by a write handle on ``os.devnull``.
    On exit the previous ``sys.stdout`` is restored and the devnull handle is
    closed. Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the devnull handle so __exit__ closes
        # exactly this file, even if the block rebinds sys.stdout itself.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if close() fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

# Quick check of HiddenPrints: output produced inside the block is discarded ...
with HiddenPrints():
    print("This will not be printed")

# ... and printing works normally again once the block has exited.
print("This will be printed as before")

In [None]:
# general and data handling
import numpy as np
import pandas as pd
import os
from collections import Counter

# Required RDKit modules
import rdkit as rd
from rdkit import DataStructs
from rdkit.Chem import AllChem
import rdkit.Chem.MCS

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

# modeling
import sklearn as sk
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Graphing
import matplotlib.pyplot as plt

# To set seed and device
import torch
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  
from torch.utils.data import Dataset, DataLoader
import time
import random
import joblib


##### Settings

In [None]:
# Number of bits (length) of each Morgan fingerprint bit vector
morgan_bits = 4096

In [None]:
# Radius used for the Morgan fingerprints
morgan_radius = 2

In [None]:
# Set the seed value and seed torch (CPU and CUDA), numpy and Python's `random`

seed_value = 122 # one of 122 / 123 / 124, as used in MoleculeNet
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)
# cuDNN is disabled and flagged as deterministic
torch.backends.cudnn.enabled=False
torch.backends.cudnn.deterministic=True
# NOTE(review): only torch, numpy and `random` are seeded in this cell; the
# Keras backend's own RNG is not seeded here - confirm whether that is needed
# for reproducible training of the Keras model below.

In [None]:
# Load raw data

a_oral_file = # cannot provide 
# Read the raw acute-oral table and order its rows by the 'year' column (ascending)
a_oral_data = pd.read_csv(a_oral_file).sort_values(by='year', ascending=1)

# Preview the first rows
a_oral_data.head()

In [None]:
# Prediction task(s): a binary label based on LD50 values,
# where 1 = toxic and 0 = nontoxic.
a_oral_tasks = ['toxic_a_oral']

print("Acute oral tasks: %s" % (a_oral_tasks,))
print("%d tasks in total" % (len(a_oral_tasks),))

In [None]:
# Name of the single label column modelled in this notebook
task = a_oral_tasks[0]

In [None]:
# First (and only) entry of a_oral_tasks, kept under a second name
task_label = a_oral_tasks[0]

In [None]:
# Raw input data, wrapped in a list
# NOTE(review): `data` is rebound to the train/test/valid splits in a later cell
data = [a_oral_data]

In [None]:
# All tasks considered; here this is just the acute-oral task list
all_tasks = a_oral_tasks

##### Load split data and compute FP (Morgan fingerprints)

In [None]:
# load saved RTECS train/test/valid data 
data_path = # cannot provide 
# Load the three saved splits, in train / test / valid order.
# NOTE(review): torch.load unpickles its input - only point data_path at trusted files.
train_data, test_data, valid_data = [
    torch.load(data_path + split_file)
    for split_file in ('train_data_rtecs.pth', 'test_data_rtecs.pth', 'valid_data_rtecs.pth')
]

In [None]:
# Collect the splits so they can be processed in one loop (order: train, test, valid)
data = [train_data, test_data, valid_data]

In [None]:
%%time
# Construct Morgan fingerprints for every split in `data`.
# Each split gains three columns:
#   'mol'     - RDKit molecule parsed from the 'SMILES' column
#   'morgan'  - Morgan fingerprint bit vector (morgan_radius, morgan_bits)
#   'bitInfo' - per-molecule dict passed to RDKit through `bitInfo=`
for split in data:
    mols = [rd.Chem.MolFromSmiles(smiles) for smiles in split['SMILES']]

    # MolFromSmiles returns None for SMILES it cannot parse; fail with a clear
    # message here instead of a cryptic error inside the fingerprint call.
    n_invalid = sum(mol is None for mol in mols)
    if n_invalid:
        raise ValueError("%d SMILES could not be parsed by RDKit" % n_invalid)
    split['mol'] = mols

    bit_infos = [{} for _ in mols]
    # Iterate over the molecules directly: going through split.iloc[j]['mol']
    # materialises a whole row for every molecule and is needlessly slow.
    split['morgan'] = [
        AllChem.GetMorganFingerprintAsBitVect(mol, morgan_radius, nBits=morgan_bits, bitInfo=bit_info)
        for mol, bit_info in zip(mols, bit_infos)
    ]
    split['bitInfo'] = bit_infos


##### Create training and test set

In [None]:
# Missing values are encoded as -1 in every split -- the convention used to
# handle missing labels together with the Binary Cross-Entropy loss.
data[:] = [split.fillna(-1) for split in data]

# Re-expose the filled frames under their split names
train_data, test_data, valid_data = data

In [None]:
## Create arrays for train / test / valid sets used for DNN 

In [None]:
# Turn each RDKit fingerprint of the training split into a numpy row;
# ConvertToNumpyArray fills the array it is handed in place.
train_rows = []
for bitvect in train_data['morgan']:
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    train_rows.append(dense)
x_train = np.array(train_rows)

# Matching labels for the selected task
y_train = train_data[task].values

In [None]:
# Turn each RDKit fingerprint of the test split into a numpy row;
# ConvertToNumpyArray fills the array it is handed in place.
test_rows = []
for bitvect in test_data['morgan']:
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    test_rows.append(dense)
x_test = np.array(test_rows)

# Matching labels for the selected task
y_test = test_data[task].values

In [None]:
# Turn each RDKit fingerprint of the validation split into a numpy row;
# ConvertToNumpyArray fills the array it is handed in place.
valid_rows = []
for bitvect in valid_data['morgan']:
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    valid_rows.append(dense)
x_valid = np.array(valid_rows)

# Matching labels for the selected task
y_valid = valid_data[task].values

##### Deep Neural Network

In [None]:
import keras

from keras.layers import Input, Dense, Activation, LeakyReLU
from keras.models import Model, Sequential

from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [None]:
# Number of input features = length of one fingerprint row of x_train
input_shape = x_train.shape[1]
input_shape

In [None]:
# Feed-forward classifier: three ReLU hidden layers (2048 -> 1024 -> 512)
# followed by a two-unit softmax output.
deepnn = Sequential()

# Hidden layers
deepnn.add(Dense(2048, input_shape=(input_shape,)))
deepnn.add(Activation('relu'))
deepnn.add(Dense(1024))
deepnn.add(Activation('relu'))
deepnn.add(Dense(512))
deepnn.add(Activation('relu'))

# Output layer: one unit per class
deepnn.add(Dense(2))
deepnn.add(Activation('softmax'))

In [None]:
# Adam optimizer with binary cross-entropy loss on the two-unit softmax output
# NOTE(review): categorical_crossentropy is the more conventional pairing for a
# softmax output - confirm this choice is intentional.
deepnn.compile(optimizer='adam', loss='binary_crossentropy')

##### Train neural network

In [None]:
# One-hot encode the train / valid labels to match the two-unit softmax output.
# Labels must be exactly 0 or 1: the -1 placeholder used for missing values is
# not a valid class index, so fail loudly rather than train on corrupted targets.
for _split_name, _labels in (('train', y_train), ('valid', y_valid)):
    if not np.isin(_labels, (0, 1)).all():
        raise ValueError(
            "%s labels for task %r contain values other than 0/1" % (_split_name, task)
        )

# num_classes is pinned to 2 so both arrays always have two columns, even if a
# split happens to contain only one class.
y_train_nn = to_categorical(y_train, num_classes=2)

y_valid_nn = to_categorical(y_valid, num_classes=2)

In [None]:
import os
# specify path of saved trained model
filepath= #"path/checkpoint.hdf5"
# Create the checkpoint directory when the path has one; a bare filename has an
# empty dirname, which os.makedirs would reject.
checkpoint_dir = os.path.dirname(filepath)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# saves model with the lowest validation loss
# The deprecated `period=1` argument is dropped: checking every epoch is already
# the default, and newer Keras releases no longer accept `period`.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True,
                             mode='min')

In [None]:
%%time
# Fit deepnn on the training fingerprints; the validation split drives the
# checkpoint callback.
history = deepnn.fit(
    x=x_train,
    y=y_train_nn,
    batch_size=512,
    epochs=50,
    shuffle=True,
    validation_data=(x_valid, y_valid_nn),
    callbacks=[checkpoint],
    verbose=1,
)

##### Load trained model from file

In [None]:
# Reload the checkpoint written during training (best epoch by minimum validation loss)
deepnn.load_weights(filepath)

##### See Test set performance


In [None]:
# Predict on the test set; keep column 1 of the two-column softmax output,
# i.e. the score for class 1 (toxic)
y_test_pred = deepnn.predict(x_test)[:,1]

In [None]:
# Threshold-free test metric: area under the ROC curve
auc = roc_auc_score(y_test, y_test_pred)
print('Test ROC AUC:', auc)

# ROC curve (blue) drawn against the chance diagonal (dashed red)
fpr, tpr, threshold = sk.metrics.roc_curve(y_test, y_test_pred)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC')
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Round the class-1 scores to hard 0/1 labels before scoring accuracy
y_test_hard = np.round(y_test_pred)
acc = accuracy_score(y_test, y_test_hard)
print('Accuracy for deepnn on Morgan Fingerprint:', acc)

In [None]:
# F1 of the positive class (label 1) on the rounded predictions
y_test_hard = np.round(y_test_pred)
f1 = f1_score(y_test, y_test_hard, pos_label=1)
print('F1 for deepnn on Morgan Fingerprint:', f1)

In [None]:
# Confusion matrix of true labels vs. rounded predictions, then normalised
# by hand so that each true-class row sums to 1
# (sklearn's own `normalize='true'` option is left unused here).
cfm_counts = sk.metrics.confusion_matrix(y_test, np.round(y_test_pred))
cfm = cfm_counts.astype('float') / cfm_counts.sum(axis=1, keepdims=True)

In [None]:
# Unpack the 2x2 matrix row by row: first row is the negative class, second
# the positive class. cfm was row-normalised, so these are rates, not counts.
(tn, fp), (fn, tp) = cfm

for caption, value in (
    (' True Positive:', tp),
    (' True Negative:', tn),
    ('False Positive:', fp),
    ('False Negative:', fn),
):
    print(caption, value)