## Single-task DNN keras model training on Tox21

This notebook creates a single-task DNN classification model to predict toxicity (toxic / nontoxic) on the Tox21 dataset, using 
 - same splits as pytorch MTDNN and STDNN 
 - same architecture as the pytorch STDNN on RTECS
 - FP (Morgan fingerprints) as input
 
A separate DNN is created for each of the 12 endpoints (tasks) in Tox21. 
 
This is the trained model used within the CEM explanation scripts (cem/cem_explanations). Explanations of this model's predictions are computed by the CEM. 

The model is trained on seed 122, matching the seed used for CEM explanations. 

Our goal here, with respect to model accuracy, is to construct a keras model with the same architecture as the pytorch STDNN that can be explained by the CEM. 

#### Import statements

In [None]:
import os
import sys
from keras.models import model_from_json
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from keras.callbacks import ModelCheckpoint
from rdkit.Chem.Draw import IPythonConsole

In [None]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` by pointing ``sys.stdout`` at ``os.devnull``.

    On entry the current ``sys.stdout`` is remembered and replaced by a
    write-only handle on ``os.devnull``; on exit the remembered stream is put
    back and the devnull handle is closed.

    The devnull handle is kept on the instance so that ``__exit__`` closes
    exactly the file this object opened, even if code inside the ``with``
    block reassigned ``sys.stdout`` in the meantime.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if close() raises.
        sys.stdout = self._original_stdout
        self._devnull.close()


# Smoke test: the first print must be swallowed, the second must be visible.
with HiddenPrints():
    print("This will not be printed")

print("HiddenPrints() successful if nothing printed before this line.")

##### This notebook experiments with classification on Tox21 data, using the data from the ngramgraph paper

In [None]:
# general and data handling
import numpy as np
import pandas as pd
import os
from collections import Counter

# Required RDKit modules
import rdkit as rd
from rdkit import DataStructs
from rdkit.Chem import AllChem
import rdkit.Chem.MCS

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

# modeling
import sklearn as sk
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Graphing
import matplotlib.pyplot as plt

import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
from torch.utils.data import Dataset, DataLoader
import time
import random
import joblib


##### Settings

In [None]:
# Seed used for this run (122 / 123 / 124, as used in MoleculeNet).
seed_value = 122

# Python and NumPy random number generators.
random.seed(seed_value)
np.random.seed(seed_value)

# PyTorch CPU and CUDA generators.
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)

# cuDNN: request deterministic algorithms and switch the backend off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False

# NOTE(review): only Python, NumPy and PyTorch are seeded in this cell; the
# Keras/TensorFlow model trained below is not seeded here — confirm whether
# that is intended.

In [None]:
# Number of bits in each Morgan fingerprint bit vector.
# The same value is used further down as the input width of the network.
morgan_bits = 4096

In [None]:
# Radius passed to the Morgan fingerprint generator.
morgan_radius = 2

In [None]:
# Mini-batch size passed to the model's fit call.
batch = 512

In [None]:
# Number of epochs passed to the model's fit call.
train_epoch = 50

##### Load data

In [None]:
# Location of the raw Tox21 table.
tox21_file = '../../../data/datasets/tox21/raw_data/tox21.csv'

# The 12 Tox21 endpoints; these names are used later as label column names.
tox21_tasks = [
    'NR-AR', 'NR-Aromatase', 'NR-PPAR-gamma', 'SR-HSE',
    'NR-AR-LBD', 'NR-ER', 'SR-ARE', 'SR-MMP',
    'NR-AhR', 'NR-ER-LBD', 'SR-ATAD5', 'SR-p53',
]

# Read the CSV and report how many rows were loaded.
tox21_data = pd.read_csv(tox21_file)
n_loaded = len(tox21_data)
print('Reading {}... {} data loaded.'.format(tox21_file, n_loaded))

In [None]:
# Wrap the raw frame in a list.
# NOTE(review): `data` is reassigned to the train/test/valid splits further
# down before it is read, so this list looks unused — confirm before removing.
data = [tox21_data]

In [None]:
# Task names used below to select label columns, name the model outputs,
# and key the evaluation results.
all_tasks = tox21_tasks

##### Load split data and compute FP (Morgan fingerprints)

In [None]:
# Load the saved Tox21 train / test / valid splits for seed 122.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# split files that come from a trusted source.
data_path = "../../../data/datasets/tox21/split_data/seed_122/"
split_files = ['train_data_tox21.pth', 'test_data_tox21.pth', 'valid_data_tox21.pth']
train_data, test_data, valid_data = [torch.load(data_path + fname) for fname in split_files]

# Keep the three splits together, in train / test / valid order.
data = [train_data, test_data, valid_data]

In [None]:
%%time
# Construct Morgan fingerprints for every split (train / test / valid).
# Each split gains three columns:
#   'mol'     - RDKit molecule parsed from the 'smiles' column
#   'morgan'  - Morgan fingerprint bit vector (morgan_radius, morgan_bits)
#   'bitInfo' - per-molecule dict filled in by RDKit through `bitInfo=`
for split_df in data:
    smiles_list = list(split_df['smiles'])
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

    # MolFromSmiles returns None when a SMILES string cannot be parsed;
    # fail with a readable message instead of an opaque RDKit argument error.
    unparsed = [smi for smi, mol in zip(smiles_list, mols) if mol is None]
    if unparsed:
        raise ValueError('Could not parse {} SMILES string(s), e.g. {!r}'.format(len(unparsed), unparsed[0]))

    # One fresh dict per molecule; RDKit fills it while fingerprinting.
    bi = [{} for _ in mols]

    split_df['mol'] = mols
    # Iterate the molecules directly instead of looking up each row with
    # .iloc[j]['mol'], which builds a full row Series per molecule.
    split_df['morgan'] = [AllChem.GetMorganFingerprintAsBitVect(mol, morgan_radius, nBits=morgan_bits, bitInfo=info)
                          for mol, info in zip(mols, bi)]
    split_df['bitInfo'] = bi


##### Create training and test set

In [None]:
# Encode missing values (NaN) as -1 in every split.
# This lets the label columns be cast to int later on; the evaluation cells
# skip entries whose label is < 0.
data[:] = [split.fillna(-1) for split in data]

# Re-bind the split names to the filled frames (train / test / valid order).
train_data, test_data, valid_data = data

In [None]:
## Create arrays for train / test / valid sets used for DNN 

In [None]:
# Expand each training-set RDKit bit vector into a dense numpy row.
train_rows = []
for bitvect in train_data['morgan']:
    dense = np.zeros((1,))  # placeholder buffer handed to RDKit to fill in
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    train_rows.append(dense)
x_train = np.array(train_rows)

# Integer label matrix: one column per task in `all_tasks`.
train_labels = train_data[all_tasks]
y_train = train_labels.astype('int').values

In [None]:
# Expand each test-set RDKit bit vector into a dense numpy row.
test_rows = []
for bitvect in test_data['morgan']:
    dense = np.zeros((1,))  # placeholder buffer handed to RDKit to fill in
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    test_rows.append(dense)
x_test = np.array(test_rows)

# Integer label matrix: one column per task in `all_tasks`.
test_labels = test_data[all_tasks]
y_test = test_labels.astype('int').values

In [None]:
# Expand each validation-set RDKit bit vector into a dense numpy row.
valid_rows = []
for bitvect in valid_data['morgan']:
    dense = np.zeros((1,))  # placeholder buffer handed to RDKit to fill in
    DataStructs.ConvertToNumpyArray(bitvect, dense)
    valid_rows.append(dense)
x_valid = np.array(valid_rows)

# Integer label matrix: one column per task in `all_tasks`.
valid_labels = valid_data[all_tasks]
y_valid = valid_labels.astype('int').values

##### Deep Neural Network

In [None]:
import tensorflow as tf
import keras

from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from keras.models import Model, Sequential
from keras.layers import LeakyReLU

from keras.utils import to_categorical

In [None]:
# Width of the network input: one feature per Morgan fingerprint bit.
input_shape = morgan_bits
# Bare expression so the notebook displays the value as the cell output.
input_shape

In [None]:
# One independent tower per endpoint (task); the towers share only the
# fingerprint input layer.

def _task_tower(fp_input, task_name):
    """Build the layer stack for one task and return its softmax output tensor.

    Two blocks of Dense -> BatchNormalization -> LeakyReLU(alpha=0.05) with
    1024 and 512 units, followed by a 2-unit softmax layer named after the task.
    """
    tower = fp_input
    for width in (1024, 512):
        tower = Dense(width)(tower)
        tower = BatchNormalization()(tower)
        tower = LeakyReLU(alpha=0.05)(tower)
    return Dense(2, activation='softmax', name=task_name)(tower)


input_fp = Input(shape=(input_shape,))

# Output tensors, in the same order as `all_tasks`.
task_output = [_task_tower(input_fp, task) for task in all_tasks]

deepnn = Model(input_fp, task_output)

In [None]:
# Adam optimizer, binary cross-entropy on every task output, accuracy as metric.
bce_loss = tf.keras.losses.BinaryCrossentropy()
deepnn.compile(
    optimizer='adam',
    loss=bce_loss,
    metrics=['accuracy'],
)

In [None]:
# Change the format of y_train (train set labels) to match the model:
# one one-hot encoded label array per task, in the order of `all_tasks`.
# NOTE(review): y_train holds -1 for missing labels. to_categorical appears to
# index the one-hot column by the label value, so -1 would land in the last
# column (class 1) — confirm, and make sure such rows are masked in training.
y_train_nn = [to_categorical(y_train[:,i]) for i in range(len(all_tasks))]

In [None]:
# Change the format of y_valid (valid set labels) to match the model:
# one one-hot encoded label array per task, in the order of `all_tasks`.
# NOTE(review): y_valid holds -1 for missing labels. to_categorical appears to
# index the one-hot column by the label value, so -1 would land in the last
# column (class 1) — confirm, and make sure such rows are masked in validation.
y_valid_nn = [to_categorical(y_valid[:,i]) for i in range(len(all_tasks))]

In [None]:
import os
# File the best model is written to; create its directory if needed.
filepath = "results/checkpoint.hdf5"  # "path/checkpoint.hdf5"
checkpoint_dir = os.path.dirname(filepath)
os.makedirs(checkpoint_dir, exist_ok=True)

# Checked every epoch; the file is overwritten only when the validation
# loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    period=1,
    verbose=1,
)

In [None]:
%%time
# train model
#
# Missing labels are stored as -1 in y_train / y_valid. to_categorical indexes
# the one-hot column by the label value, so a -1 lands in the last column and
# would otherwise be trained (and validated) as class 1 = toxic. Give those
# entries a sample weight of 0 so they contribute nothing to the loss, for
# each task output separately. The evaluation cells use the same `>= 0` mask.
train_label_mask = [(y_train[:, i] >= 0).astype('float32') for i in range(len(all_tasks))]
valid_label_mask = [(y_valid[:, i] >= 0).astype('float32') for i in range(len(all_tasks))]

history = deepnn.fit(x_train, y_train_nn,
                     sample_weight=train_label_mask,
                     epochs=train_epoch,
                     batch_size=batch,
                     shuffle=True,
                     # third tuple element = per-output validation sample weights,
                     # so val_loss (monitored by the checkpoint) is masked as well
                     validation_data=(x_valid, y_valid_nn, valid_label_mask),
                     callbacks=[checkpoint],
                     verbose=1
                     )

##### Load trained model from file

In [None]:
# Restore the weights written by the checkpoint callback, i.e. those of the
# epoch with the minimum validation loss.
deepnn.load_weights(filepath)

##### See Test set performance


In [None]:
# Predict on the test set with the trained model. The model built in this
# notebook is `deepnn`; `MTDNN` is never defined and raised a NameError on a
# fresh kernel. The result is indexed per task as y_test_pred[i] below.
y_test_pred = deepnn.predict(x_test)

In [None]:
# Quick check on the first task only.
i = 0

# Keep only test compounds that have a label for this task (missing = -1).
valid_datapoints = (y_test[:, i] >= 0)
y_test_task = y_test[valid_datapoints, i]

# Column 1 of the task's softmax output, i.e. the score for class 1.
y_test_pred_task = y_test_pred[i][valid_datapoints, 1]

# Round the scores to hard 0/1 predictions and compare with the labels.
y_test_label_task = np.round(y_test_pred_task)
acc = accuracy_score(y_test_task, y_test_label_task)
print('Accuracy for DNN on Morgan Fingerprint:', acc)

In [None]:
### For each of the 12 Tox21 tasks, computes on the test set:
###   auc, accuracy, balanced accuracy, F1, precision (pr), recall (rc) and the
###   row-normalised confusion-matrix rates (tn, fp, fn, tp).
### results[task] = [auc, acc, bacc, tn, tp, pr, rc, f1]

results = {}
for i in range(len(all_tasks)):

    # Keep only test compounds that have a label for this task (missing = -1).
    valid_datapoints = y_test[:,i] >= 0
    y_test_task = y_test[valid_datapoints,i]
    # Column 1 of the task's softmax output = score for class 1.
    y_test_pred_task = y_test_pred[i][valid_datapoints,1]
    # Hard 0/1 predictions, computed once and reused by every thresholded metric.
    y_test_label_task = np.round(y_test_pred_task)

    acc = accuracy_score(y_test_task, y_test_label_task)
    print('Accuracy for MTDNN on Morgan Fingerprint:', acc)

    bacc = sk.metrics.balanced_accuracy_score(y_test_task, y_test_label_task)

    f1 = f1_score(y_test_task, y_test_label_task, pos_label=1)
    print('F1 for MTDNN on Morgan Fingerprint:', f1)

    # labels=[0, 1] forces a 2x2 matrix even when a class is absent from both
    # the labels and the predictions, so the 4-way unpacking below cannot fail.
    cfm_counts = sk.metrics.confusion_matrix(y_test_task, y_test_label_task, labels=[0, 1])

    # Precision and recall must come from the raw counts: after row
    # normalisation tp / (tp + fp) is TPR / (TPR + FPR), which is not precision.
    tn_count, fp_count, fn_count, tp_count = cfm_counts.ravel()
    pr = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0
    rc = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0

    # Row-normalised matrix: each row (true class) sums to 1, so the values
    # printed below are rates, not counts.
    cfm = cfm_counts.astype('float') / cfm_counts.sum(axis=1)[:, np.newaxis]
    tn, fp, fn, tp = cfm.ravel()
    print(' True Positive:', tp)
    print(' True Negative:', tn)
    print('False Positive:', fp)
    print('False Negative:', fn)


    auc = roc_auc_score(y_test_task, y_test_pred_task)
    print('Test ROC AUC ({}):'.format(all_tasks[i]), auc)

    results[all_tasks[i]] = [auc, acc, bacc, tn, tp, pr, rc, f1]

    # ROC curve for this task, with the chance diagonal for reference.
    fpr, tpr, threshold = sk.metrics.roc_curve(y_test_task, y_test_pred_task)
    plt.plot(fpr, tpr, 'b', label = 'AUC')
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title(all_tasks[i])  # one figure per task; say which task it is
    plt.show()

In [None]:
### Prints the performance metrics of each model predicting a specific task
### within the 12 possible tasks for Tox21 (one row per task).

header_cols = ['  AUC ', ' ACC ', ' BACC ', ' TN  ', ' TP  ', ' PR  ', ' RC  ', ' F1  ']
print('Task'.ljust(10), '\t', *header_cols)

# Each value in `results` is the full list of metrics for that task.
for task, task_metrics in results.items():
    print(task.ljust(10), '\t', np.round(task_metrics, 3))