### SMART Prototype Infrastructure

---

Feel free to try out our model by training it and checking its accuracy.

In [1]:
#Written by Muhammad-Tameem Mughal with assistance from Darien Schettler

import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

import sklearn
from sklearn.model_selection import train_test_split
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
import sys

In [2]:
# Load the toxicity table from the Kaggle input directory and preview its first rows.
# (pandas is already imported in the first cell; the re-import here is harmless.)
import pandas as pd
tox_df = pd.read_csv('/kaggle/input/tcfinalit2/tcfinalit2.csv')
tox_df.head()

Unnamed: 0,smiles,SR-HSE,NR-AR,SR-ARE,NR-Aromatase,NR-ER-LBD,NR-AhR,SR-MMP,NR-ER,NR-PPAR-gamma,SR-p53,SR-ATAD5,NR-AR-LBD,Toxic
0,C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N...,0,2,2,2,2,2,2,2,2,2,2,2,0
1,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,0,2,2,2,2,2,2,2,2,2,2,2,0
2,CO[C@H]1CC(O[C@H]2C[C@H]([C@H]3O[C@](C)(O)[C@H...,0,2,2,2,2,2,2,2,2,2,2,2,0
3,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,1,2,2,2,2,2,2,2,2,2,2,2,0
4,CC(=O)O.CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=...,2,0,2,2,2,2,2,2,2,2,2,2,0


In [3]:
# Show the column names of the loaded table.
print(tox_df.columns)

Index(['smiles', 'SR-HSE', 'NR-AR', 'SR-ARE', 'NR-Aromatase', 'NR-ER-LBD',
       'NR-AhR', 'SR-MMP', 'NR-ER', 'NR-PPAR-gamma', 'SR-p53', 'SR-ATAD5',
       'NR-AR-LBD', 'Toxic'],
      dtype='object')


In [4]:
# Column groups of the toxicity table.

# Column holding the SMILES text representation.
SMILES_COL = ["smiles"]

# Columns holding the twelve chemical test results (one-hot encoded later as features).
FEATURE_COLS = [
    'SR-HSE',
    'NR-AR',
    'SR-ARE',
    'NR-Aromatase',
    'NR-ER-LBD',
    'NR-AhR',
    'SR-MMP',
    'NR-ER',
    'NR-PPAR-gamma',
    'SR-p53',
    'SR-ATAD5',
    'NR-AR-LBD',
]

# Column holding the label.
LABEL_COLS = ["Toxic"]

**We want to convert the chemical text data to one hot representations and then stack those representations into a single array per example**

One-hot encoding represents a categorical ("sparse") value as a binary ("dense") array with a single 1 at the position of that value.
* For a test with 3 possible values [0,1,2], we have the following possible one-hot 'values':

```
  * 0 --> [1, 0, 0]
  * 1 --> [0, 1, 0]
  * 2 --> [0, 0, 1]
```

* Now consider that we have 12 tests, each with 3 possible sparse values. We can represent a single *example's* worth of tests with a 12x3 matrix or an array of length 36.
  * Each test can be represented w/ a 1x3 matrix (the one-hot representation)
  * Then we just stack them

In [5]:
# Toy illustration of one-hot encoding: show a single categorical column,
# then the indicator columns that pd.get_dummies derives from it.
car_demo_df = pd.DataFrame(data=["dodge", "toyota", "mazda"], columns=["car_type"])
display(car_demo_df)
pd.get_dummies(car_demo_df)

Unnamed: 0,car_type
0,dodge
1,toyota
2,mazda


Unnamed: 0,car_type_dodge,car_type_mazda,car_type_toyota
0,1,0,0
1,0,0,1
2,0,1,0


In [6]:
# One-hot encode every test column. The values are cast to str first so that
# get_dummies treats the numeric codes as categories instead of passing them
# through unchanged. The dtype is pinned to uint8 because pandas >= 2.0 returns
# bool indicator columns by default; uint8 matches the older default and keeps
# the feature matrix numeric regardless of the installed pandas version.
tox_df_features = pd.get_dummies(tox_df[FEATURE_COLS].astype(str), dtype=np.uint8)

# Number of one-hot feature columns; used as the width of the model's test input.
test_shape = tox_df_features.shape[-1]

In [7]:
def flatten_l_o_l(nested_list):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; items that are themselves
    containers are kept as-is.
    """
    flat_items = []
    for inner in nested_list:
        flat_items.extend(inner)
    return flat_items

In [8]:
# Per-row character lists and lengths of the SMILES strings.
tox_df["smile_chars"] = tox_df["smiles"].apply(list)
tox_df["smile_len"] = tox_df["smiles"].apply(len)
max_smile_len = tox_df["smile_len"].max()

# Character vocabulary, in the order produced by value_counts
# (descending frequency by default).
tox_charlist = tox_df["smile_chars"].to_list()
all_smile_chars = flatten_l_o_l(tox_charlist)
smile_char_counts = pd.Series(all_smile_chars).value_counts()
possible_smile_chars = list(smile_char_counts.keys())

# Vocabulary size includes one extra slot for the padding token.
n_smile_elems = 1 + len(possible_smile_chars)

# Character <-> integer id lookups; ids start at 1 because 0 is reserved for "<PAD>".
smile_char_map_c2i = {
    char: char_id for char_id, char in enumerate(possible_smile_chars, start=1)
}
smile_char_map_i2c = {char_id: char for char, char_id in smile_char_map_c2i.items()}
smile_char_map_c2i["<PAD>"] = 0
smile_char_map_i2c[0] = "<PAD>"

In [9]:
def encode_smiles(smile_chars, max_len, pad_token="<PAD>", pad_int=0, char_map=None):
    """Encode a sequence of SMILES characters as a fixed-length list of integer ids.

    Args:
        smile_chars: Iterable of single characters to encode.
        max_len: Length of the returned list. Longer inputs are truncated,
            shorter ones are right-padded with ``pad_int``.
        pad_token: Unused; kept so existing callers that pass it keep working.
        pad_int: Integer id used for padding positions.
        char_map: Optional character -> id mapping. When omitted, the
            module-level ``smile_char_map_c2i`` is used, as before.

    Returns:
        A list of integer ids, truncated or right-padded to ``max_len``.

    Raises:
        KeyError: If a character is missing from the mapping.
    """
    if char_map is None:
        # Fall back to the notebook-level vocabulary built in an earlier cell.
        char_map = smile_char_map_c2i

    smile_encoding = [char_map[_c] for _c in smile_chars][:max_len]
    # Pad only the missing tail instead of appending max_len pads and slicing;
    # a non-positive count multiplies to an empty list, so nothing is added.
    smile_encoding.extend([pad_int] * (max_len - len(smile_encoding)))
    return smile_encoding

# Encode every row's character list to integer ids, padded/truncated to the longest SMILES length.
tox_df["smile_encoding"] = tox_df["smile_chars"].apply(lambda x: encode_smiles(smile_chars=x, max_len=max_smile_len))

### Our Model

---

Inputs:
* Smile Information
* Test Information
    * Vector of length 36 (12 tests one hot encoded)

Outputs:
* Toxicity

In [10]:
def get_model(test_shape, n_vocab, smiles_seq_len, n_test_dense=64, n_test_smiles=64, dropout=0.25):
    """Build the two-input toxicity classifier.

    One branch passes the one-hot test vector through a ReLU dense layer; the
    other embeds the integer-encoded SMILES sequence and averages the
    embeddings over the sequence axis. Both branches are concatenated, passed
    through dropout, and mapped to a single sigmoid output.

    Args:
        test_shape: Width of the one-hot test feature vector.
        n_vocab: Number of distinct SMILES token ids (including padding id 0).
        smiles_seq_len: Length of each encoded SMILES sequence.
        n_test_dense: Units in the dense layer of the test branch.
        n_test_smiles: Embedding dimension of the SMILES branch.
        dropout: Dropout rate applied to the concatenated features.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are ordered
        ``[smiles_input, test_input]``.
    """
    # Path 1: one-hot test results
    test_input = tf.keras.layers.Input(shape=(test_shape,))
    test_output = tf.keras.layers.Dense(n_test_dense, activation="relu")(test_input)

    # Path 2: SMILES token ids. The sequence length is already fixed by the
    # Input shape, so the deprecated `input_length` argument is not passed.
    # mask_zero=True marks id 0 as padding for mask-aware downstream layers.
    smiles_input = tf.keras.layers.Input(shape=(smiles_seq_len,))
    smiles_output = tf.keras.layers.Embedding(n_vocab, n_test_smiles, mask_zero=True)(smiles_input)
    smiles_output = tf.keras.layers.GlobalAveragePooling1D()(smiles_output)

    # Merge both branches and regularise before the output layer
    combined_output = tf.keras.layers.Concatenate()([test_output, smiles_output])
    combined_output = tf.keras.layers.Dropout(dropout)(combined_output)

    # Input order must match the order of the arrays passed to fit/predict
    _inputs = [smiles_input, test_input]
    _outputs = tf.keras.layers.Dense(1, activation="sigmoid")(combined_output)

    return tf.keras.Model(inputs=_inputs, outputs=_outputs)

# Build the model from the one-hot feature width, the vocabulary size and the longest SMILES length,
# then print its layer summary.
model = get_model(test_shape=test_shape, n_vocab=n_smile_elems, smiles_seq_len=max_smile_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 342)]        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 342, 64)      3584        input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 64)           2368        input_1[0][0]                    
______________________________________________________________________________________________

2022-05-28 03:53:22.650826: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [11]:
# Training configuration
OPTIMIZER = "adam"
LOSS = "binary_crossentropy"
METRICS = "acc"

# Only accuracy is tracked; tf.keras.metrics.AUC() was noted as a possible extra metric.
model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=[METRICS],
)

In [12]:
# Number of rows held out for validation
n_val = 1758

# Randomly choose n_val distinct row positions for validation; every other
# row is used for training. (random.sample raises ValueError if n_val exceeds
# the number of rows.)
val_indices = np.array(random.sample(range(len(tox_df)), n_val))
# Membership is tested against a set: `x in ndarray` rescans the whole array
# for every row, which is needlessly quadratic.
val_index_set = set(val_indices.tolist())
train_indices = np.array(
    [row for row in range(len(tox_df)) if row not in val_index_set],
    dtype=int,  # keeps an integer dtype even if no rows are left for training
)

# Full dataset as numpy arrays, one row per example
all_x_test = tox_df_features.to_numpy()
all_x_smiles = np.array(tox_df.smile_encoding.to_list())
all_y = tox_df.Toxic.to_numpy()

# Validation subset (integer-array indexing returns copies, not views)
val_x_test = all_x_test[val_indices]
val_x_smiles = all_x_smiles[val_indices]
val_y = all_y[val_indices]
N_VAL = len(val_y)

# Training subset: all rows that were not sampled for validation
train_x_test = all_x_test[train_indices]
train_x_smiles = all_x_smiles[train_indices]
train_y = all_y[train_indices]
N_TRAIN = len(train_y)

# The two subsets must partition the dataset
assert N_TRAIN + N_VAL == len(tox_df)

In [13]:
# Training schedule
BATCH_SIZE = 64
N_EPOCHS = 10

# Inputs are passed in the same order as the model's inputs: (smiles, tests).
# NOTE: no class_weight is passed to fit.
history = model.fit(
    x=(train_x_smiles, train_x_test),
    y=train_y,
    validation_data=((val_x_smiles, val_x_test), val_y),
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
)

2022-05-28 03:53:23.250186: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Persist the trained model to 'smartmodel.h5' in the current working directory.
model.save('smartmodel.h5')