<a href="https://colab.research.google.com/github/GalJakob/Toxicity-prediction-WS/blob/main/SMILES_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
from google.colab import files

dataset_name = "tox21" # change to cardio / tox21 / clintox
aug_type = 1 # 0 = non-augmented, 1 = only train aug

# Pick the training CSV that matches the augmentation mode. Anything other
# than 0 or 1 is rejected here, instead of leaving data1/data2 undefined and
# failing later with a NameError.
if aug_type == 1:
  ds_train_aug = dataset_name + "_train_aug.csv"
  train_file = ds_train_aug
elif aug_type == 0:
  ds_train = dataset_name + "_train.csv"
  train_file = ds_train
else:
  raise ValueError(f"aug_type must be 0 or 1, got {aug_type!r}")

ds_test = dataset_name + "_test.csv"

# files.upload() returns a mapping that is indexed below by file name; both
# expected CSVs must be among its keys.
data = files.upload()
missing_files = [name for name in (train_file, ds_test) if name not in data]
if missing_files:
  raise KeyError(f"expected uploaded file(s) not found: {missing_files}; got {list(data)}")

# Wrap the raw bytes in file-like buffers: data1 = train split, data2 = test split.
data1 = io.BytesIO(data[train_file])
data2 = io.BytesIO(data[ds_test])

Saving tox21_test.csv to tox21_test.csv
Saving tox21_train_aug.csv to tox21_train_aug.csv


In [None]:
import pandas as pd
import os
import time
# data1 / data2 are in-memory byte buffers; reading them moves their position
# to the end. Rewind first so that re-running this cell parses the full CSVs
# again instead of failing on an exhausted buffer.
data1.seek(0)
data2.seek(0)

train_data = pd.read_csv(data1)
test_data = pd.read_csv(data2)

# Show (rows, columns) of the train and test frames.
print(train_data.shape)
print(test_data.shape)

(10487, 2)
(1567, 2)


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Seed the NumPy and TensorFlow global random generators with one fixed value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Convert each SMILES string into a sequence of integer character ids.
# lower=False matters: Tokenizer lowercases its input by default, and SMILES is
# case-sensitive (e.g. "C" is an aliphatic carbon, "c" an aromatic one), so the
# default would merge chemically different symbols into a single token.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(train_data['smiles'])

# The vocabulary is fitted on the training split only. No oov_token is set, so
# characters that appear only in the test split are skipped when encoding it.
train_sequences = tokenizer.texts_to_sequences(train_data['smiles'])
test_sequences = tokenizer.texts_to_sequences(test_data['smiles'])

# Length of the longest tokenized training sequence (-1 if there are none).
mx = max((len(seq) for seq in train_sequences), default=-1)

print("mx=", mx)

# Pad sequences to a fixed, dataset-specific length (125 for any dataset not
# listed). pad_sequences pads and truncates at the front by default, so
# sequences longer than max_sequence_length lose their leading characters.
max_length_by_dataset = {'clintox': 200, 'tox21': 150}
max_sequence_length = max_length_by_dataset.get(dataset_name, 125)

train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)


print(train_sequences.shape)

train_labels = train_data['label'].values
test_labels = test_data['label'].values

# No validation split is carved out of the training data in this cell.

mx= 342
(10487, 200)


In [None]:
# Vocabulary size for the embedding: one id per character known to the
# tokenizer, plus one extra slot for the padding token.
input_dim = len(tokenizer.word_index) + 1
embedding_dim = 64
hidden_units = 256

# Character embedding -> three stacked bidirectional GRU layers -> sigmoid output.
model = keras.Sequential()
model.add(layers.Embedding(input_dim, embedding_dim, input_length=max_sequence_length))
# The first two recurrent layers emit the full sequence so the next GRU can
# consume it; the last one emits only its final state.
model.add(layers.Bidirectional(layers.GRU(hidden_units, return_sequences=True)))
model.add(layers.Bidirectional(layers.GRU(hidden_units, return_sequences=True)))
model.add(layers.Bidirectional(layers.GRU(hidden_units)))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

In [None]:
class TestSetEvaluationCallback(keras.callbacks.Callback):
    """Print classification metrics on an evaluation set every few epochs.

    The metrics are only printed; nothing here feeds back into training.

    Args:
        sequences: Padded input sequences to evaluate on. If None, the
            notebook-level ``test_sequences`` is looked up when the callback fires.
        labels: Binary labels aligned with ``sequences``. If None, the
            notebook-level ``test_labels`` is looked up when the callback fires.
        every_n_epochs: Evaluate after every this-many completed epochs.
    """

    def __init__(self, sequences=None, labels=None, every_n_epochs=10):
        super().__init__()
        self._eval_sequences = sequences
        self._eval_labels = labels
        self._every_n_epochs = every_n_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model being trained and print the metrics.

        ``epoch`` is zero-based, hence the ``epoch + 1`` below.
        """
        if (epoch + 1) % self._every_n_epochs != 0:
            return

        sequences = test_sequences if self._eval_sequences is None else self._eval_sequences
        labels = test_labels if self._eval_labels is None else self._eval_labels

        # self.model is the model this callback is attached to by fit(); using
        # it (rather than a global named `model`) keeps the callback correct
        # even if that global is rebound or the callback is reused.
        y_pred_probs = self.model.predict(sequences).ravel()
        y_pred = np.round(y_pred_probs)

        accuracy = accuracy_score(labels, y_pred)
        precision = precision_score(labels, y_pred)
        recall = recall_score(labels, y_pred)
        roc_auc = roc_auc_score(labels, y_pred_probs)
        pr_auc = average_precision_score(labels, y_pred_probs)

        print(f"Epoch {epoch+1} - Test Set Metrics:")
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("AUC-ROC:", roc_auc)
        print("AUC-PRC:", pr_auc)

test_evaluation_callback = TestSetEvaluationCallback(test_sequences, test_labels)

In [None]:
# Fit the model on the training sequences; the callback prints test-set
# metrics periodically while training runs.
batch_size, epochs = 32, 40

model.fit(
    train_sequences,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[test_evaluation_callback],
)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 10 - Test Set Metrics:
Accuracy: 0.8519463943841736
Precision: 0.11731843575418995
Recall: 0.22105263157894736
AUC-ROC: 0.7174270594965675
AUC-PRC: 0.15211496064154134
Epoch 11/40
Epoch 12/40
Epoch 13/40

In [None]:
# evaluation

# Model outputs for every test sequence, and the hard 0/1 predictions obtained
# by rounding them (np.round rounds halves to even, so exactly 0.5 becomes 0).
y_pred_probs = model.predict(test_sequences)
y_pred = np.round(y_pred_probs)

accuracy = accuracy_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
roc_auc = roc_auc_score(test_labels, y_pred_probs)
pr_auc = average_precision_score(test_labels, y_pred_probs)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC-ROC:", roc_auc)
print("AUC-PRC:", pr_auc)


# Confusion-matrix counts, computed with vectorized comparisons on flattened
# arrays instead of a per-element Python loop over one-element array rows.
labels_flat = np.asarray(test_labels).ravel()
preds_flat = np.asarray(y_pred).ravel()
predicted_positive = preds_flat == 1
predicted_negative = preds_flat == 0

TP = int(np.sum(predicted_positive & (labels_flat == 1)))
FP = int(np.sum(predicted_positive & (labels_flat != 1)))
TN = int(np.sum(predicted_negative & (labels_flat == 0)))
FN = int(np.sum(predicted_negative & (labels_flat != 0)))

# Printed as a 2x2 table: rows are the actual class (positive, then negative),
# columns are the predicted class (positive, then negative).
print(TP, FN)
print(FP, TN)

Accuracy: 0.9221442246330568
Precision: 0.3246753246753247
Recall: 0.2631578947368421
AUC-ROC: 0.7717534324942792
AUC-PRC: 0.25706789035624544
25 70
52 1420
