<a href="https://colab.research.google.com/github/GalJakob/Toxicity-prediction-WS/blob/main/SMILES_MFP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import io
from google.colab import files
import pandas as pd
import os
import time

dataset_name = "cardio" # change to cardio / tox21 / clintox
is_train_aug = 1 # 0 = non-augmented, 1 = augmented


if is_train_aug == 1:
  ds_train_aug = dataset_name + "_train_aug"
  ds_test = dataset_name + "_test"
  path_train = f"https://raw.githubusercontent.com/GalJakob/Toxicity-prediction-WS/main/datasets/train%20datasets/{ds_train_aug}.csv"
  path_test = f"https://raw.githubusercontent.com/GalJakob/Toxicity-prediction-WS/main/datasets/test%20datasets/{ds_test}.csv"

  try: #getting data from github
    test_data = pd.read_csv(path_test)
    train_data = pd.read_csv(path_train)

  except: #uploading data instead from github
    data = files.upload()
    data1 = io.BytesIO(data[ds_train_aug])
    data2 = io.BytesIO(data[ds_test])
    train_data = pd.read_csv(data1)
    test_data = pd.read_csv(data2)

else:
  ds_train = dataset_name + "_train"
  ds_test = dataset_name + "_test"
  path_train = f"https://raw.githubusercontent.com/GalJakob/Toxicity-prediction-WS/main/datasets/train%20datasets/{ds_train}.csv"
  path_test = f"https://raw.githubusercontent.com/GalJakob/Toxicity-prediction-WS/main/datasets/test%20datasets/{ds_test}.csv"
  try: #getting data from github
    test_data = pd.read_csv(path_test)
    train_data = pd.read_csv(path_train)

  except: #uploading data instead from github
    data = files.upload()
    data1 = io.BytesIO(data[ds_train])
    data2 = io.BytesIO(data[ds_test])
    train_data = pd.read_csv(data1)
    test_data = pd.read_csv(data2)

In [None]:
!pip install rdkit

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import tensorflow as tf
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.losses import mean_absolute_error
from tensorflow.keras.layers import Dense, Input, Activation
from tensorflow.keras.layers import BatchNormalization, Add, Dropout
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
from tensorflow.keras.optimizers import Adam, Adadelta, SGD

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve

np.random.seed(42)
tf.random.set_seed(42)
tf.keras.utils.set_random_seed(42)

In [None]:
# Preprocessing
def preprocess_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return mol

train_data['mol'] = train_data['smiles'].apply(preprocess_smiles).dropna()
test_data['mol'] = test_data['smiles'].apply(preprocess_smiles).dropna()

def generate_fingerprint(mol):
    if dataset_name == 'cardio':
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=1, nBits=1024, useFeatures=True, useChirality=True)
    elif dataset_name == 'clintox':
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048, useFeatures=True, useChirality=True)
    else: # tox21
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=1, nBits=2048, useFeatures=True, useChirality=True)

    return fingerprint


train_data['fingerprint'] = train_data['mol'].apply(generate_fingerprint)
test_data['fingerprint'] = test_data['mol'].apply(generate_fingerprint)

In [None]:
# Splitting Data
X_train = np.array(train_data['fingerprint'].tolist())
y_train = np.array(train_data['label'])
X_test = np.array(test_data['fingerprint'].tolist())
y_test = np.array(test_data['label'])

length = X_train.shape[1]

# Calculate class weights
from sklearn.utils import compute_class_weight
train_l = train_data['label']
cw = compute_class_weight(
    class_weight = "balanced",
    classes = np.unique(train_l),
    y = train_l
)
class_weights = dict(zip(np.unique(train_l), cw))

#DNN

In [None]:
def create_nn_model(input_shape):
    # input layer
    inp = Input(shape = (input_shape,))

    # first hidden layer
    x = Dense(256, kernel_initializer = 'he_normal')(inp)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # second hidden layer
    x = Dense(512, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # third hidden layer
    x = Dense(1024, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # fourth hidden layer
    x = Dense(512, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # fifth hidden layer
    x = Dense(256, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # sixth hidden layer
    x = Dense(128, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # seventh hidden layer
    x = Dense(64, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha = 0.05)(x)
    x = Dropout(0.2)(x)

    # output layer
    out = Dense(1, activation = 'sigmoid')(x)
    model = Model(inputs = inp, outputs = out)
    return model

In [None]:
nn_model = create_nn_model(length)
nn_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['AUC'])

nn_model.fit(X_train, y_train, epochs=40, batch_size=32,
           class_weight=class_weights)


In [None]:

y_pred_probs = nn_model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_probs)
prc_auc = average_precision_score(y_test, y_pred_probs)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC-ROC:", roc_auc)
print("PR-PRC:", prc_auc)

#XGB

In [None]:
if dataset_name == 'clintox':
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        max_depth=8,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.6,
        colsample_bytree=0.7,
        random_state=42,
        reg_lambda=0,
        reg_alpha=0,
        min_child_weight=1,
        tree_method='gpu_hist',
        gpu_id=0
)

elif dataset_name == 'cardio':
    xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=5,
    learning_rate=0.3,
    n_estimators=1000,
    subsample=1.0,
    colsample_bytree=0.6,
    random_state=42,
    reg_lambda=1,
    reg_alpha=0,
    min_child_weight=1,
    tree_method='gpu_hist',
    gpu_id=0
)

else: # tox21
    xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=9,
    learning_rate=0.1,
    n_estimators=500,
    subsample=0.6,
    colsample_bytree=0.6,
    random_state=42,
    reg_lambda=1,
    reg_alpha=0.1,
    min_child_weight=1,
    tree_method='gpu_hist',
    gpu_id=0,
)
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
prc_auc = average_precision_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC-ROC:", roc_auc)
print("PR-PRC:", prc_auc)

Accuracy: 0.8612903225806452
Precision: 0.6578947368421053
Recall: 0.746268656716418
AUC-ROC: 0.8196363859713778
PR-PRC: 0.5458049312013785
