# **Examples of Toxicity Prediction**

This section provides examples of toxicity prediction.

**Install RDKit.**

RDKit is an open-source cheminformatics toolkit with a Python API for working with chemical structures and data.

In [None]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


**Install the other required packages.**

In [None]:
! pip install py3Dmol

Collecting py3Dmol
  Downloading py3Dmol-2.1.0-py2.py3-none-any.whl (12 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.1.0


In [None]:
! pip install mol2vec gensim

Collecting mol2vec
  Downloading mol2vec-0.2.2-py3-none-any.whl (15 kB)
Collecting jedi>=0.16 (from IPython->mol2vec)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, mol2vec
Successfully installed jedi-0.19.1 mol2vec-0.2.2


In [None]:
! pip install xgboost
! pip install lightgbm



**Import necessary libraries and modules.**

In [None]:
import pandas as pd
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import tensorflow as tf
import warnings
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
import py3Dmol
from ipywidgets import interact,fixed,IntSlider
import ipywidgets
from rdkit.Chem import Crippen
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem import MolFromSmiles
from mol2vec.features import mol2alt_sentence, MolSentence
import gensim
import pickle
from mol2vec.features import mol2sentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
from gensim.models import Word2Vec
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from sklearn.pipeline import Pipeline

**Load models**

In [None]:
# Assay columns; one saved model file (model_6_<column>.pkl) is looked up per
# column. Defined here so that this cell runs on a fresh kernel instead of
# relying on a definition that only appears further down the notebook.
target_columns = ['SR-HSE', 'NR-AR', 'SR-ARE', 'NR-Aromatase', 'NR-ER-LBD', 'NR-AhR', 'SR-MMP', 'NR-ER', 'NR-PPAR-gamma', 'SR-p53', 'SR-ATAD5', 'NR-AR-LBD']

# NOTE(review): this path looks like a mounted Google Drive folder -- confirm
# that the drive is mounted before this cell runs.
save_dir = "/content/drive/My Drive/saved_models/model_6"

# joblib.load unpickles arbitrary objects: only load model files you trust.
models_6 = {}
for col in target_columns:
    model_path = os.path.join(save_dir, f"model_6_{col}.pkl")
    if os.path.exists(model_path):
        model_loaded = joblib.load(model_path)
        models_6[col] = model_loaded
        print(f"Model for {col} loaded successfully.")
    else:
        # A missing file is reported but not fatal; the column is simply
        # absent from models_6.
        print(f"Model file for {col} does not exist.")

Model for SR-HSE loaded successfully.
Model for NR-AR loaded successfully.
Model for SR-ARE loaded successfully.
Model for NR-Aromatase loaded successfully.
Model for NR-ER-LBD loaded successfully.
Model for NR-AhR loaded successfully.
Model for SR-MMP loaded successfully.
Model for NR-ER loaded successfully.
Model for NR-PPAR-gamma loaded successfully.
Model for SR-p53 loaded successfully.
Model for SR-ATAD5 loaded successfully.
Model for NR-AR-LBD loaded successfully.


*3D visualization*

In [None]:
def show3D_molecule(idx, style):
    """
    Render one molecule from the module-level DataFrame ``df`` with py3Dmol.

    Parameters
    ----------
    idx : int
        Row *position* in ``df`` (used with ``.iloc``), not an index label.
    style : str
        Key of the 3Dmol style dictionary, e.g. 'line', 'stick' or 'sphere'.

    Returns
    -------
    Whatever ``viewer.show()`` returns; the viewer is displayed as a side
    effect and the molecule's SMILES string is printed.

    NOTE(review): the coordinates come from the mol block of the stored
    ``ROMol``; if those molecules carry no 3D conformer the rendering will be
    flat -- confirm how ``df['ROMol']`` was built.
    """
    mol_block = Chem.MolToMolBlock(df['ROMol'].iloc[idx])
    viewer = py3Dmol.view(width=300, height=300)
    # The second argument is the data *format*; a mol block is parsed by the
    # 'sdf' reader. ('ROMol' is a DataFrame column name, not a format.)
    viewer.addModel(mol_block, 'sdf')
    viewer.setStyle({style: {}})
    viewer.rotate(45, "y", animationDuration=1)

    viewer.zoomTo()

    print(f"SMILES notation: {df['SMILES'].iloc[idx]}\nRotate me!")

    return viewer.show()

# Slider over every row position of df; starts at position 3064.
molecule_slider = ipywidgets.IntSlider(
    min=0,
    max=len(df["ROMol"]) - 1,
    step=1,
    value=3064,
    description="Molecule",
)

# Drop-down selecting the drawing style handed to show3D_molecule.
style_picker = ipywidgets.Dropdown(
    options=['line', 'stick', 'sphere'],
    value='stick',
    description='Style:',
)

# Trailing semicolon suppresses the repr of interact's return value.
interact(show3D_molecule, idx=molecule_slider, style=style_picker);

interactive(children=(IntSlider(value=3064, description='Molecule', max=11763), Dropdown(description='Style:',…

**Plot random examples with their predicted and actual toxicity.**

In [None]:
def plot_example(models, X_test, y_test, df, idx=0, show_3d=True):
    """
    Show one test sample and compare every model's prediction with a label.

    Parameters
    ----------
    models : dict
        Maps an assay column name to a fitted model exposing ``predict``.
    X_test : array-like or DataFrame
        Test feature matrix; row ``idx`` is the sample that is predicted.
    y_test
        Accepted for interface compatibility; not used by this function.
    df : pandas.DataFrame
        Frame holding the 'ROMol' and 'SMILES' columns and the assay columns.
    idx : int
        Row position of the sample inside ``X_test``.
    show_3d : bool
        If True render with ``show3D_molecule``, otherwise display the
        ``ROMol`` object directly.

    The molecule is displayed once, followed by one "Target / Predictions"
    line per model.

    NOTE(review): the molecule and the target are both looked up by *position*
    ``idx`` (in ``df`` and in the NaN-filtered copy of ``df`` respectively).
    That only identifies the same sample as ``X_test[idx]`` if the test rows
    are ordered like those frames -- confirm against the train/test split;
    ``y_test`` is presumably the authoritative source of labels.
    """
    non_feature_columns = ['Formula', 'FW', 'DSSTox_CID', 'ID', 'ROMol', 'SMILES', 'mol2vec', 'sentence'] + target_columns
    feature_columns = df.columns.difference(non_feature_columns)

    # Positional index from here on: any index carried by X_test is discarded.
    X_test_df = pd.DataFrame(X_test, columns=feature_columns).reset_index(drop=True)
    position = X_test_df.index[idx]

    # Show the molecule once rather than once per model. All lookups in df are
    # positional so that the viewer (which uses .iloc) and the SMILES / 2D
    # depiction refer to the same row.
    if show_3d:
        # show3D_molecule prints the SMILES string itself.
        show3D_molecule(position, "stick")
    else:
        display(df['ROMol'].iloc[position])
        print(f"SMILES notation: {df['SMILES'].iloc[position]}")

    # The sample is the same for every model, so build it once.
    sample = np.expand_dims(X_test_df.iloc[idx], 0)

    for col, model in models.items():
        prediction = np.squeeze(model.predict(sample)[0])

        # Rows without a label for this assay are dropped and the remainder
        # renumbered before the positional lookup of the target.
        labelled_df = df.dropna(subset=[col]).reset_index(drop=True)
        target = labelled_df.loc[position, col]

        print(f"Target {col} = {target}, Predictions {col} = {prediction}")

    print("="*50)

In [None]:
# Seeded generator so that "Restart & Run All" shows the same example(s).
rng = np.random.default_rng(42)
indices = rng.choice(len(X_test), size=1)
for idx in indices:
    plot_example(models_6, X_test, y_test, df, idx=idx, show_3d=True)

SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target SR-HSE = 0, Predictions SR-HSE = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-AR = 0, Predictions NR-AR = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target SR-ARE = 0, Predictions SR-ARE = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-Aromatase = 0, Predictions NR-Aromatase = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-ER-LBD = 0, Predictions NR-ER-LBD = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-AhR = 0, Predictions NR-AhR = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target SR-MMP = 0, Predictions SR-MMP = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-ER = 0, Predictions NR-ER = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-PPAR-gamma = 0, Predictions NR-PPAR-gamma = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target SR-p53 = 1, Predictions SR-p53 = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target SR-ATAD5 = 0, Predictions SR-ATAD5 = 0
SMILES notation: CC(C)(CO)[C@@H](O)C(=O)NCCCO
Rotate me!


SMILES notation: CN(C(=O)CN(CCO)CC(=O)N(C)C(C)(C)Cc1ccccc1)C(C)(C)Cc1ccccc1
Target NR-AR-LBD = 0, Predictions NR-AR-LBD = 0


**Example of Toxicity Predictions**

*Preprocess.*

In [None]:
# Local sentence-embedding helper. Note that this definition shadows the
# `sentences2vec` imported from mol2vec.features in the import cell.

def sentences2vec(sentences, model_2vec, unseen=None):
    """
    Embed each sentence as the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Each inner iterable is one sentence (a sequence of word identifiers).
    model_2vec
        Object exposing ``wv`` (indexable by word, with ``key_to_index``) and
        ``vector_size``.
    unseen : str, optional
        Word whose vector stands in for out-of-vocabulary words. When falsy,
        out-of-vocabulary words are skipped.

    Returns
    -------
    list
        One vector per sentence; a zero vector of length ``vector_size`` for
        sentences that contribute no word vectors.
    """
    embeddings = model_2vec.wv
    vocabulary = set(embeddings.key_to_index)

    def embed(sentence):
        # Known words use their own vector; unknown words use the `unseen`
        # vector when one was requested and are dropped otherwise.
        word_vectors = [
            embeddings[token] if token in vocabulary else embeddings[unseen]
            for token in sentence
            if token in vocabulary or unseen
        ]
        if not word_vectors:
            return np.zeros(model_2vec.vector_size)
        return np.mean(word_vectors, axis=0)

    return [embed(sentence) for sentence in sentences]


def preprocess_smiles(smiles, model):
    """
    Convert a SMILES string into a single numeric feature vector.

    The vector is eight RDKit descriptors (in the order listed below)
    followed by the sentence embedding returned by ``sentences2vec``.

    Parameters
    ----------
    smiles : str
        SMILES notation of the molecule.
    model
        Embedding model used for the Mol2vec part. It is used when it exposes
        a ``wv`` attribute; any other object (for example a classifier passed
        by an older call site) is ignored and the module-level ``model_2vec``
        is used instead, which is what this function always did before.

    Returns
    -------
    numpy.ndarray or None
        The concatenated features, or None if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Descriptor order is part of the feature layout -- do not reorder.
    descriptors = [
        Descriptors.MolLogP(mol),
        Descriptors.MolWt(mol),
        Descriptors.TPSA(mol),
        Descriptors.HeavyAtomCount(mol),
        Descriptors.NumHeteroatoms(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol)
    ]

    # Molecule -> "sentence" of substructure identifiers (radius 1).
    sentence = MolSentence(mol2alt_sentence(mol, 1))

    # Honour the `model` argument when it is an embedding model; otherwise
    # fall back to the global so existing callers keep working.
    embedding_model = model if hasattr(model, "wv") else model_2vec

    # Identifiers missing from the vocabulary are mapped to the 'UNK' vector.
    vec = sentences2vec([sentence], embedding_model, unseen='UNK')[0]

    # Descriptors first, then the embedding.
    features = np.concatenate([descriptors, vec])

    return features

*Load models and make predictions.*

In [None]:
url = 'https://github.com/samoturk/mol2vec_notebooks/raw/master/Notebooks/model_300dim.pkl'
mol2vec_path = 'model_300dim.pkl'

# Download only when the file is not already present, so re-running the
# notebook does not fetch it again.
if not os.path.exists(mol2vec_path):
    file_to_download = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the model.
    file_to_download.raise_for_status()

    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file behind under the final name.
    partial_path = mol2vec_path + '.part'
    with open(partial_path, 'wb') as model_file:
        model_file.write(file_to_download.content)
    os.replace(partial_path, mol2vec_path)

# NOTE: Word2Vec.load unpickles the file; only load it from a source you trust.
model_2vec = Word2Vec.load(mol2vec_path)

# Assay columns; one saved model file is looked up per column.
target_columns = [
    'SR-HSE', 'NR-AR', 'SR-ARE', 'NR-Aromatase', 'NR-ER-LBD', 'NR-AhR',
    'SR-MMP', 'NR-ER', 'NR-PPAR-gamma', 'SR-p53', 'SR-ATAD5', 'NR-AR-LBD',
]

save_dir = "/content/drive/My Drive/saved_models/model_6"

# Silence the feature-name warning emitted when plain arrays are passed to a
# model that was fitted with named columns.
warnings.filterwarnings("ignore", message="X does not have valid feature names, but SVC was fitted with feature names")

# Load whichever per-column model files exist; missing ones are only reported.
models_6 = {}
for col in target_columns:
    model_path = os.path.join(save_dir, f"model_6_{col}.pkl")
    if not os.path.exists(model_path):
        print(f"Model file for {col} does not exist.")
        continue
    models_6[col] = joblib.load(model_path)
    print(f"Model for {col} loaded successfully.")

def predict_models(features, models):
    """
    Run every model on ``features`` and collect the first prediction of each.

    Parameters
    ----------
    features : array-like
        Input passed unchanged to each model's ``predict``.
    models : dict
        Maps a column name to an object exposing ``predict``.

    Returns
    -------
    dict
        Column name -> first element of that model's prediction.
    """
    predictions = {}
    for col, model in models.items():
        # np.ravel makes this work for both 1-D output of shape (n,) and 2-D
        # output of shape (n, 1); plain `[0][0]` fails on 1-D output.
        predictions[col] = np.ravel(model.predict(features))[0]
    return predictions

Model for SR-HSE loaded successfully.
Model for NR-AR loaded successfully.
Model for SR-ARE loaded successfully.
Model for NR-Aromatase loaded successfully.
Model for NR-ER-LBD loaded successfully.
Model for NR-AhR loaded successfully.
Model for SR-MMP loaded successfully.
Model for NR-ER loaded successfully.
Model for NR-PPAR-gamma loaded successfully.
Model for SR-p53 loaded successfully.
Model for SR-ATAD5 loaded successfully.
Model for NR-AR-LBD loaded successfully.


*Output.*

In [None]:
# Example SMILES string
smiles = 'C/C=C/C=C/C(O)=O'

# Featurise the molecule with the Mol2vec embedding model. (Passing the
# classifier loop variable here would fail on a fresh kernel, because it does
# not exist yet at this point.)
features = preprocess_smiles(smiles, model_2vec)

if features is not None:
    # The models expect a 2-D array with one row per sample.
    features = features.reshape(1, -1)

    # One score per assay column, from whichever scoring method the model has.
    predictions = {}
    for col in target_columns:
        clf = models_6.get(col)
        if clf is None:
            # The loading cell only reports missing files, so a column may
            # have no model; skip it rather than raising KeyError.
            print(f"No model loaded for {col}; skipping.")
            continue

        if hasattr(clf, 'predict_proba'):
            # Second column of predict_proba for the single sample.
            prediction = clf.predict_proba(features)[0][1]
        elif hasattr(clf, 'decision_function'):
            decision = clf.decision_function(features)
            # Logistic squashing gives a probability-like score; it is not a
            # calibrated probability.
            prediction = (1 / (1 + np.exp(-decision)))[0]
        else:
            prediction = clf.predict(features)[0]

        predictions[col] = prediction

    # Printing predictions
    for col, prediction in predictions.items():
        print(f"Prediction for {col}: {prediction:.4f}")
else:
    print("Invalid SMILES string.")

Prediction for SR-HSE: 0.2713
Prediction for NR-AR: 0.2246
Prediction for SR-ARE: 0.2517
Prediction for NR-Aromatase: 0.2002
Prediction for NR-ER-LBD: 0.1382
Prediction for NR-AhR: 0.2357
Prediction for SR-MMP: 0.1548
Prediction for NR-ER: 0.2056
Prediction for NR-PPAR-gamma: 0.2304
Prediction for SR-p53: 0.1620
Prediction for SR-ATAD5: 0.1574
Prediction for NR-AR-LBD: 0.2470
