<a href="https://colab.research.google.com/github/Ishola-github/InfernoRDN/blob/master/TOX_AM_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Step 1: Install RDKit (for Google Colab)
!pip install rdkit-pypi

# Step 2: Import Libraries
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

# Step 3: Example Single SMILES (Aspirin)
smiles = "CC(=O)Oc1ccccc1C(=O)O"
mol = Chem.MolFromSmiles(smiles)

if mol is not None:
    # Generate ECFP4 fingerprints (2048 bits)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print(f"ECFP4 Fingerprint for {smiles} (first 50 bits): {list(fp.ToBitString())[:50]}")

    # Calculate common molecular descriptors
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    print(f"\nMolecular Weight: {mw:.2f}")
    print(f"LogP: {logp:.2f}")
    print(f"Hydrogen Bond Donors: {h_donors}")
    print(f"Hydrogen Bond Acceptors: {h_acceptors}")
else:
    print(f"Could not process SMILES: {smiles}")

# Step 4: Process a list of SMILES strings
smiles_list = [
    "CC(=O)Oc1ccccc1C(=O)O",  # Aspirin
    "CN1CCCC1C2=CN=CC=C2",    # Nicotine
    "CC(=O)NC1=CC=C(C=C1)O",  # Paracetamol
    "C1=CC=C2C(=C1)C=CC(=C2)O" # Naphthol
]

# Initialize lists
fingerprints = []
molecular_weights = []

# Generate features
for s in smiles_list:
    mol = Chem.MolFromSmiles(s)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        fingerprints.append(fp)
        molecular_weights.append(Descriptors.MolWt(mol))
    else:
        fingerprints.append(None)
        molecular_weights.append(None)

print(f"\nProcessed {len(smiles_list)} molecules.")
print(f"Molecular Weights: {molecular_weights}")


ECFP4 Fingerprint for CC(=O)Oc1ccccc1C(=O)O (first 50 bits): ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

Molecular Weight: 180.16
LogP: 1.31
Hydrogen Bond Donors: 1
Hydrogen Bond Acceptors: 3

Processed 4 molecules.
Molecular Weights: [180.15899999999996, 162.23600000000002, 151.165, 144.17299999999997]


In [24]:
import numpy as np
import pandas as pd

# Convert RDKit BitVect to numpy array
fp_array = [np.array(fp) if fp is not None else np.zeros(1024) for fp in fingerprints]

# Create DataFrame
df = pd.DataFrame(fp_array)
df['MolWt'] = molecular_weights

# Display
print("\nFeature matrix preview:")
print(df.head())



Feature matrix preview:
   0  1  2  3  4  5  6  7  8  9  ...  1015  1016  1017  1018  1019  1020  \
0  0  0  0  0  0  0  0  0  0  0  ...     0     0     1     0     0     0   
1  0  0  0  0  1  0  0  0  0  0  ...     0     0     0     0     1     0   
2  0  0  0  0  0  0  0  0  0  0  ...     0     0     1     0     0     0   
3  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

   1021  1022  1023    MolWt  
0     0     0     0  180.159  
1     0     0     0  162.236  
2     0     0     0  151.165  
3     0     0     0  144.173  

[4 rows x 1025 columns]


In [25]:
# Step-2, Import the required library
import deepchem as dc
import numpy as np
from deepchem.models import MultitaskClassifier
from deepchem.metrics import Metric
from deepchem.feat import CircularFingerprint

# Load dataset with ECFP fingerprints
tox21_tasks, datasets, transformers = dc.molnet.load_tox21(
    featurizer='ECFP', splitter='scaffold'
)
train_dataset, valid_dataset, test_dataset = datasets

# Define MultitaskClassifier using ECFP features
model = MultitaskClassifier(
    n_tasks=len(tox21_tasks),
    n_features=train_dataset.X.shape[1],  # Use feature vector shape
    layer_sizes=[1024, 512],  # two hidden layers
    dropouts=[0.25, 0.25]
)

# Train and evaluate
model.fit(train_dataset, nb_epoch=10)
metric = Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")
print("Valid scores:", model.evaluate(valid_dataset, [metric], transformers))


Valid scores: {'mean-roc_auc_score': np.float64(0.7079537341013312)}


In [26]:
# Step-3, Define metrics
from deepchem.metrics import Metric
from sklearn.metrics import roc_auc_score

metric = Metric(roc_auc_score, np.mean, mode="classification")

# Evaluate model
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

# Output results
print("Train scores:", train_scores)
print("Validation scores:", valid_scores)
print("Test scores:", test_scores)


Train scores: {'mean-roc_auc_score': np.float64(0.9416483907984454)}
Validation scores: {'mean-roc_auc_score': np.float64(0.7079537341013312)}
Test scores: {'mean-roc_auc_score': np.float64(0.6870289428490675)}


In [27]:
# Step-4, Install PyTorch in Google Colab (usually pre-installed or easily installed)
# !pip install torch torchvision torchaudio

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# --- Generate some synthetic data for demonstration ---
num_compounds = 1000
num_features = 200 # Represents molecular descriptors/fingerprints

X = np.random.rand(num_compounds, num_features).astype(np.float32)
y = np.random.randint(0, 2, num_compounds).astype(np.float32).reshape(-1, 1) # Ensure y is 2D

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the Neural Network model
class ToxicityPredictor(nn.Module):
    def __init__(self, input_size):
        super(ToxicityPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(128, 1) # Output layer for binary classification
        self.sigmoid = nn.Sigmoid() # Sigmoid for binary classification output

    def forward(self, x):
        x = self.dropout1(self.relu1(self.fc1(x)))
        x = self.dropout2(self.relu2(self.fc2(x)))
        x = self.sigmoid(self.fc3(x))
        return x

# Instantiate the model
model = ToxicityPredictor(num_features)

# Define loss function and optimizer
criterion = nn.BCELoss() # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 20
print("Training the PyTorch model...")
for epoch in range(num_epochs):
    model.train() # Set model to training mode
    for inputs, labels in train_loader:
        optimizer.zero_grad() # Zero the gradients
        outputs = model(inputs) # Forward pass
        loss = criterion(outputs, labels) # Calculate loss
        loss.backward() # Backward pass
        optimizer.step() # Update weights

    print(f"Epoch, Loss: {loss.item():.4f}")

# Evaluate the model
model.eval() # Set model to evaluation mode
with torch.no_grad(): # Disable gradient calculation during evaluation
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float() # Convert probabilities to binary
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"\nTest Accuracy: {accuracy:.4f}")

    # Make sample predictions
    sample_inputs = X_test_tensor
    sample_outputs = model(sample_inputs)
    sample_predictions_prob = sample_outputs.flatten().numpy()
    sample_predictions_binary = (sample_outputs > 0.5).int().flatten().numpy()

    print(f"\nSample inputs (first 5 features): {sample_inputs.numpy()}")
    print(f"Sample predictions (probabilities): {sample_predictions_prob}")
    print(f"Sample predictions (binary): {sample_predictions_binary}")
    print(f"True labels for sample: {y_test_tensor.flatten().numpy()}")


Training the PyTorch model...
Epoch, Loss: 0.6851
Epoch, Loss: 0.6317
Epoch, Loss: 0.5432
Epoch, Loss: 0.3144
Epoch, Loss: 0.3096
Epoch, Loss: 0.0934
Epoch, Loss: 0.1432
Epoch, Loss: 0.0608
Epoch, Loss: 0.0219
Epoch, Loss: 0.0343
Epoch, Loss: 0.1004
Epoch, Loss: 0.0452
Epoch, Loss: 0.0544
Epoch, Loss: 0.0059
Epoch, Loss: 0.0125
Epoch, Loss: 0.0108
Epoch, Loss: 0.0026
Epoch, Loss: 0.0042
Epoch, Loss: 0.0062
Epoch, Loss: 0.0336

Test Accuracy: 0.5150

Sample inputs (first 5 features): [[-0.7930219  -0.5234613  -0.05251822 ... -0.38024768 -0.14027023
  -1.0376816 ]
 [ 0.8457245  -0.37734002  0.0726507  ... -1.7467203   1.4429984
   1.6335025 ]
 [-0.9890757   0.7904778  -1.2014533  ... -0.39247754 -0.15060821
   1.0188866 ]
 ...
 [ 0.55854815 -1.2704556  -1.695796   ... -0.84608126 -1.2264547
  -0.98187596]
 [-1.8054985  -0.6797335   1.2470664  ... -0.19528732  0.4276243
  -1.4055729 ]
 [ 1.6152296   1.5446903   1.5264565  ...  1.3151361  -0.915636
  -1.3208405 ]]
Sample predictions (proba