In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive/drug_discovery_ml.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Environment setup. The commands below run strictly in order and several of
# them install, remove and re-install the same packages, so the versions that
# end up in the environment are decided by the LAST command touching each one.
# NOTE(review): this sequence looks like accumulated trial-and-error; consider
# collapsing it to one pinned install once a working version set is confirmed.

# Basic libraries (numpy is pinned to 1.24.3 here)
!pip install --quiet numpy==1.24.3 pandas scikit-learn

# Chemistry and drug discovery libraries
!pip install --quiet rdkit-pypi deepchem==2.8.0

# PyTorch and GNN libraries (unpinned at this point)
!pip install --quiet torch torchvision torchaudio
!pip install --quiet torch-geometric dgl pytorch-lightning dm-haiku

# TensorFlow-compatible Keras
!pip install --quiet tf-keras

# Remove unnecessary or conflicting packages.
# NOTE(review): tensorflow is removed here but requested again by the
# "-U ... tensorflow ..." command a few lines below - confirm which is intended.
!pip uninstall -y keras tensorflow keras-nightly keras-preprocessing

# Set environment variable to suppress TensorFlow warnings.
# This only changes the environment of the notebook process from here on.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Upgrade core dependencies (this may still cause conflicts).
# -U also upgrades numpy, overriding the 1.24.3 pin from the first command.
!pip install -U numpy jax jaxlib tensorflow torch torchdata torch-geometric

# Clean up old/conflicting installations first
# (this removes the torch / jax packages that were upgraded just above).
!pip uninstall -y torch torchvision torchaudio torchdata torch-geometric
!pip uninstall -y jax jaxlib

# Install compatible versions of PyTorch and JAX (explicit pins)
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 torchdata==0.6.1
!pip install jax==0.4.25 jaxlib==0.4.25

# Install PyTorch Geometric and dependencies (for CUDA 12.1 or CPU fallback).
# -f points pip at the PyG wheel index built for torch 2.1.0 + cu121.
!pip install torch-scatter torch-sparse torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

# Reinstall required libraries (final cleanup and ensure proper versions).
# numpy is pinned back to 1.24.3 here; torch is requested without a pin.
!pip install deepchem==2.8.0 rdkit-pypi numpy==1.24.3 pandas scikit-learn torch tf-keras pytorch-lightning dm-haiku

# Fix NumPy and JAX compatibility.
# --force-reinstall replaces the 1.24.3 pin from the previous command with the
# newest numpy below 2.0 (the recorded output shows 1.26.4 replacing 1.24.3).
!pip install "numpy<2.0" --force-reinstall
# NOTE(review): --upgrade may move jax/jaxlib off the 0.4.25 pin set above - confirm.
!pip install --upgrade "jax[cpu]" "jaxlib"

# (Optional) Reinstall PyTorch if needed.
# NOTE(review): unpinned, so the result depends on what is already installed - confirm
# it still matches the torch-2.1.0 wheel index used for the PyG packages.
!pip install torch torchvision torchaudio

# Reinstall PyTorch Geometric packages again to make sure they're not broken
!pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cu121.html


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.
blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.
albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is inc

Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dgl 2.1.0 requires torchdata>=0.5.0, which is not

In [None]:
def write_script(path, content):
    """Write ``content`` to the text file at ``path``, replacing any existing file.

    The file is always encoded as UTF-8 so that script text containing
    non-ASCII characters is written the same way regardless of the platform's
    default encoding. Missing parent directories are created first.

    Args:
        path: Destination file path.
        content: Complete text to store in the file.
    """
    import os  # local import so the helper works even if run before other cells

    parent = os.path.dirname(path)
    if parent:
        # A bare file name has no parent component; only create real directories.
        os.makedirs(parent, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

In [None]:
# data_prep.py
# Source text of the standalone data-preparation script; a later cell writes
# this string to disk and runs it with `python`.
data_prep_code = """
import os
import warnings

# Suppress warnings. Both settings are applied BEFORE deepchem is imported so
# they are already in effect while it (and anything it loads) is imported;
# the log-level variable has to be set before TensorFlow is first loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")

import deepchem as dc


def load_tox21():
    '''Load the Tox21 dataset featurized with circular fingerprints.

    Returns:
        A tuple (tasks, train_dataset, valid_dataset, test_dataset) taken
        from a random split of the dataset.
    '''
    # Use circular fingerprint featurization
    featurizer = dc.feat.CircularFingerprint(radius=2, size=1024)

    # Load the Tox21 dataset with a random split
    tox21_tasks, datasets, transformers = dc.molnet.load_tox21(
        featurizer=featurizer,
        splitter='random'
    )

    train_dataset, valid_dataset, test_dataset = datasets
    return tox21_tasks, train_dataset, valid_dataset, test_dataset


if __name__ == "__main__":
    tasks, train, valid, test = load_tox21()
    print(f"Train samples: {len(train)}")
    print(f"Validation samples: {len(valid)}")
    print(f"Test samples: {len(test)}")
"""


In [None]:
# Create the directory that holds the generated scripts
# (-p: create missing parents and do not fail if it already exists)
!mkdir -p "/content/drive/MyDrive/drug_discovery_ml/src"

# Save the data preparation script (the text in data_prep_code) to the specified path
write_script("/content/drive/MyDrive/drug_discovery_ml/src/data_prep.py", data_prep_code)


In [None]:
# train_baseline.py
# Source text of the Random Forest baseline script; a later cell writes this
# string to disk and runs it with `python`.
train_baseline_code = """
import os
import warnings

# Suppress warnings. Both settings are applied BEFORE deepchem is imported so
# they are already in effect while it (and anything it loads) is imported;
# the log-level variable has to be set before TensorFlow is first loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")

import deepchem as dc
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd


def train_rf_model():
    '''Train a Random Forest on the SR-HSE task of Tox21 and save predictions.

    Prints the validation ROC AUC and writes the validation labels and
    predicted probabilities to a CSV file under the results directory.
    '''
    print("Loading data...")
    # Load the Tox21 dataset using ECFP (CircularFingerprint)
    # NOTE(review): no seed is passed for the random split, so the split may
    # differ between runs and between the training scripts - confirm.
    tox21_tasks, datasets, _ = dc.molnet.load_tox21(
        featurizer='ECFP',
        splitter='random',
        reload=False
    )

    train_dataset, valid_dataset, test_dataset = datasets

    # Select a sample task (e.g., SR-HSE)
    task_index = tox21_tasks.index("SR-HSE")
    X_train, y_train = train_dataset.X, train_dataset.y[:, task_index]
    X_valid, y_valid = valid_dataset.X, valid_dataset.y[:, task_index]

    # Remove NaNs (rows whose label for this task is missing)
    mask_train = ~np.isnan(y_train)
    mask_valid = ~np.isnan(y_valid)

    X_train, y_train = X_train[mask_train], y_train[mask_train]
    X_valid, y_valid = X_valid[mask_valid], y_valid[mask_valid]

    print(f"Number of training samples: {len(X_train)}")
    print(f"Number of validation samples: {len(X_valid)}")

    # Random Forest model
    print("Training model...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Calculate AUC score from the predicted probability of the positive class
    y_pred = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)
    print(f"Validation AUC: {auc:.3f}")

    # Save results
    results_df = pd.DataFrame({
        "y_true": y_valid,
        "y_pred": y_pred
    })

    results_path = "/content/drive/MyDrive/drug_discovery_ml/results/sr_hse_rf.csv"
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path, index=False)
    print(f"Results saved: {results_path}")


if __name__ == "__main__":
    train_rf_model()
"""


In [None]:
# Make sure the results directory exists (-p: no error if it is already there),
# then write the baseline training script source to the src directory.
!mkdir -p "/content/drive/MyDrive/drug_discovery_ml/results"
write_script("/content/drive/MyDrive/drug_discovery_ml/src/train_baseline.py", train_baseline_code)

In [None]:
# molecule_analysis.py
# Source text of the descriptor-extraction script; a later cell writes this
# string to disk and runs it with `python`.
mol_analysis_code = """
import os

import deepchem as dc
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np


def extract_descriptors(smiles_list):
    '''Compute every RDKit descriptor in Descriptors._descList for each SMILES.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        A DataFrame with one row per input string and one column per
        descriptor. A string RDKit cannot parse yields a row of NaN, and a
        descriptor whose calculation raises yields NaN in that cell.
    '''
    # _descList holds (name, function) pairs; use the functions directly
    # instead of looking each one up again by name.
    descriptor_names = [name for name, _ in Descriptors._descList]
    rows = []

    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparseable SMILES: keep the row so the output stays aligned with the input
            rows.append([np.nan] * len(descriptor_names))
            continue

        values = []
        for _, desc_func in Descriptors._descList:
            try:
                values.append(desc_func(mol))
            except Exception:
                # Best effort: one failing descriptor must not drop the molecule.
                # Exception (not a bare except) so Ctrl-C still interrupts the run.
                values.append(np.nan)
        rows.append(values)

    return pd.DataFrame(rows, columns=descriptor_names)


def main():
    '''Extract descriptors for the Tox21 training split and save them as CSV.'''
    tox21_tasks, datasets, _ = dc.molnet.load_tox21(featurizer='ECFP')
    train_dataset, _, _ = datasets
    smiles = train_dataset.ids

    print(f"Extracting descriptors from {len(smiles)} molecules...")
    df = extract_descriptors(smiles)
    df["smiles"] = smiles

    output_path = "/content/drive/MyDrive/drug_discovery_ml/results/tox21_descriptors.csv"
    # Create the results directory here so the script also works when run on its own
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Descriptor dataset saved to {output_path}")


if __name__ == "__main__":
    main()
"""

In [None]:
# Write the descriptor-extraction script source (mol_analysis_code) to the src directory
write_script("/content/drive/MyDrive/drug_discovery_ml/src/molecule_analysis.py", mol_analysis_code)

In [None]:
print("Running data preparation...")
!python /content/drive/MyDrive/drug_discovery_ml/src/data_prep.py

print("\\nAnalyzing molecular descriptors...")
!python /content/drive/MyDrive/drug_discovery_ml/src/molecule_analysis.py

print("\\nTraining Random Forest baseline...")
!python /content/drive/MyDrive/drug_discovery_ml/src/train_baseline.py

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[14:29:49] Explicit valence for atom # 4 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[14:29:50] Explicit valence for atom # 9 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[14:29:50] Explicit valence for atom # 5 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIso

In [None]:
# Source text of the dense neural-network training script; a later cell
# writes this string to disk and runs it with `python`.
deep_model_code = """
import os
import warnings

# Suppress warnings. Both settings are applied BEFORE deepchem and TensorFlow
# are imported: the log-level variable has to be set before TensorFlow is
# first loaded, and the warnings filter only affects warnings raised after it.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")

import deepchem as dc
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import pandas as pd


def train_deep_model():
    '''Train a small dense network on the SR-HSE task of Tox21 and save predictions.

    Prints the validation ROC AUC and writes the validation labels and
    predicted probabilities to a CSV file under the results directory.
    '''
    print("Loading data...")
    tox21_tasks, datasets, _ = dc.molnet.load_tox21(
        featurizer='ECFP',
        splitter='random',
        reload=False
    )

    train_dataset, valid_dataset, test_dataset = datasets

    # Index for the SR-HSE task
    task_index = tox21_tasks.index("SR-HSE")
    X_train, y_train = train_dataset.X, train_dataset.y[:, task_index]
    X_valid, y_valid = valid_dataset.X, valid_dataset.y[:, task_index]

    # Remove NaNs (rows whose label for this task is missing)
    mask_train = ~np.isnan(y_train)
    mask_valid = ~np.isnan(y_valid)

    X_train, y_train = X_train[mask_train], y_train[mask_train]
    X_valid, y_valid = X_valid[mask_valid], y_valid[mask_valid]

    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_valid.shape}")

    # Build the model: two hidden layers with dropout, sigmoid output for one binary task
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC()])

    # Train the model
    print("Training model...")
    model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
              epochs=20, batch_size=64, verbose=1)

    # Predict and calculate AUC
    y_pred = model.predict(X_valid).flatten()
    auc = roc_auc_score(y_valid, y_pred)
    print(f"Validation AUC: {auc:.3f}")

    # Save the results
    results_df = pd.DataFrame({
        "y_true": y_valid,
        "y_pred": y_pred
    })

    results_path = "/content/drive/MyDrive/drug_discovery_ml/results/sr_hse_deep.csv"
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path, index=False)
    print(f"Results saved to: {results_path}")


if __name__ == "__main__":
    train_deep_model()
"""


In [None]:
# Write the dense-network training script source (deep_model_code) to the src directory
write_script("/content/drive/MyDrive/drug_discovery_ml/src/train_deep_model.py", deep_model_code)

In [None]:
print("\\nDeep learning model training...")
!python /content/drive/MyDrive/drug_discovery_ml/src/train_deep_model.py

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[14:30:29] Explicit valence for atom # 4 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[14:30:31] Explicit valence for atom # 9 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[14:30:31] Explicit valence for atom # 5 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIso

In [None]:
# Source text of the graph-attention-network training script; the call after
# this string writes it to disk, and a later cell runs it with `python`.
gnn_model_code = """
import os
import warnings

# Suppress warnings. Both settings are applied BEFORE the heavy imports below
# so they are already in effect while those libraries load; the log-level
# variable has to be set before TensorFlow is first loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")

import deepchem as dc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
# DataLoader lives in torch_geometric.loader; the torch_geometric.data alias is deprecated
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd


# 1. GAT-based GNN model
class GAT(torch.nn.Module):
    '''Two graph-attention layers followed by mean pooling and a linear head.

    The forward pass returns one sigmoid probability per graph in the batch.
    '''

    def __init__(self, num_node_features):
        super(GAT, self).__init__()
        # 4 concatenated heads of 64 channels -> 64 * 4 input features for the next layer
        self.gat1 = GATConv(num_node_features, 64, heads=4, concat=True, dropout=0.2)
        self.gat2 = GATConv(64 * 4, 128, heads=1, concat=True, dropout=0.2)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(128, 1)

    def forward(self, x, edge_index, batch):
        x = F.elu(self.gat1(x, edge_index))
        x = F.elu(self.gat2(x, edge_index))
        x = self.dropout(x)
        x = global_mean_pool(x, batch)  # Pooling per molecule
        return torch.sigmoid(self.linear(x)).view(-1)


# 2. SMILES -> Graph conversion
def mol_to_graph(mol, label):
    '''Convert an RDKit molecule into a torch_geometric Data object.

    Args:
        mol: RDKit molecule, or None if parsing failed.
        label: Numeric target stored as the graph label.

    Returns:
        A Data object whose node feature is the atomic number and whose edges
        come from the adjacency matrix, or None for a missing or empty molecule.
    '''
    from rdkit.Chem import rdmolops

    if mol is None or mol.GetNumAtoms() == 0:
        return None

    node_feats = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    node_feats = torch.tensor(node_feats, dtype=torch.float)

    adjacency = np.asarray(rdmolops.GetAdjacencyMatrix(mol))
    # nonzero() returns a (rows, cols) tuple; stack it into one (2, num_edges)
    # array first, because building a tensor from a tuple of arrays is slow.
    edge_index = torch.tensor(np.vstack(adjacency.nonzero()), dtype=torch.long)

    return Data(x=node_feats, edge_index=edge_index, y=torch.tensor([label], dtype=torch.float))


# 3. Prepare data
def load_graph_data(task_name="SR-HSE"):
    '''Build graph lists for the train and validation splits of one Tox21 task.

    Samples with a NaN label for the task, and SMILES that cannot be turned
    into a graph, are skipped.

    Returns:
        (train_graphs, valid_graphs) as lists of Data objects.
    '''
    print("Loading data...")
    tox21_tasks, datasets, _ = dc.molnet.load_tox21(featurizer="Raw", splitter="random")
    train_dataset, valid_dataset, _ = datasets
    task_index = tox21_tasks.index(task_name)

    from rdkit import Chem
    train_graphs, valid_graphs = [], []

    for dataset, container in [(train_dataset, train_graphs), (valid_dataset, valid_graphs)]:
        for i in range(len(dataset)):
            smi = dataset.ids[i]
            label = dataset.y[i][task_index]
            if np.isnan(label):
                continue
            mol = Chem.MolFromSmiles(smi)
            graph = mol_to_graph(mol, label)
            if graph is not None:
                container.append(graph)

    return train_graphs, valid_graphs


# 4. Training
def train():
    '''Train the GAT for 20 epochs, report validation ROC AUC and save predictions.'''
    train_graphs, valid_graphs = load_graph_data()

    train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_graphs, batch_size=64)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GAT(num_node_features=1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
    loss_fn = nn.BCELoss()

    print("Training started...")
    model.train()
    for epoch in range(1, 21):
        # Loss is summed over the batches of the epoch (not averaged)
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index, batch.batch)
            loss = loss_fn(out, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch:02d} | Loss: {total_loss:.4f}")

    # Evaluation
    model.eval()
    y_trues, y_preds = [], []
    with torch.no_grad():
        for batch in valid_loader:
            batch = batch.to(device)
            out = model(batch.x, batch.edge_index, batch.batch)
            y_trues.extend(batch.y.view(-1).cpu().numpy())
            y_preds.extend(out.cpu().numpy())

    auc = roc_auc_score(y_trues, y_preds)
    print(f"\\nValidation AUC: {auc:.4f}")

    # Save results
    results_path = "/content/drive/MyDrive/drug_discovery_ml/results/sr_hse_gat.csv"
    # Create the results directory here so the script also works when run on its own
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    pd.DataFrame({"y_true": y_trues, "y_pred": y_preds}).to_csv(results_path, index=False)
    print(f"Results saved: {results_path}")


if __name__ == "__main__":
    train()
"""

# Save to file: write the GNN training script source (gnn_model_code) to the src directory
write_script("/content/drive/MyDrive/drug_discovery_ml/src/train_gnn_model.py", gnn_model_code)


In [None]:
print("\\nDeep learning advance model training...")
!python /content/drive/MyDrive/drug_discovery_ml/src/train_gnn_model.py


\nDeep learning advance model training...
No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
2025-05-19 14:33:03.388894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-19 14:33:03.415426: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-19 14:33:03.422620: E external/local_xla/xla/st