<a href="https://colab.research.google.com/github/Ishrak-DataScience/AIDrugDiscovery/blob/main/ISHRAK_NMG2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets rdkit pandas

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


# NMG_ISHRAK_Cleaned.py

Refactored for clarity, efficiency, and removal of redundant training steps.

In [2]:

import os
import torch
import pandas as pd
import time
from tqdm import tqdm

# Third-party imports
# Ensure these are installed: pip install torch transformers datasets rdkit pandas
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    pipeline,
    set_seed
)
from datasets import load_dataset
from rdkit import Chem
from rdkit.Chem import Descriptors

# Choose the compute device: "cuda" when PyTorch reports a usable GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

# Fix the seed (42) through transformers' set_seed before any training or sampling runs.
set_seed(42)

🚀 Using device: cuda


In [3]:
# ==========================================
# 1. Configuration & Data Loading
# ==========================================
# Paths and identifiers read by the later cells.
INPUT_FILENAME = "Components-smiles-cactvs.smi"  # Change this to your specific file path
CLEAN_DATA_FILE = "smiles.txt"                   # cleaned SMILES written by the processing cell
MODEL_NAME = "gpt2"                              # pretrained checkpoint to fine-tune
OUTPUT_DIR = "./smiles_gpt2"                     # where the fine-tuned model is saved

# Optional: specific for Google Colab file uploading.
# Only triggered when the input file is not already on disk.
if not os.path.exists(INPUT_FILENAME):
    try:
        # Keep the try narrow: only the import decides whether we are in Colab.
        from google.colab import files
    except ImportError:
        print(f"⚠️ File {INPUT_FILENAME} not found and not in Colab.")
    else:
        print("Upload your SMILES file:")
        uploaded = files.upload()
        if uploaded:
            # Use the first uploaded file name as the input path.
            INPUT_FILENAME = next(iter(uploaded))
        else:
            # A cancelled/empty upload used to crash with IndexError here;
            # report it and let the processing cell handle the missing file.
            print(f"⚠️ No file was uploaded; {INPUT_FILENAME} is still missing.")

Upload your SMILES file:


Saving Components-smiles-cactvs.smi to Components-smiles-cactvs.smi


In [4]:
# ==========================================
# 2. Data Processing (Clean & Validate)
# ==========================================
print(f"\n🧪 Processing {INPUT_FILENAME}...")

# Read the raw file. Blank lines and lines starting with "#" are skipped;
# the first whitespace-separated column of every other line is taken as the SMILES.
smiles_list = []
try:
    with open(INPUT_FILENAME, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            smiles_list.append(line.split()[0])
except FileNotFoundError:
    # open() failed before anything was read, so smiles_list is still empty
    # and the ValueError at the end of this cell stops the run.
    print("❌ Input file not found. Please check the path.")

print(f"Total lines read: {len(smiles_list)}")

# Validate with RDKit, parsing each distinct string at most once.
# NOTE: uniqueness is by exact string; two different spellings of the same
# molecule are both kept.
valid_smiles = []
seen = set()

print("Validating SMILES with RDKit...")
for s in tqdm(smiles_list):
    if s in seen:
        continue
    # Mark the string as seen *before* parsing so a repeated invalid entry is
    # not parsed (and its RDKit error logged) again.
    seen.add(s)
    if Chem.MolFromSmiles(s) is not None:
        valid_smiles.append(s)

print(f"✅ Kept {len(valid_smiles)} valid/unique SMILES out of {len(smiles_list)}.")

# Save processed data for the model (one SMILES per line, no header/index).
if not valid_smiles:
    raise ValueError("No valid SMILES found. Aborting.")
pd.Series(valid_smiles).to_csv(CLEAN_DATA_FILE, index=False, header=False)
print(f"📁 Saved training data to {CLEAN_DATA_FILE}")


🧪 Processing Components-smiles-cactvs.smi...
Total lines read: 48632
Validating SMILES with RDKit...


  0%|          | 0/48632 [00:00<?, ?it/s][16:31:19] SMILES Parse Error: syntax error while parsing: [Ru]|1|2|3|4|5|6|7|8(|[CH]9[C]|1=[C]|2[C]|3=[C]|49)|[C]%10=[C]|5[C]|6([C]|7=[C]|8%10)c%11cn(nn%11)c%12ccc(cc%12)[S](N)(=O)=O
[16:31:19] SMILES Parse Error: check for mistakes around position 5:
[16:31:19] [Ru]|1|2|3|4|5|6|7|8(|[CH]9[C]|1=[C]|2[C]
[16:31:19] ~~~~^
[16:31:19] SMILES Parse Error: Failed parsing SMILES '[Ru]|1|2|3|4|5|6|7|8(|[CH]9[C]|1=[C]|2[C]|3=[C]|49)|[C]%10=[C]|5[C]|6([C]|7=[C]|8%10)c%11cn(nn%11)c%12ccc(cc%12)[S](N)(=O)=O' for input: '[Ru]|1|2|3|4|5|6|7|8(|[CH]9[C]|1=[C]|2[C]|3=[C]|49)|[C]%10=[C]|5[C]|6([C]|7=[C]|8%10)c%11cn(nn%11)c%12ccc(cc%12)[S](N)(=O)=O'
  1%|          | 272/48632 [00:00<00:17, 2703.80it/s][16:31:19] Explicit valence for atom # 23 Be, 4, is greater than permitted
  1%|          | 543/48632 [00:00<00:19, 2493.54it/s][16:31:19] SMILES Parse Error: syntax error while parsing: O=C1NC(=O)c2cc3c([Ru]|4|5(|S6CCS|4CCS|5CC6)(N=C=S)|n7ccccc37)cc12
[16:31:19] S

✅ Kept 45757 valid/unique SMILES out of 48632.
📁 Saved training data to smiles.txt


In [5]:
# ==========================================
# 3. Model & Tokenizer Setup
# ==========================================
print(f"\n🤖 Loading {MODEL_NAME} model and tokenizer...")

# Tokenizer: reuse the end-of-sequence token as the padding token
# (the GPT-2 tokenizer needs a pad token assigned before batches can be padded).
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Model: load the pretrained weights and move them to the selected device.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)
# Final expression of the cell, so the notebook displays the module summary.
model



🤖 Loading gpt2 model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# ==========================================
# 5. Dataset Preparation
# ==========================================
print("\n📚 Preparing dataset...")


def tokenization_function(batch):
    """Tokenize the "text" column of a batch, truncating to at most 1024 tokens."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=1024)


# Load the cleaned file with the "text" builder as the "train" split.
train_files = {"train": CLEAN_DATA_FILE}
dataset = load_dataset("text", data_files=train_files)

# Swap the raw "text" column for the tokenizer's output.
tokenized_datasets = dataset.map(
    tokenization_function,
    batched=True,
    remove_columns=["text"],
)

# mlm=False selects the causal (non-masked) language-modeling collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



📚 Preparing dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/45757 [00:00<?, ? examples/s]

In [None]:
# ==========================================
# 6. Training
# ==========================================
# Mixed precision is requested only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # Output location
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # Optimisation schedule
    num_train_epochs=1,              # Increase to 3-5 for better results
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batching: 4 samples per device, accumulated over 4 steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # Simulates larger batch size
    fp16=use_fp16,
    # Logging and checkpointing
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

print("\n🏋️ Starting training...")
trainer.train()

# Write the final model and the tokenizer into the same directory so the
# generation cell can load both from OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Fine-tuning complete! Model saved to {OUTPUT_DIR}")


🏋️ Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.2021
100,1.5617
150,1.3952
200,1.3216
250,1.2603
300,1.2504
350,1.2282


In [None]:
# ==========================================
# 7. Generation & Validation
# ==========================================
print("\n⚗️ Generating new molecules...")

# Pipeline device index: 0 (first GPU) when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    pipeline_device = 0
else:
    pipeline_device = -1

# Build a text-generation pipeline from the fine-tuned checkpoint directory.
gen_pipeline = pipeline(
    "text-generation",
    model=OUTPUT_DIR,
    tokenizer=OUTPUT_DIR,
    device=pipeline_device,
)

seed_text = "C"  # Starting atom

# Sampling settings: 50 sequences, top-k sampling, total length capped at 60.
sampling_options = dict(
    max_length=60,
    num_return_sequences=50,  # Generate 50 molecules
    do_sample=True,
    top_k=50,
)
generated_outputs = gen_pipeline(seed_text, **sampling_options)

In [None]:

# Validate generated output: keep only the candidates RDKit can parse.
valid_generated = []
print("Validating generated molecules...")

for output in generated_outputs:
    # The candidate SMILES is the first whitespace-delimited token of the text.
    tokens = output["generated_text"].split()
    if not tokens:
        # Whitespace-only generation: nothing to validate. Indexing [0] here
        # would raise IndexError.
        continue
    smi = tokens[0]

    # Explicit None check, matching the validation of the training data.
    if Chem.MolFromSmiles(smi) is not None:
        valid_generated.append(smi)


In [None]:
# Remove duplicates, keeping the first occurrence of each SMILES in order.
unique_generated = []
already_kept = set()
for smi in valid_generated:
    if smi not in already_kept:
        already_kept.add(smi)
        unique_generated.append(smi)
valid_generated = unique_generated

print(f"✨ Generated {len(valid_generated)} valid unique SMILES.")

In [None]:
# Save results: one-column CSV ("SMILES" header, no index) with a timestamped name
# so repeated runs do not overwrite each other.
timestamp = time.strftime("%Y%m%d_%H%M%S")
output_csv = f"generated_valid_{timestamp}.csv"

generated_series = pd.Series(valid_generated, name="SMILES")
generated_series.to_csv(output_csv, index=False)

print(f"💾 Saved generated molecules to {output_csv}")