In [3]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch

In [3]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from torch import autocast

# 1. Verify GPU Availability
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Number of GPUs available:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Please check your CUDA installation.")
    exit()

# 2. Load Dataset
df = pd.read_csv('merged.csv')
smiles_list = df['sequence'].tolist()
print(f"Number of SMILES strings: {len(smiles_list)}")

# 3. Load Tokenizer and Model
tokenizer = RobertaTokenizer.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
model = RobertaModel.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')

# 4. Move Model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")
print("Model is on device:", next(model.parameters()).device)

# Define custom dataset to handle tokenization on the fly
class SMILES_Dataset(Dataset):
    def __init__(self, smiles_list, tokenizer, max_length=512):
        self.smiles_list = smiles_list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        inputs = self.tokenizer(
            smiles,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length'  # Fixed padding improves efficiency
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

# 5. Create Dataset and DataLoader
dataset = SMILES_Dataset(smiles_list, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # You might need to reduce batch size if out of memory

# 6. Initialize Embeddings List
smiles_embeddings = []

# 7. Process SMILES in Batches using DataLoader
for batch in tqdm(dataloader, desc="Processing Batches"):
    # Move inputs to GPU
    inputs = {key: val.to(device) for key, val in batch.items()}
    
    # Generate Embeddings with Mixed Precision
    with torch.no_grad():
        with autocast('cuda', enabled=(device.type == 'cuda')):
            outputs = model(**inputs)
    
    # Extract [CLS] Token Embeddings (using the first token [CLS] as representation)
    embeddings = outputs.last_hidden_state[:, 0, :]
    
    # Convert to NumPy and move to CPU
    embedding_numpy = embeddings.cpu().numpy()
    
    # Append to Embeddings List
    smiles_embeddings.extend(embedding_numpy)

# 8. Verify Embeddings Count
assert len(smiles_embeddings) == len(smiles_list), "Mismatch between embeddings and SMILES count."

# 9. Add Embeddings to DataFrame
df['embeddings'] = smiles_embeddings

# 10. Save Embeddings
df.to_csv('smiles_with_embeddings.csv')  

print("Embeddings generated and saved successfully!")


PyTorch version: 2.4.1+cu118
CUDA available: True
Number of GPUs available: 1
GPU name: NVIDIA GeForce RTX 4060 Laptop GPU
Number of SMILES strings: 27953




Using device: cuda
Model is on device: cuda:0


Processing Batches: 100%|██████████| 437/437 [03:56<00:00,  1.84it/s]


Embeddings generated and saved successfully!
