<a href="https://colab.research.google.com/github/JPChem22/Molecular-Property-Prediction-with-LLMs/blob/main/MolecPropPredWithLLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2024.9.4


In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import torch
import numpy as np
import deepchem as dc

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


#Option 1: Load and Preprocess Data

In [None]:
# Example using a CSV with 'SMILES' and 'Property' columns
try:
    df = pd.read_csv("your_data.csv")  # Replace with your data file
except FileNotFoundError:
    print("Error: your_data.csv not found. Please provide a valid CSV file.")
    exit()

In [None]:
# Handle missing values (important!)
df = df.dropna(subset=['SMILES', 'Property'])

smiles = df['SMILES'].tolist()
properties = df['Property'].tolist()

#Option 2: Loading the MoleculeNet Database

In [9]:
# Load Delaney dataset from MoleculeNet
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='Raw', splitter='random') #Use raw featurizer to get SMILES
train_dataset, valid_dataset, test_dataset = datasets

In [10]:
print(train_dataset.ids[:5]) #Prints the first 5 SMILES
print(train_dataset.y[:5]) #Prints the first 5 properties

['CC(C)(C)CO' 'CN(C(=O)COc1nc2ccccc2s1)c3ccccc3'
 'c1ccc2cc3c4cccc5cccc(c3cc2c1)c45' 'CCN2c1ccccc1N(C)C(=O)c3cccnc23 '
 'Oc1cccc(Cl)c1Cl']
[[ 1.2680994 ]
 [-0.87409083]
 [-2.60632916]
 [-0.13225031]
 [ 0.83707521]]


In [11]:
# Create a Pandas DataFrame
train_df = pd.DataFrame({'SMILES': train_dataset.ids, 'Property': train_dataset.y.flatten()})

In [12]:
#Now use train_df as your dataframe in the previous code
smiles = train_df['SMILES'].tolist()
properties = train_df['Property'].tolist()

#Tokenization

In [13]:
model_name = "seyonec/ChemBERTa-zinc-base-v1" # A good starting point
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoded_input = tokenizer(smiles, padding=True, truncation=True, return_tensors='pt')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

#Data Splitting

In [14]:
train_smiles, test_smiles, train_properties, test_properties = train_test_split(
    encoded_input['input_ids'], properties, test_size=0.2, random_state=42
)
train_attention_mask, test_attention_mask, _, _ = train_test_split(
    encoded_input['attention_mask'], properties, test_size=0.2, random_state=42
)

In [15]:
#Convert to tensors if they aren't already
train_properties = torch.tensor(train_properties, dtype=torch.float32).unsqueeze(1)
test_properties = torch.tensor(test_properties, dtype=torch.float32).unsqueeze(1)

#Model Definition and Training

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1) # Regression task

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5) # Adjust learning rate as needed
num_epochs = 3 # Adjust number of epochs as needed
batch_size = 32 # Adjust batch size based on your GPU memory


for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(train_smiles), batch_size):
        batch_inputs = train_smiles[i:i + batch_size].to(device)
        batch_masks = train_attention_mask[i:i + batch_size].to(device)
        batch_labels = train_properties[i:i + batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#Evaluation

In [17]:
model.eval()
with torch.no_grad():
    test_predictions = []
    for i in range(0, len(test_smiles), batch_size):
        batch_inputs = test_smiles[i:i+batch_size].to(device)
        batch_masks = test_attention_mask[i:i+batch_size].to(device)
        batch_outputs = model(batch_inputs, attention_mask = batch_masks)
        test_predictions.extend(batch_outputs.logits.cpu().numpy().flatten())

test_predictions = torch.tensor(test_predictions)

mse = mean_squared_error(test_properties.cpu().numpy().flatten(), test_predictions.numpy())
r2 = r2_score(test_properties.cpu().numpy().flatten(), test_predictions.numpy())

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.3010728359222412
R-squared: 0.7274905443191528


#Saving the Model

In [18]:
model.save_pretrained("my_molecular_property_model")
tokenizer.save_pretrained("my_molecular_property_model")

print("Model saved!")

Model saved!


#Inputting My Own SMILES

In [19]:
def predict_property(smiles_string):
    """
    Predicts the property of a given SMILES string using the trained model.

    Args:
        smiles_string (str): The SMILES string of the molecule.

    Returns:
        float: The predicted property value.
    """
    inputs = tokenizer(smiles_string, return_tensors="pt").to(device)
    with torch.no_grad():  # Deactivate autograd for inference
        outputs = model(**inputs)
        prediction = outputs.logits.item()
    return prediction

In [20]:
# Example usage:
new_smiles = "CCOC(=O)C"  # Ethyl acetate
predicted_property = predict_property(new_smiles)
print(f"Predicted property for {new_smiles}: {predicted_property}")

Predicted property for CCOC(=O)C: 0.8890921473503113
