In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt

Collecting jupyter (from -r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt (line 2))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting datasets (from -r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt (line 7))
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting jupyterlab (from jupyter->-r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt (line 2))
  Downloading jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt (line 6))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r /content/drive/MyDrive/NNTI_Project_files/Project_Files/requirements.txt (line 6))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x8

In [3]:
!pip install wandb



In [4]:
import torch
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random
import wandb

In [5]:
wandb.init(project="NNTI_Project_v2")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msolankishaique[0m ([33msolankishaique-max-planck-institute-for-informatics[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
#Load Dataset

In [7]:
# specify dataset name and model name
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  #MoLFormer model

In [8]:
# load the dataset from HuggingFace
dataset = load_dataset(DATASET_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

lipophilicity.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

In [9]:
# Explore the dataset
# For example, print the column names and display a few sample rows
# TODO: your code goes here
print(dataset['train'].column_names)
print(dataset['train'].to_pandas().head())

['SMILES', 'label']
                                              SMILES  label
0            Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14   3.54
1  COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...  -1.18
2             COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl   3.69
3  OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...   3.37
4  Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...   3.10


In [17]:
# define a PyTorch Dataset class for handling SMILES strings and targets

# TODO: your code goes here
class SMILESDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.prepare_data()

    def prepare_data(self):
        processed_data = []

        for data_point in tqdm(self.dataset):
            encoded_input = self.tokenizer(
                data_point["SMILES"],
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            target_tensor = torch.tensor(data_point["label"], dtype=torch.float)
            processed_data.append({
                "input_ids": encoded_input["input_ids"].squeeze(0),
                "attention_mask": encoded_input["attention_mask"].squeeze(0),
                "target": target_tensor
            })
        return processed_data

    def get_data(self):
        return self.data


# ---Step 2 : Split Dataset---

In [19]:
# tokenize the data
# load a pre-trained tokenizer from HuggingFace
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [20]:
# split the data into training and test datasets
# TODO: your code goes here
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [None]:
# construct Pytorch data loaders for both train and test datasets
BATCH_SIZE = 16 # adjust based on memory constraints

# TODO: your code goes here
train_dataset = SMILESDataset(train_dataset, tokenizer)
test_dataset = SMILESDataset(test_dataset, tokenizer)
train_data = train_dataset.get_data()
test_data = test_dataset.get_data()

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

  0%|          | 0/3360 [00:00<?, ?it/s]

  0%|          | 0/840 [00:00<?, ?it/s]

## --- Step 3: Load Model ---

In [22]:
# load pre-trained model from HuggingFace
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

In [30]:
# We need to add a regression head on the language model as we are doing a regression task.

# specify model with a regression head

class MoLFormerWithRegressionHead(nn.Module):
    def __init__(self, base_model, hidden_dim=768):
        # Initialize the parent nn.Module class
        super(MoLFormerWithRegressionHead, self).__init__()

        self.base_model = base_model
        self.regression_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)

In [31]:
# initialize the regression model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regression_model = MoLFormerWithRegressionHead(model).to(device)

## --- Step 4: Training ---

In [32]:
# TODO: your code goes here
optimizer = torch.optim.AdamW(regression_model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()
wandb.watch(regression_model , log = "all")

EPOCHS = 10
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = regression_model(input_ids, attention_mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        wandb.log({"train_loss" : loss.item() , "epoch": epoch+1})

    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1: Loss = 1.3700128945566359


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2: Loss = 0.6568574967838469


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3: Loss = 0.4768287739938214


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4: Loss = 0.3806371597307069


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5: Loss = 0.3114273680107934


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6: Loss = 0.2701414291702566


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7: Loss = 0.22869944742747716


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8: Loss = 0.19693737370627268


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9: Loss = 0.18087897891444818


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10: Loss = 0.16420114815589928


## --- Step 5: Evaluation ---


In [33]:
# TODO: your code goes here

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

regression_model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        outputs = regression_model(input_ids, attention_mask).squeeze(1)
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(targets.cpu().numpy())

baseline_mse = mean_squared_error(actuals, predictions)
baseline_rmse = np.sqrt(baseline_mse)
baseline_mae = mean_absolute_error(actuals, predictions)
baseline_r2 = r2_score(actuals, predictions)
print(f"Test MSE: {baseline_mse}")
print(f"RMSE: {baseline_rmse}")
print(f"MAE: {baseline_mae}")
print(f"R2 Score: {baseline_r2}")
wandb.log({"test_mse": baseline_mse, "test_rmse": baseline_rmse, "test_mae": baseline_mae, "test_r2": baseline_r2})

Test MSE: 0.4156876628159542
RMSE: 0.6447384452752559
MAE: 0.48812631380832977
R2 Score: 0.704763197272469


## Saving Pre Fine-tuning model - Baseline

In [34]:
state = {"epoch" : epoch , "state_dict" : regression_model.state_dict() , "optimizer" : optimizer.state_dict()}

torch.save(state, "/content/drive/MyDrive/NNTI_Project_files/Project_Files/model_saves/task_1_v2/task_1_baseline_model.pth")

# 2.Add Unsupervised Finetuning
In this step, you will perform unsupervised fine-tuning on the training dataset. This means the model will leverage only the SMILES strings without any corresponding labels to adapt its understanding of the data distribution. By familiarizing the model with the patterns and structure of the SMILES strings, you can potentially enhance its performance on downstream supervised tasks.

For this fine-tuning, you will use the Masked Language Modeling (MLM) objective, where the model learns to predict randomly masked tokens within the input sequence. Remember to save the fine-tuned model for later use.

In [35]:
# TODO: your code goes here

wandb.init(project="NNTI_Project_v2" , name = "unsupervised_fine_tuning_session_wo_regression_head" , reinit=True)

0,1
epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
test_mae,▁
test_mse,▁
test_r2,▁
test_rmse,▁
train_loss,█▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
test_mae,0.48813
test_mse,0.41569
test_r2,0.70476
test_rmse,0.64474
train_loss,0.26665


In [None]:
class MoLFormerWithRegressionHead(nn.Module):
    def __init__(self, base_model, hidden_dim=768):
        # Initialize the parent nn.Module class
        super(MoLFormerWithRegressionHead, self).__init__()

        self.base_model = base_model
        self.regression_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)

In [37]:
model_FT_wo_reg = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

optimizer_FT_wo_reg = torch.optim.AdamW(model_FT_wo_reg.parameters(), lr=2e-5)

EPOCHS = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_FT_wo_reg.to(device)
model_FT_wo_reg.train()

for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass with MLM objective
        outputs = model_FT_wo_reg(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer_FT_wo_reg.zero_grad()
        loss.backward()
        optimizer_FT_wo_reg_tune.step()

        wandb.log({"train_loss": loss.item(), "epoch": epoch + 1})

    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1: Loss = 0.6916560940976654


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2: Loss = 0.02035269093300615


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3: Loss = 0.005733162880919519


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4: Loss = 0.0030933873722374084


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5: Loss = 0.001965833588370255


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6: Loss = 0.0013765303581576085


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7: Loss = 0.0010619717614082176


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8: Loss = 0.0007910723831238491


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9: Loss = 0.0006614127894863486


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10: Loss = 0.0005563608744913446


In [None]:
# saving the fine-tuned model
state_FT_wo_reg = {"epoch" : epoch , "state_dict" : model_FT_wo_reg.state_dict() , "optimizer" : optimizer_FT_wo_reg.state_dict()}

torch.save(state_FT_wo_reg, "/content/drive/MyDrive/NNTI_Project_files/Project_Files/model_saves/task_1_v2/task_1_FT_model_wo_reg.pth")

# 3.Fine-Tune for Comparison

---


After performing unsupervised fine-tuning on the training data, we now fine-tune the model on the regression task with the regression head. By comparing the performance of the model before and after unsupervised fine-tuning, you can evaluate how the unsupervised fine-tuning impacts the model's performance on our target task.


In [None]:
class MoLFormerWithRegressionHead(nn.Module):
    def __init__(self, base_model, hidden_dim=2362):
        # Initialize the parent nn.Module class
        super(MoLFormerWithRegressionHead, self).__init__()

        self.base_model = base_model
        self.regression_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]
        #print("Last Hidden State Shape:", last_hidden_state.shape)
        pooled_output = last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)

In [62]:
wandb.init(project="NNTI_Project_v2" , name = "Regression_with_FT_Model" , reinit=True)

FT_reg_model = MoLFormerWithRegressionHead(model_FT_wo_reg).to(device)

FT_reg_optimizer = torch.optim.AdamW(FT_reg_model.parameters(), lr=2e-5)

FT_reg_loss_fn = nn.MSELoss()

wandb.watch(FT_reg_model , log = "all")

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▃▁▁▂▂▁▁▁▁

0,1
epoch,1.0
train_loss,1.49689


In [63]:
EPOCHS = 10
for epoch in range(EPOCHS):
    FT_reg_model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device).unsqueeze(1)

        FT_reg_optimizer.zero_grad()
        outputs = FT_reg_model(input_ids, attention_mask)
        loss = FT_reg_loss_fn(outputs, targets)
        loss.backward()
        FT_reg_optimizer.step()
        total_loss += loss.item()

        wandb.log({"train_loss" : loss.item() , "epoch": epoch+1})

    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1: Loss = 1.3597664041178568


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2: Loss = 0.7998643562907264


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3: Loss = 0.5814067260140464


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4: Loss = 0.5044403340135302


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5: Loss = 0.3984703287482262


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6: Loss = 0.35066269852575804


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7: Loss = 0.30851409885854947


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8: Loss = 0.2667506538331509


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9: Loss = 0.23759157921941507


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10: Loss = 0.2148795244594415


In [64]:
# TODO: your code goes here

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

FT_reg_model.eval()
predictions_FT_reg = []
actuals = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        outputs = FT_reg_model(input_ids, attention_mask).squeeze(1)
        predictions_FT_reg.extend(outputs.cpu().numpy())
        actuals.extend(targets.cpu().numpy())

FT_reg_mse = mean_squared_error(actuals, predictions_FT_reg)
FT_reg_rmse = np.sqrt(FT_reg_mse)
FT_reg_mae = mean_absolute_error(actuals, predictions_FT_reg)
FT_reg_r2 = r2_score(actuals, predictions_FT_reg)
print(f"Test MSE FT Reg: {FT_reg_mse}")
print(f"RMSE FT Reg: {FT_reg_rmse}")
print(f"MAE FT Reg: {FT_reg_mae}")
print(f"R2 Score FT Reg: {FT_reg_r2}")
wandb.log({"Test MSE FT Reg": FT_reg_mse, "RMSE FT Reg": FT_reg_rmse, "MAE FT Reg": FT_reg_mae, "R2 Score FT Reg": FT_reg_r2})

Test MSE FT Reg: 0.4192227860410507
RMSE FT Reg: 0.6474741585893994
MAE FT Reg: 0.48995317174343483
R2 Score FT Reg: 0.7022524215829643
