In [6]:
# Load model directly
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
model = RobertaForSequenceClassification.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import pandas as pd

path = "../data/CycPeptMPDB_Peptide_All.csv"
data = pd.read_csv(path, low_memory=False)

data = data.drop_duplicates(subset=['Structurally_Unique_ID'])
data = data[data['Permeability'] != -10]

target = data['Permeability']
smiles = data['SMILES']

In [8]:
def tokenize_smiles(smiles):
    """Tokenize SMILES strings using the ChemBERTa tokenizer."""
    return tokenizer(smiles, padding=True, truncation=True, return_tensors="pt")

features = tokenize_smiles(smiles.tolist())
print("Tokenization complete. Number of samples:", len(features['input_ids']))

Tokenization complete. Number of samples: 7718


In [9]:
from tqdm.notebook import tqdm

import torch

def get_embeddings(features, batch_size=64):
    model.eval()
    all_embeddings = []
    input_ids = features['input_ids']
    attention_mask = features['attention_mask']
    num_samples = input_ids.shape[0]
    for i in tqdm(range(0, num_samples, batch_size), desc="Embedding batches"):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]
        with torch.no_grad():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                output_hidden_states=True
            )
            last_hidden = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_dim)
            mask = batch_attention_mask.unsqueeze(-1).expand(last_hidden.size())
            summed = (last_hidden * mask).sum(1)
            counts = mask.sum(1)
            embeddings = summed / counts
            all_embeddings.append(embeddings)
    return torch.cat(all_embeddings, dim=0)

embeddings = get_embeddings(features)
print("Embeddings shape:", embeddings.shape)

Embedding batches:   0%|          | 0/121 [00:00<?, ?it/s]

Embeddings shape: torch.Size([7718, 384])


In [10]:
import torch.nn as nn

# Use embeddings directly as they are already 2D (num_samples, hidden_dim)
cls_embeddings = embeddings

# Example: move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cls_embeddings = cls_embeddings.to(device)
target_tensor = torch.tensor(target.values, dtype=torch.float32).view(-1, 1).to(device)

class Regressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)

In [11]:
from torch.utils.data import TensorDataset, DataLoader

# Use cls_embeddings and target_tensor as X and y
X = cls_embeddings
y = target_tensor

# Initialize the model, loss function, and optimizer
model = Regressor(X.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Create DataLoader for batching
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Training loop with early stopping
epochs = 1000
patience = 10
best_loss = float('inf')
epochs_no_improve = 0

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_X.size(0)
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    # Early stopping check
    if epoch_loss < best_loss - 1e-5:
        best_loss = epoch_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Restore best model
model.load_state_dict(best_model_state)

Epoch 1/1000, Loss: 3.1168
Epoch 2/1000, Loss: 0.5930
Epoch 3/1000, Loss: 0.4941
Epoch 4/1000, Loss: 0.4510
Epoch 5/1000, Loss: 0.4110
Epoch 6/1000, Loss: 0.3893
Epoch 7/1000, Loss: 0.3766
Epoch 8/1000, Loss: 0.3724
Epoch 9/1000, Loss: 0.3493
Epoch 10/1000, Loss: 0.3449
Epoch 11/1000, Loss: 0.3336
Epoch 12/1000, Loss: 0.3298
Epoch 13/1000, Loss: 0.3242
Epoch 14/1000, Loss: 0.3152
Epoch 15/1000, Loss: 0.3119
Epoch 16/1000, Loss: 0.3115
Epoch 17/1000, Loss: 0.2979
Epoch 18/1000, Loss: 0.2918
Epoch 19/1000, Loss: 0.2934
Epoch 20/1000, Loss: 0.2886
Epoch 21/1000, Loss: 0.2957
Epoch 22/1000, Loss: 0.2893
Epoch 23/1000, Loss: 0.2848
Epoch 24/1000, Loss: 0.2839
Epoch 25/1000, Loss: 0.2736
Epoch 26/1000, Loss: 0.2781
Epoch 27/1000, Loss: 0.2801
Epoch 28/1000, Loss: 0.2876
Epoch 29/1000, Loss: 0.2647
Epoch 30/1000, Loss: 0.2729
Epoch 31/1000, Loss: 0.2846
Epoch 32/1000, Loss: 0.2660
Epoch 33/1000, Loss: 0.2684
Epoch 34/1000, Loss: 0.2769
Epoch 35/1000, Loss: 0.2668
Epoch 36/1000, Loss: 0.2536
E

<All keys matched successfully>

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Get predictions for the whole dataset
model.eval()
with torch.no_grad():
    y_pred = model(X).cpu().numpy()
    y_true = y.cpu().numpy()

mae = mean_absolute_error(y_true, y_pred)
mape = (abs((y_true - y_pred) / y_true)).mean() * 100
import numpy as np

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 0.2922
MAPE: 5.00%
RMSE: 0.3994
R2: 0.7419
