In [1]:
# Load model directly
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
model = RobertaForSequenceClassification.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

path = "../data/CycPeptMPDB_Peptide_All.csv"
data = pd.read_csv(path, low_memory=False)

data = data.drop_duplicates(subset=['Structurally_Unique_ID'])
data = data[data['Permeability'] != -10]

target = data['Permeability']
smiles = data['SMILES']

In [3]:
def tokenize_smiles(smiles):
    """Tokenize SMILES strings using the ChemBERTa tokenizer."""
    return tokenizer(smiles, padding=True, truncation=True, return_tensors="pt")

features = tokenize_smiles(smiles.tolist())
print("Tokenization complete. Number of samples:", len(features['input_ids']))

Tokenization complete. Number of samples: 7718


In [4]:
from tqdm.notebook import tqdm

import torch

def get_embeddings(features, batch_size=64):
    model.eval()
    all_embeddings = []
    input_ids = features['input_ids']
    attention_mask = features['attention_mask']
    num_samples = input_ids.shape[0]
    for i in tqdm(range(0, num_samples, batch_size), desc="Embedding batches"):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]
        with torch.no_grad():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                output_hidden_states=True
            )
            last_hidden = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_dim)
            mask = batch_attention_mask.unsqueeze(-1).expand(last_hidden.size())
            summed = (last_hidden * mask).sum(1)
            counts = mask.sum(1)
            embeddings = summed / counts
            all_embeddings.append(embeddings)
    return torch.cat(all_embeddings, dim=0)

embeddings = get_embeddings(features)
print("Embeddings shape:", embeddings.shape)

Embedding batches:   0%|          | 0/121 [00:00<?, ?it/s]

Embeddings shape: torch.Size([7718, 384])


# Train the Model

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Define pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_jobs=-1))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(embeddings, target, test_size=0.2, random_state=42)

# Fit pipeline
pipeline.fit(X_train, y_train)

# Ensemble averaging: train several RFs on different seeds and average predictions
n_ensemble = 10
ensemble_preds = []

for seed in range(n_ensemble):
    ensemble_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(random_state=seed, n_jobs=-1))
    ])
    ensemble_pipeline.fit(X_train, y_train)
    preds = ensemble_pipeline.predict(X_test)
    ensemble_preds.append(preds)

# Average predictions
y_pred = np.mean(ensemble_preds, axis=0)

In [6]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Predict on training set
y_train_pred = pipeline.predict(X_train)

# Calculate metrics for training set
train_rmse = root_mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mape = (abs((y_train - y_train_pred) / y_train).mean()) * 100
train_r2 = r2_score(y_train, y_train_pred)

# Predict on test set
y_test_pred = pipeline.predict(X_test)

# Calculate metrics for test set
rmse = root_mean_squared_error(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
mape = (abs((y_test - y_test_pred) / y_test).mean()) * 100
r2 = r2_score(y_test, y_test_pred)

print("Training set performance:")
print(f"RMSE: {train_rmse:.4f}")
print(f"MAE: {train_mae:.4f}")
print(f"MAPE: {train_mape:.2f}%")
print(f"R2: {train_r2:.4f}")

print("\nTest set performance:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"R2: {r2:.4f}")

Training set performance:
RMSE: 0.3348
MAE: 0.2405
MAPE: 4.12%
R2: 0.8173

Test set performance:
RMSE: 0.5372
MAE: 0.3948
MAPE: 6.79%
R2: 0.5470
