In [1]:
# Load model directly
from transformers import AutoTokenizer, RobertaForSequenceClassification

# The tokenizer and the model are loaded from the same Hugging Face checkpoint.
CHECKPOINT = "DeepChem/ChemBERTa-77M-MTR"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# The sequence-classification wrapper adds classifier weights that are not in
# the checkpoint (hence the load-time warning). The embedding code in this
# notebook only reads hidden states, so that head is not used there.
model = RobertaForSequenceClassification.from_pretrained(CHECKPOINT)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

path = "../data/CycPeptMPDB_Peptide_All.csv"

# Read the CSV, keep the first row for each Structurally_Unique_ID, and drop
# rows whose Permeability equals -10 (presumably a placeholder value in the
# dataset -- confirm against the dataset documentation).
data = (
    pd.read_csv(path, low_memory=False)
    .drop_duplicates(subset=['Structurally_Unique_ID'])
    .loc[lambda frame: frame['Permeability'] != -10]
)

# Model input and regression target; both are columns of the same filtered
# frame, so they stay row-aligned.
smiles = data['SMILES']
target = data['Permeability']

In [3]:
def tokenize_smiles(smiles):
    """Encode SMILES strings with the module-level ChemBERTa ``tokenizer``.

    Args:
        smiles: SMILES input passed unchanged to the tokenizer (this notebook
            passes a list of strings).

    Returns:
        The tokenizer's output when called with padding and truncation enabled
        and ``return_tensors="pt"``.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer(smiles, **tokenizer_options)
    return encoded

# Tokenize the whole SMILES column in a single call.
smiles_list = smiles.tolist()
features = tokenize_smiles(smiles_list)
print("Tokenization complete. Number of samples:", len(features['input_ids']))

Tokenization complete. Number of samples: 7718


In [4]:
from tqdm.notebook import tqdm

import torch

def get_embeddings(features, batch_size=64):
    """Compute one mean-pooled embedding per sequence using the global ``model``.

    The last hidden layer is averaged over the positions where
    ``attention_mask`` is 1, so padded positions do not contribute.

    Args:
        features: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors
            whose first dimension indexes samples (e.g. the tokenizer output).
        batch_size: Number of sequences sent through the model per forward pass.

    Returns:
        A CPU tensor with one row per sample and one column per hidden unit.
        A row whose attention mask is entirely zero yields a zero vector
        rather than NaN.

    Raises:
        RuntimeError: From ``torch.cat`` when ``features`` holds no samples.
    """
    model.eval()
    input_ids = features['input_ids']
    attention_mask = features['attention_mask']
    num_samples = input_ids.shape[0]

    # Run each batch on whatever device the model lives on; fall back to the
    # inputs' device for a parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else input_ids.device

    all_embeddings = []
    for i in tqdm(range(0, num_samples, batch_size), desc="Embedding batches"):
        batch_input_ids = input_ids[i:i+batch_size].to(device)
        batch_attention_mask = attention_mask[i:i+batch_size].to(device)
        with torch.no_grad():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                output_hidden_states=True
            )
            last_hidden = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_dim)
            # (batch_size, seq_len, 1): broadcasts over hidden_dim, so the mask
            # never has to be expanded to the full hidden size.
            mask = batch_attention_mask.unsqueeze(-1).to(last_hidden.dtype)
            summed = (last_hidden * mask).sum(1)
            # Clamp so a fully masked row divides by 1 instead of 0 (0/0 = NaN).
            counts = mask.sum(1).clamp(min=1)
            embeddings = summed / counts
        # Collect results on the CPU so accelerator memory is released per batch.
        all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings, dim=0)

# One pooled embedding row per tokenized SMILES string.
embeddings = get_embeddings(features)
embedding_shape = embeddings.shape
print("Embeddings shape:", embedding_shape)

Embedding batches:   0%|          | 0/121 [00:00<?, ?it/s]

Embeddings shape: torch.Size([7718, 384])


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error

# Features are the embeddings as a NumPy array; labels are the permeability values.
X = embeddings.cpu().numpy()
y = target.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features, then fit a 100-tree random forest on the training rows.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
]
rf_pipeline = Pipeline(pipeline_steps)
rf_pipeline.fit(X_train, y_train)

# Score the held-out predictions.
y_pred = rf_pipeline.predict(X_test)
relative_error = abs((y_test - y_pred) / y_test)  # element-wise |error / truth|

rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = relative_error.mean() * 100
r2 = r2_score(y_test, y_pred)

for report_line in (
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    f"MAPE: {mape:.2f}%",
    f"R2: {r2:.4f}",
):
    print(report_line)

RMSE: 0.5352
MAE: 0.3946
MAPE: 6.79%
R2: 0.5502


In [8]:
from sklearn.model_selection import GridSearchCV

import os
# Set before the parallel search starts its workers; presumably intended to
# silence the Hugging Face tokenizers fork warning -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hyperparameter grid for the pipeline's 'rf' step (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

# 3-fold cross-validated search over the same pipeline, scored by (negated) RMSE.
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE (CV):", -grid_search.best_score_)

# With the default refit=True, GridSearchCV has already refit the best
# configuration on all of X_train. Reuse that fitted pipeline instead of
# calling set_params + fit again, which repeats an expensive fit and yields
# the same model (same parameters, same data, same random_state).
rf_pipeline = grid_search.best_estimator_
rf_pipeline  # last expression: keeps the fitted-pipeline display as the cell output

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END rf__max_depth=None, rf__min_samples_leaf=2, rf__min_samples_split=2, rf__n_estimators=100; total time= 1.3min
[CV] END rf__max_depth=None, rf__min_samples_leaf=2, rf__min_samples_split=5, rf__n_estimators=100; total time= 1.3min
[CV] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=5, rf__n_estimators=100; total time= 1.3min
[CV] END rf__max_depth=None, rf__min_samples_leaf=2, rf__min_samples_split=5, rf__n_estimators=100; total time= 1.3min
[CV] END rf__max_depth=None, rf__min_samples_leaf=2, rf__min_samples_split=2, rf__n_estimators=100; total time= 1.3min
[CV] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100; total time= 1.4min
[CV] END rf__max_depth=None, rf__min_samples_leaf=2, rf__min_samples_split=2, rf__n_estimators=100; total time= 1.4min
[CV] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=5, rf__n_estimators=100; total 

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

# Score the pipeline left by the grid-search cell on the held-out test set.
# "Fine-tuned" in the names and labels below refers to the tuned random-forest
# hyperparameters.
y_pred_finetuned = rf_pipeline.predict(X_test)
relative_error_finetuned = abs((y_test - y_pred_finetuned) / y_test)

rmse_finetuned = root_mean_squared_error(y_test, y_pred_finetuned)
mae_finetuned = mean_absolute_error(y_test, y_pred_finetuned)
mape_finetuned = relative_error_finetuned.mean() * 100
r2_finetuned = r2_score(y_test, y_pred_finetuned)

for report_line in (
    f"Fine-tuned RMSE: {rmse_finetuned:.4f}",
    f"Fine-tuned MAE: {mae_finetuned:.4f}",
    f"Fine-tuned MAPE: {mape_finetuned:.2f}%",
    f"Fine-tuned R2: {r2_finetuned:.4f}",
):
    print(report_line)

Fine-tuned RMSE: 0.5355
Fine-tuned MAE: 0.3940
Fine-tuned MAPE: 6.77%
Fine-tuned R2: 0.5497
