In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are loaded from the same checkpoint id.
# The encoder is switched to eval mode as soon as it is loaded.
tokenizer = AutoTokenizer.from_pretrained(
    "pingzhili/chemberta-v2-finetuned-uspto-50k-classification"
)
model = AutoModel.from_pretrained(
    "pingzhili/chemberta-v2-finetuned-uspto-50k-classification"
).eval()

def get_embedding(smiles, tok=None, net=None):
    """Return a mean-pooled embedding vector for a single SMILES string.

    Args:
        smiles: SMILES string to embed.
        tok: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` is looked up at call time.
        net: Optional encoder. When omitted, the notebook-level ``model`` is
            looked up at call time.

    Returns:
        A 1-D NumPy array: the attention-mask-weighted mean of the encoder's
        last hidden state over the token axis.
    """
    # The defaults are resolved at call time, so whatever `tokenizer` / `model`
    # currently refer to in the notebook is used. Pass `tok` / `net` explicitly
    # to pin a specific checkpoint.
    tok = tokenizer if tok is None else tok
    net = model if net is None else net

    with torch.no_grad():
        # truncation=True asks the tokenizer to cut over-long inputs instead of
        # passing them to the encoder at full length.
        inputs = tok(smiles, return_tensors="pt", truncation=True, padding=True)
        hidden = net(**inputs).last_hidden_state

        # Average only over positions the attention mask marks as real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    # .cpu() keeps the NumPy conversion valid when the encoder is not on the CPU.
    return pooled.detach().cpu().numpy().flatten()


Some weights of RobertaModel were not initialized from the model checkpoint at pingzhili/chemberta-v2-finetuned-uspto-50k-classification and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

# Read the peptide table from CSV. Nothing is dropped here; the filtering
# happens in the cells that follow.
data = pd.read_csv(
    'CycPeptMPDB_Peptide_All.csv',
    low_memory=False,
)

In [3]:
# Report the table size before de-duplication.
print(
    f"Initial number of rows: {len(data)}",
    f'# of Columns before dropping: {data.shape[1]}',
    sep="\n",
)

# Keep only the first row seen for each Structurally_Unique_ID value.
data = data.drop_duplicates(subset=['Structurally_Unique_ID'], keep='first')
print(f"Number of rows after dropping duplicate molecules: {len(data)}")

Initial number of rows: 8466
# of Columns before dropping: 247
Number of rows after dropping duplicate molecules: 7991


In [4]:
# Discard every column that contains at least one missing value.
data = data.dropna(axis='columns', how='any')
print(
    f"Number of columns after dropping those with missing values: {data.shape[1]}",
    f"Columns remaining: {data.columns.tolist()}",
    sep="\n",
)

# Remove rows where Permeability == -10.
# NOTE(review): -10 looks like a placeholder rather than a measured value;
# confirm against the dataset documentation.
data = data.loc[data['Permeability'].ne(-10)]
print(f"Number of rows after removing Permeability = -10: {len(data)}")

Number of columns after dropping those with missing values: 226
Columns remaining: ['ID', 'Source', 'Year', 'Version', 'Original_Name_in_Source_Literature', 'Structurally_Unique_ID', 'SMILES', 'HELM', 'HELM_URL', 'Sequence', 'Sequence_LogP', 'Sequence_TPSA', 'Monomer_Length', 'Monomer_Length_in_Main_Chain', 'Molecule_Shape', 'Permeability', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', '

In [5]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Regression target: the Permeability column of the filtered table.
y = data.loc[:, 'Permeability']

def get_emb_thread(smiles):
    """Embed one SMILES string by delegating to ``get_embedding``."""
    embedding = get_embedding(smiles)
    return embedding

# Map the worker over every SMILES string on a thread pool. executor.map
# yields results in input order, so embeddings[i] corresponds to row i.
with ThreadPoolExecutor() as executor:
    embeddings = [
        emb
        for emb in tqdm(
            executor.map(get_emb_thread, data['SMILES']),
            total=len(data),
            desc="Generating embeddings",
        )
    ]

Generating embeddings: 100%|██████████| 7718/7718 [02:40<00:00, 48.08it/s]


In [6]:
# Print both lengths side by side so a mismatch between features and targets is visible.
print(
    f"Number of embeddings: {len(embeddings)}",
    f"Size of y: {y.shape[0]}",
    sep="\n",
)

Number of embeddings: 7718
Size of y: 7718


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Stack the per-molecule embeddings into one 2-D feature matrix and take the
# targets as a plain NumPy array.
X, y_array = np.stack(embeddings), y.to_numpy()

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_array,
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training portion, then predict the held-out portion.
rf = RandomForestRegressor(random_state=42, n_jobs=-1).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Held-out error metrics; RMSE is the square root of the mean squared error.
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),
    r2_score(y_test, y_pred),
)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 0.4937
RMSE: 0.6703
R2: 0.2945


In [8]:
from transformers import AutoTokenizer, AutoModel

# Load tokenizer and model
# They are bound to checkpoint-specific names on purpose: get_embedding()
# reads the notebook-level `tokenizer` / `model` at call time, so rebinding
# those names here would silently switch it to this checkpoint on a re-run.
model_name = "DeepChem/ChemBERTa-77M-MTR"
tokenizer_chemberta_77 = AutoTokenizer.from_pretrained(model_name)
model_chemberta_77 = AutoModel.from_pretrained(model_name)
model_chemberta_77.eval()

def get_embedding_chemberta_77(smiles):
    """Embed SMILES with the checkpoint loaded above, using masked mean pooling.

    Args:
        smiles: SMILES input passed straight to the tokenizer. The callers in
            this notebook pass one string at a time.

    Returns:
        A flattened NumPy array holding the mean of the last hidden state over
        the token axis, counting only positions where the attention mask is 1.
    """
    with torch.no_grad():
        inputs = tokenizer_chemberta_77(
            smiles, return_tensors="pt", truncation=True, padding=True
        )
        outputs = model_chemberta_77(**inputs)
        token_embeddings = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        attention_mask = inputs['attention_mask']     # [batch_size, seq_len]

        # Expand attention_mask to match token_embeddings dimensions
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        # Perform mean pooling excluding padding tokens
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
        # clamp guards against division by zero if a mask were all zeros
        sum_mask = input_mask_expanded.sum(dim=1).clamp(min=1e-9)
        embedding = sum_embeddings / sum_mask

        return embedding.detach().numpy().flatten()



Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from concurrent.futures import ThreadPoolExecutor

# Multithreaded embedding generation for ChemBERTa-77M-MTR
def get_emb_chemberta_77_thread(smiles):
    """Embed one SMILES string by delegating to ``get_embedding_chemberta_77``."""
    embedding = get_embedding_chemberta_77(smiles)
    return embedding

# Map the worker over every SMILES string on a thread pool. executor.map
# yields results in input order, so the list lines up with the rows of `data`.
with ThreadPoolExecutor() as executor:
    embeddings_chemberta_77 = [
        emb
        for emb in tqdm(
            executor.map(get_emb_chemberta_77_thread, data['SMILES']),
            total=len(data),
            desc="Generating ChemBERTa-77M embeddings",
        )
    ]

Generating ChemBERTa-77M embeddings: 100%|██████████| 7718/7718 [00:22<00:00, 350.60it/s]


In [10]:
# Print both lengths side by side so a mismatch between targets and features is visible.
print(
    f"Initial number of rows of Permeability: {len(y)}",
    f"Number of embeddings for ChemBERTa-77M: {len(embeddings_chemberta_77)}",
    sep="\n",
)

Initial number of rows of Permeability: 7718
Number of embeddings for ChemBERTa-77M: 7718


In [11]:
# Random Forest on the ChemBERTa-77M embeddings, with the same split settings
# (test_size=0.2, random_state=42) as the first Random Forest cell.
X_chemberta_77 = np.stack(embeddings_chemberta_77)
X_train_77, X_test_77, y_train_77, y_test_77 = train_test_split(
    X_chemberta_77,
    y_array,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion, then predict the held-out portion.
rf_chemberta_77 = RandomForestRegressor(random_state=42, n_jobs=-1).fit(X_train_77, y_train_77)
y_pred_77 = rf_chemberta_77.predict(X_test_77)

# Held-out error metrics; RMSE is the square root of the mean squared error.
mae_77, rmse_77, r2_77 = (
    mean_absolute_error(y_test_77, y_pred_77),
    np.sqrt(mean_squared_error(y_test_77, y_pred_77)),
    r2_score(y_test_77, y_pred_77),
)

print(f"ChemBERTa-77M RF MAE: {mae_77:.4f}")
print(f"ChemBERTa-77M RF RMSE: {rmse_77:.4f}")
print(f"ChemBERTa-77M RF R2: {r2_77:.4f}")

ChemBERTa-77M RF MAE: 0.3924
ChemBERTa-77M RF RMSE: 0.5321
ChemBERTa-77M RF R2: 0.5554


In [2]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting model on the ChemBERTa-77M features.
# This cell reuses X_train_77 / X_test_77 / y_train_77 / y_test_77, the metric
# functions and np from earlier cells, so it only works when those cells have
# already run in the same kernel session. The NameError recorded below is what
# happens when they have not.
gbm_77 = GradientBoostingRegressor(random_state=42).fit(X_train_77, y_train_77)
y_pred_gbm_77 = gbm_77.predict(X_test_77)

# Held-out error metrics; RMSE is the square root of the mean squared error.
mae_gbm_77, rmse_gbm_77, r2_gbm_77 = (
    mean_absolute_error(y_test_77, y_pred_gbm_77),
    np.sqrt(mean_squared_error(y_test_77, y_pred_gbm_77)),
    r2_score(y_test_77, y_pred_gbm_77),
)

print(f"ChemBERTa-77M Gradient Boosting MAE: {mae_gbm_77:.4f}")
print(f"ChemBERTa-77M Gradient Boosting RMSE: {rmse_gbm_77:.4f}")
print(f"ChemBERTa-77M Gradient Boosting R2: {r2_gbm_77:.4f}")


NameError: name 'X_train_77' is not defined