In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Report whether a CUDA device is usable. The device name is only queried when
# CUDA is present, because torch.cuda.get_device_name() raises on a machine
# without a GPU and would abort the notebook run at this cell.
gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")
if gpu_available:
    print(f"GPU Name: {torch.cuda.get_device_name()}")

GPU available: True
GPU Name: NVIDIA RTX A5000


In [None]:
# Local checkpoint directory shared by the model and its tokenizer.
molformer_dir = "../model/Molformer"

# NOTE(review): trust_remote_code=True executes Python code shipped with the
# checkpoint; only load checkpoints from a source you trust.
model = AutoModel.from_pretrained(
    molformer_dir,
    deterministic_eval=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(molformer_dir)

In [6]:
# Display the module tree of the loaded model (layers and their dimensions).
model

MolformerModel(
  (embeddings): MolformerEmbeddings(
    (word_embeddings): Embedding(2362, 768, padding_idx=2)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): MolformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x MolformerLayer(
        (attention): MolformerAttention(
          (self): MolformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (rotary_embeddings): MolformerRotaryEmbedding()
            (feature_map): MolformerFeatureMap(
              (kernel): ReLU()
            )
          )
          (output): MolformerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (in

In [None]:
# Send the model to the GPU when one is available; otherwise stay on the CPU so
# the notebook still runs (slowly) on machines without CUDA instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
# Smoke test: embed two known molecules and inspect the pooled output.
smiles = [
    "Cn1c(=O)c2c(ncn2C)n(C)c1=O",  # caffeine
    "CC(=O)Oc1ccccc1C(=O)O",  # aspirin
]

# Pad to a common length so both molecules fit in one tensor batch, then move
# the encoded batch to the same device as the model.
inputs = tokenizer(smiles, padding=True, return_tensors="pt")
inputs = inputs.to(device)

# Inference only: skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

outputs.pooler_output

tensor([[-0.4407,  0.3902,  0.7989,  ..., -0.6081, -0.0200,  0.0103],
        [ 0.5943,  0.4527,  0.3437,  ...,  0.1520, -0.3464,  0.5590]])

# Data

In [None]:
def get_molformer_embeddings(smiles, batch_size=2048):
    """
    Get Molformer embeddings for a collection of SMILES strings.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``; they must
    be defined before this function is called.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to embed. Any iterable is accepted (list, numpy array,
        pandas Series, ...); it is copied into a plain list first.
    batch_size : int, default 2048
        Number of SMILES strings tokenized and passed through the model per
        forward pass. Lower it if the device runs out of memory.

    Returns
    -------
    pandas.DataFrame
        One row per input string, in input order. The integer-labelled columns
        hold the model's ``pooler_output`` values and the ``"smiles"`` column
        holds the corresponding input string.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Normalise to a plain list: slices of it are lists of str (what the
    # tokenizer is given), and assigning it as a column below is positional
    # rather than index-aligned as it would be for a pandas Series.
    smiles = list(smiles)
    representations = []

    # Batch processing
    print(f"Batch size: {batch_size}")
    for i in tqdm(range(0, len(smiles), batch_size)):
        batch = smiles[i:i + batch_size]
        # Pad every sequence in the batch to the longest one so they stack
        # into a single tensor, then move the batch to the model's device.
        batch_input = tokenizer(batch, padding=True, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**batch_input)
        # Move results to host memory right away so device memory is not
        # accumulated across batches; extend() appends one row per molecule.
        representations.extend(outputs.pooler_output.cpu().numpy())

    representations_df = pd.DataFrame(representations)
    representations_df["smiles"] = smiles

    return representations_df

In [None]:
# Load the Step 2 test set and pull out its SMILES column.
test_df = pd.read_parquet("../data/Data/Step2_TestData_Target2035.parquet")

# Bind the list to `smiles`: the embedding cell that follows reads `smiles`, and
# previously that name still held the two molecules from the smoke test, so the
# test set was never actually embedded.
smiles = test_df["smiles"].values.tolist()

# `data` used to be bound to this same list; keep the alias for compatibility.
data = smiles

# Preview only the first few entries instead of dumping the whole list.
smiles[:5]

In [None]:
# Embed the SMILES currently bound to `smiles` and persist the resulting frame
# as a pickle in the working directory.
representations_df = get_molformer_embeddings(smiles)
representations_df.to_pickle("dream35-molformer-emb.pkl")

In [None]:
# Read the ligand table and extract its SMILES column as a plain list.
ligands_df = pd.read_csv("../data/Data/14_public_domain_WDR91_ligands.csv")
smiles = ligands_df["smiles"].values.tolist()

# Embed the ligands and pickle the resulting frame.
representations_df = get_molformer_embeddings(smiles)
with open("14-ligands-dream35.pkl", "wb") as pkl_file:
    representations_df.to_pickle(pkl_file)