In [1]:
from transformers import BertTokenizer, BertModel
import torch 
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


## Creating embeddings for protein sequences using ProtBERT (`Rostlab/prot_bert`)

In [2]:
# Load the ProtBERT tokenizer and encoder with the Bert* classes that the
# import cell brings into scope. The Auto* classes are not imported there, so
# referencing them raises NameError on a fresh kernel.
# `do_lower_case=False` stops the tokenizer from lower-casing the upper-case
# residue letters.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert")

In [4]:
def generate_embedding(sequence, max_length=None):
    """Embed one protein sequence as a single fixed-size vector.

    The residues are joined with single spaces, tokenized with the
    module-level ``tokenizer``, passed through the module-level ``model`` with
    gradient tracking disabled, and ``last_hidden_state`` is averaged over
    its token axis (``dim=1``).

    Args:
        sequence: Amino-acid sequence such as ``"DLIPTSSK"``. Anything
            accepted by ``" ".join`` (a string or an iterable of strings)
            works.
        max_length: Optional upper bound on the number of tokens. When
            ``None`` (the default) the input is not truncated.

    Returns:
        numpy.ndarray: 1-D array holding the mean of the last hidden state
        over all tokens the tokenizer produced.

    Raises:
        ValueError: If ``sequence`` contains no residues.
    """
    spaced_sequence = " ".join(sequence)
    if not spaced_sequence.strip():
        # Without residues the mean would not be computed from any actual
        # residue token, which is not a meaningful embedding.
        raise ValueError("sequence must contain at least one residue")

    # Request truncation only when a limit is given: with this tokenizer,
    # `truncation=True` without `max_length` just emits a warning and falls
    # back to no truncation.
    inputs = tokenizer(
        spaced_sequence,
        return_tensors="pt",
        truncation=max_length is not None,
        max_length=max_length,
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # Average over the token axis, then drop the leading batch axis of size 1.
    pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    # `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid otherwise.
    return pooled.cpu().numpy()

Sanity check: embed a short sample sequence to confirm that embedding generation works.

In [5]:
# Smoke test: embed one short peptide and inspect the resulting vector.
example_sequence = "DLIPTSSKLVVDTSLQVKKAFFALVT"

print("Generating embeddings for the example sequence...")
embedding = generate_embedding(sequence=example_sequence)

# Show the raw values and the dimensionality of the pooled embedding.
print(f"Generated Embedding: {embedding}")
print(f"Embedding Shape: {embedding.shape}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Generating embeddings for the example sequence...
Generated Embedding: [-0.00255801  0.04745743  0.06023552 ...  0.01932982 -0.0061088
  0.10182918]
Embedding Shape: (1024,)
