In [7]:
import json
import os

import numpy as np
import torch
from transformers import BertTokenizer, BertModel

## Creating embeddings for the protein sequences using ProtBERT

In [11]:
# Hugging Face Hub id of the pretrained checkpoint; the tokenizer and the
# encoder must be loaded from the same one.
PROT_BERT_CHECKPOINT = "Rostlab/prot_bert"

# do_lower_case=False leaves the residue letters in the case they are given in.
tokenizer = BertTokenizer.from_pretrained(PROT_BERT_CHECKPOINT, do_lower_case=False)
# from_pretrained hands the model back in evaluation mode (dropout disabled),
# so it can be used for inference as-is.
model = BertModel.from_pretrained(PROT_BERT_CHECKPOINT)

In [12]:
# The ProtBERT model card maps the rare/ambiguous residues U, Z, O and B to
# the unknown residue X before tokenization.
_RARE_RESIDUE_TO_X = str.maketrans("UZOB", "XXXX")


def generate_embedding(sequence):
    """Embed one protein sequence as a single fixed-size vector.

    The residues are separated by spaces, rare residues (U, Z, O, B) are
    replaced by X, the result is tokenized and passed through the
    module-level `model`, and the last hidden state is averaged over the
    token axis.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes
            (any iterable of one-letter strings also works).

    Returns:
        1-D NumPy array: the mean of the model's last hidden state over all
        token positions returned by the tokenizer (this includes any special
        tokens the tokenizer adds).
    """
    spaced_sequence = " ".join(sequence).translate(_RARE_RESIDUE_TO_X)

    # Only one sequence is encoded per call, so there is nothing to pad
    # against and no padding option is requested.
    inputs = tokenizer(spaced_sequence, return_tensors="pt")

    # Inference only: no gradients are needed. `model` is expected to be in
    # eval mode, which is how Hugging Face `from_pretrained` returns it.
    with torch.no_grad():
        outputs = model(**inputs)

    # (1, tokens, hidden) -> mean over tokens -> (1, hidden) -> (hidden,)
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

Using a sample sequence to check whether embedding works:

In [15]:
# Smoke test: push one short peptide through generate_embedding to confirm
# the tokenizer/model pipeline runs end to end before the full dataset is used.
example_sequence = "DLIPTSSKLVVDTSLQVKKAFFALVT"

print("Generating embeddings for the example sequence...")
embedding = generate_embedding(example_sequence)
print(f"Generated Embedding: {embedding}")


Generating embeddings for the example sequence...
Generated Embedding: [-0.00255801  0.04745743  0.06023552 ...  0.01932982 -0.0061088
  0.10182918]


In [17]:
# Input records and the embeddings file produced by this cell.
INPUT_PATH = "./../raw_data/parsed_proteins.json"
OUTPUT_PATH = "protein_embeddings.json"
# A progress line is printed after this many newly embedded proteins.
PROGRESS_EVERY = 100

with open(INPUT_PATH, "r", encoding="utf-8") as infile:
    parsed_data = json.load(infile)

# Resume from an earlier (possibly interrupted) run instead of recomputing
# everything. Delete OUTPUT_PATH to force a full recomputation, e.g. after
# changing how embeddings are generated.
try:
    with open(OUTPUT_PATH, "r", encoding="utf-8") as cached_file:
        protein_embeddings = json.load(cached_file)
except FileNotFoundError:
    protein_embeddings = []

# NOTE(review): resuming is keyed on protein_id, which assumes the ids in the
# input are unique — confirm against the parser that writes INPUT_PATH.
already_embedded = {record["protein_id"] for record in protein_embeddings}
pending = [entry for entry in parsed_data if entry["protein_id"] not in already_embedded]
print(f"{len(already_embedded)} proteins already embedded, {len(pending)} to go")

try:
    for done, entry in enumerate(pending, start=1):
        protein_id = entry["protein_id"]
        sequence = entry["sequence"]

        embedding = generate_embedding(sequence)

        protein_embeddings.append({
            "protein_id": protein_id,
            # ndarray is not JSON-serializable; store a plain list of floats.
            "embedding": embedding.tolist(),
            "go_annotations": entry["go_annotations"]
        })

        if done % PROGRESS_EVERY == 0:
            print(f"Embedded {done}/{len(pending)} proteins")
finally:
    # Runs on success and on interruption/error alike, so work done so far is
    # never lost. Writing to a temporary file and renaming it keeps a
    # previously saved file intact if the write itself is cut short.
    partial_path = OUTPUT_PATH + ".tmp"
    with open(partial_path, "w", encoding="utf-8") as outfile:
        json.dump(protein_embeddings, outfile, indent=4)
    os.replace(partial_path, OUTPUT_PATH)

print("Protein embeddings saved to 'protein_embeddings.json'")

KeyboardInterrupt: 