# From Sequence to Prediction

In [9]:
from transformers import BertTokenizer, BertModel
import torch 
import numpy as np
import json
import sys

import joblib
import warnings

from sklearn.exceptions import InconsistentVersionWarning

warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

In [10]:
# Print NumPy arrays in full (never summarised with "...") and in fixed-point
# rather than scientific notation.
np.set_printoptions(suppress=True, threshold=sys.maxsize)

# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
PROTBERT_CHECKPOINT = "Rostlab/prot_bert"
tokenizer = BertTokenizer.from_pretrained(PROTBERT_CHECKPOINT, do_lower_case=False)
model = BertModel.from_pretrained(PROTBERT_CHECKPOINT)

Next, we load the trained classifier together with the label encoder (`mlb`) that maps its predictions back to GO terms:

In [11]:

# Serialized artefacts used for inference: the classifier and the label
# encoder whose inverse_transform turns its output back into GO terms.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
CLASSIFIER_PATH = "./../models/protein_function_classifier_rf.pkl"
LABEL_ENCODER_PATH = "./../models/go_mlb_rf.pkl"

classifier = joblib.load(CLASSIFIER_PATH)
mlb = joblib.load(LABEL_ENCODER_PATH)

In [12]:
def generate_embedding(sequence):
    """Embed a single sequence with the notebook-level ``tokenizer`` and ``model``.

    Args:
        sequence: The sequence to embed. Its elements are joined with single
            spaces before tokenization, so each character of a string becomes
            its own whitespace-separated item.

    Returns:
        A NumPy array: the model's ``last_hidden_state`` averaged over
        dimension 1, with the leading dimension squeezed away.
    """
    encoded = tokenizer(" ".join(sequence), return_tensors="pt", padding=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Plain mean over dim 1 (presumably the token axis - TODO confirm); every
    # position produced by the tokenizer contributes equally to the average.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze(0).numpy()

In [13]:
def get_sequence_by_id(protein_id, json_file):
    """Look up the sequence stored for ``protein_id`` in a JSON file.

    The file is expected to contain a collection of records, each a dict with
    ``'protein_id'`` and ``'sequence'`` keys. Entries that are not dicts or
    that lack a ``'protein_id'`` key are skipped.

    Args:
        protein_id: Identifier to search for.
        json_file: Path of the JSON file to read.

    Returns:
        The ``'sequence'`` value of the first record whose ``'protein_id'``
        equals ``protein_id``.

        A missing file, invalid JSON, or an unknown ID does NOT raise: a
        human-readable error message is returned in place of the sequence.
        Callers must therefore check the result before treating it as a
        sequence.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        return f"The file {json_file} does not exist."
    except json.JSONDecodeError:
        return f"The file {json_file} is not a valid JSON file."

    # The file is already closed here; only the parsed data is scanned.
    for protein in data:
        # One malformed record must not abort the lookup of every other one.
        if not isinstance(protein, dict) or 'protein_id' not in protein:
            continue
        if protein['protein_id'] == protein_id:
            return protein['sequence']
    return f"Protein ID {protein_id} not found in the JSON file."

Set `protein_id` below to the ID of the protein you want to analyse; its sequence is looked up in the JSON file:

In [14]:
protein_id = "A0A087X1C5"
json_file = "./../raw_data/filtered_proteins.json"
example_sequence = get_sequence_by_id(protein_id, json_file)

# get_sequence_by_id reports failures (missing file, invalid JSON, unknown ID)
# by returning an error message instead of a sequence. Without this check the
# message itself would be embedded and classified as if it were a protein.
# A usable sequence is a non-empty run of letters, whereas every error message
# contains spaces and punctuation.
if not (isinstance(example_sequence, str) and example_sequence.isalpha()):
    raise ValueError(
        f"Could not load a sequence for {protein_id!r} from {json_file!r}: {example_sequence}"
    )

print("Generating embeddings for the example sequence...")
embedding = generate_embedding(example_sequence)

Generating embeddings for the example sequence...


Now that the embedding has been created, we pass it to the classifier to predict the protein's function:

In [15]:
# Reshape the embedding into a single row (1, n_values) so it is passed to the
# classifier as one sample.
new_protein_embedding = np.reshape(np.array(embedding), (1, -1))
predicted_labels = classifier.predict(new_protein_embedding)

# Map the classifier's output back to GO term identifiers.
predicted_go_terms = mlb.inverse_transform(predicted_labels)
print(f"Predicted GO terms: {predicted_go_terms}")


Predicted GO terms: [('GO:0003674', 'GO:0003723', 'GO:0005102', 'GO:0005515', 'GO:0008092', 'GO:0008270', 'GO:0008324', 'GO:0015075', 'GO:0015631', 'GO:0016787', 'GO:0019899', 'GO:0019904', 'GO:0022857', 'GO:0030234', 'GO:0030545', 'GO:0030546', 'GO:0032555', 'GO:0042803', 'GO:0044877', 'GO:0046872', 'GO:0046914', 'GO:0046982', 'GO:0048018', 'GO:0060090', 'GO:0097159', 'GO:0097367', 'GO:0098772', 'GO:0140096', 'GO:0140677', 'GO:0140678', 'GO:1901363')]


If you want to inspect the embedding values, run the following cell:

In [16]:
# Header line followed by the embedding array on the next line.
print("Embeddings: ", embedding, sep="\n")

Embeddings: 
[ 0.01051687 -0.01371659 -0.08702055 -0.27543494  0.12567727 -0.07036006
 -0.04007847 -0.1526646  -0.0478871  -0.00317483  0.03922058  0.02364367
 -0.01420561  0.0004388   0.04825511 -0.04509901 -0.0141239   0.07379692
 -0.03288254 -0.09540947  0.01661189  0.07377736  0.02280229 -0.05433457
  0.16710903  0.02190277 -0.0481483   0.02541325  0.0663678  -0.03579415
 -0.16128212 -0.01293594 -0.01012319 -0.00787872  0.10964928 -0.01035678
  0.08901549  0.07282551 -0.03726026 -0.10245537  0.08092152  0.07985996
  0.08264763  0.06841749  0.06869935 -0.1824903   0.09051856 -0.09394525
  0.06872425  0.05473294  0.07810044 -0.04663794 -0.04541186 -0.03445394
  0.07969369 -0.03707615  0.03900292 -0.19981618  0.01975638 -0.0442605
 -0.01295741 -0.02251619  0.04137708 -0.03254373 -0.09185991 -0.02661838
 -0.05136071 -0.00914741  0.10501354 -0.10021848 -0.14759791  0.07029811
 -0.05987056  0.00496947  0.05662007  0.07041849 -0.09743576  0.04797089
 -0.00505347 -0.03980897 -0.01533495  0