# From Sequence to Prediction

In [20]:
from transformers import BertTokenizer, BertModel
import torch 
import numpy as np
import json
import sys

import joblib
import warnings

from sklearn.exceptions import InconsistentVersionWarning

# Silence scikit-learn's InconsistentVersionWarning for the rest of the session.
# NOTE(review): presumably this is triggered by the joblib-loaded pickles below;
# suppressing it also hides a genuine scikit-learn version mismatch, so confirm
# the installed version matches the one the classifier was trained with.
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

In [21]:
# Print NumPy arrays in full (no truncation) and without scientific notation.
np.set_printoptions(suppress=True, threshold=sys.maxsize)

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
PROT_BERT_CHECKPOINT = "Rostlab/prot_bert"
tokenizer = BertTokenizer.from_pretrained(PROT_BERT_CHECKPOINT, do_lower_case=False)
model = BertModel.from_pretrained(PROT_BERT_CHECKPOINT)

In [22]:
# Restore the trained classifier and `mlb`, the object used later to decode
# its predictions back into labels.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
CLASSIFIER_PATH = "./../prediction/protein_function_classifier_with_hierarchy.pkl"
LABEL_ENCODER_PATH = "./../prediction/go_mlb_with_hierarchy.pkl"

classifier = joblib.load(CLASSIFIER_PATH)
mlb = joblib.load(LABEL_ENCODER_PATH)

In [23]:
def generate_embedding(sequence):
    """Embed one sequence using the notebook-level ``tokenizer`` and ``model``.

    The elements of ``sequence`` are joined with single spaces, tokenized into
    PyTorch tensors, and passed through the model with gradient tracking
    disabled. The model's ``last_hidden_state`` is then averaged over
    dimension 1.

    Args:
        sequence: A string (or iterable of strings) to embed; presumably
            one-letter amino-acid codes -- confirm with the caller.

    Returns:
        A NumPy array holding the averaged hidden state with the leading
        dimension squeezed out.
    """
    encoded = tokenizer(" ".join(sequence), return_tensors="pt", padding=True)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Average over dim 1 (presumably the token axis -- no masking is applied),
    # then drop the leading dimension before converting to NumPy.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze(0).numpy()

In [24]:
def get_sequence_by_id(protein_id, json_file):
    """Look up the sequence stored for ``protein_id`` in a JSON file.

    The file is expected to contain a list of objects carrying ``protein_id``
    and ``sequence`` keys. Entries that are not objects, or that have no
    ``protein_id`` key, are skipped rather than aborting the whole search.

    Args:
        protein_id: Identifier compared for equality against each entry's
            ``protein_id`` value.
        json_file: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The ``sequence`` value of the first matching entry. On failure a
        human-readable message string is returned instead (ID not found, file
        missing, or file not valid JSON), so callers must check the result
        before treating it as a sequence.

    Raises:
        KeyError: If the matching entry has no ``sequence`` key.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        return f"The file {json_file} does not exist."
    except json.JSONDecodeError:
        return f"The file {json_file} is not a valid JSON file."

    for protein in data:
        # One malformed record must not prevent finding a valid one later on.
        if not isinstance(protein, dict) or 'protein_id' not in protein:
            continue
        if protein['protein_id'] == protein_id:
            return protein['sequence']
    return f"Protein ID {protein_id} not found in the JSON file."

Please provide your protein ID in the cell below:

In [27]:
protein_id = "A0A0C5B5G6"
json_file = "./../raw_data/filtered_proteins.json"
example_sequence = get_sequence_by_id(protein_id, json_file)

# get_sequence_by_id reports failures by returning a message string instead of
# raising. Verify that we really got a sequence (letters only) before embedding
# it; otherwise the error text itself would be embedded and classified as if it
# were a protein.
if not isinstance(example_sequence, str) or not example_sequence.isalpha():
    raise ValueError(
        f"Could not load a sequence for {protein_id!r}: {example_sequence}"
    )

print("Generating embeddings for the example sequence...")
embedding = generate_embedding(example_sequence)

Generating embeddings for the example sequence...


Now that the embedding has been created, we pass it to the classifier to obtain the predicted protein functions (GO terms):

In [28]:
# Reshape the embedding into a single-row 2-D array (one row for the one
# protein being queried) before handing it to the classifier.
new_protein_embedding = np.array(embedding).reshape(1, -1)

# Run the loaded classifier on that single row.
predicted_labels = classifier.predict(new_protein_embedding)

# Map the classifier output back to label names via `mlb`.
# NOTE(review): presumably a binary indicator matrix decoded into GO term IDs -- confirm.
predicted_go_terms = mlb.inverse_transform(predicted_labels)
print(f"Predicted GO terms: {predicted_go_terms}")


Predicted GO terms: [('GO:0003674', 'GO:0005488', 'GO:0005515')]
