In [None]:
!pip install tape_proteins

In [None]:
!pip install transformers

In [None]:
import os, time
# One-time environment setup. Everything below is skipped when a file named
# "esmfold.model" already exists in the current working directory.
# None of the os.system() return codes are inspected, so a failed install or
# download does not stop the cell.
# NOTE(review): no cell visible in this notebook reads "esmfold.model" itself;
# the pip installs below are what later cells rely on (e.g. `import esm`,
# `from Bio import SeqIO`) — confirm whether the weights download is still needed.
if not os.path.isfile("esmfold.model"):
  # Install the aria2 downloader, then start fetching the ESMFold parameters.
  # The trailing "&" puts the download in the background so the package
  # installs below run while it is still in progress.
  os.system("apt-get install aria2 -qq")
  os.system("aria2c -q -x 16 https://colabfold.steineggerlab.workers.dev/esm/esmfold.model &")

  # Python libraries installed from PyPI and from the dllogger repository.
  os.system("pip install -q omegaconf pytorch_lightning biopython ml_collections einops py3Dmol")
  os.system("pip install -q git+https://github.com/NVIDIA/dllogger.git")

  # openfold, pinned to a fixed commit hash.
  commit = "6908936b68ae89f67755240e2f588c09ec31d4c8"
  os.system(f"pip install -q git+https://github.com/aqlaboratory/openfold.git@{commit}")

  # esm package from the sokrypton fork (unpinned: installs the current default branch).
  os.system(f"pip install -q git+https://github.com/sokrypton/esm.git")

  # Reconcile with the background download started above.
  if not os.path.isfile("esmfold.model"):
    # The file was never created by the first download: fetch it from the
    # backup URL instead (this call has no "&", so it blocks until aria2c exits).
    os.system("aria2c -q -x 16 https://files.ipd.uw.edu/pub/esmfold/esmfold.model")
  else:
    # The file exists: poll every 5 seconds until "esmfold.model.aria2" is gone
    # (presumably aria2c's control file for an unfinished download — confirm).
    while os.path.isfile("esmfold.model.aria2"):
      time.sleep(5)

In [None]:
import os
import sys
import os.path
from sys import platform
from pathlib import Path

In [None]:
import sys
import time
import torch
import numpy as np
import pandas as pd
import pickle
import argparse
import requests
import subprocess

In [None]:
from torch import nn
from torch.utils import data as data

In [None]:
from tape import datasets
from tape import TAPETokenizer
from tape import ProteinBertForMaskedLM

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
import sys
sys.path.append('/content/gdrive/MyDrive/function_predictor/code')

In [None]:
from Z01_ModifiedModels import *

In [None]:
from pathlib import Path

In [None]:
from Bio import SeqIO
from tqdm.auto import tqdm

In [None]:
from transformers import BertModel, BertTokenizer
from transformers import AlbertModel, AlbertTokenizer
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForMaskedLM, ElectraModel
from transformers import T5EncoderModel, T5Tokenizer
from transformers import XLNetModel, XLNetTokenizer

In [None]:
import esm

In [None]:
from glob import glob
from Bio.Align.Applications import MafftCommandline

Helper Classes

In [None]:
class LoaderClass(data.Dataset):
    """Map-style dataset pairing each row of ``input_ids`` with the same row of ``attention_mask``.

    Both arguments are stored as given. ``input_ids`` must expose ``.shape``
    (its first dimension is the dataset length) and both must support
    indexing with the sample index.
    """

    def __init__(self, input_ids, attention_mask):
        super().__init__()
        self.input_ids, self.attention_mask = input_ids, attention_mask

    def __len__(self):
        """Return the size of the first dimension of ``input_ids``."""
        n_rows = self.input_ids.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``(input_ids[idx], attention_mask[idx])``."""
        ids_row = self.input_ids[idx]
        mask_row = self.attention_mask[idx]
        return ids_row, mask_row

In [None]:
class Identity(nn.Module):
    """Pass-through module: ``forward`` hands its input back wrapped in a 1-tuple."""

    def __init__(self):
        super().__init__()

    def forward(self, x, target=None):
        """Return ``(x,)``; ``target`` is accepted but never read."""
        passthrough = (x,)
        return passthrough

Function to generate embeddings

In [None]:
# Supported ESM-2 checkpoints: selection name -> (loader attribute on
# ``esm.pretrained``, representation layer requested from the model). The layer
# index matches the layer count in the checkpoint name (t33 / t36 / t48).
_ESM2_MODEL_SPECS = {
    "ESM_2_650": ("esm2_t33_650M_UR50D", 33),
    "ESM_2_3B": ("esm2_t36_3B_UR50D", 36),
    "ESM_2_15B": ("esm2_t48_15B_UR50D", 48),
}


def N03_embedding_LM(dataset_nme, model_select, data_folder, input_seqs_fasta_file, output_file_name_header, pretraining_name=None, batch_size=100, xlnet_mem_len=512):
    """Embed every sequence of a FASTA file with an ESM-2 model and save the result.

    For each FASTA record the per-position representations of the selected
    layer are kept (one row per residue) and stored in a dictionary keyed by
    the record id. The dictionary is written with ``torch.save`` to
    ``data_folder / (output_file_name_header + model_select + "_embeddings_tensor.pt")``
    and also returned.

    Args:
        dataset_nme: Unused; kept so existing positional calls keep working.
        model_select: One of "ESM_2_650", "ESM_2_3B", "ESM_2_15B".
        data_folder: Directory object supporting the ``/`` operator
            (e.g. ``pathlib.Path``); holds the input and receives the output.
        input_seqs_fasta_file: FASTA file name inside ``data_folder``.
        output_file_name_header: Prefix of the output file name.
        pretraining_name: Unused; kept for call compatibility.
        batch_size: Number of sequences sent through the model at once.
        xlnet_mem_len: Unused; kept for call compatibility.

    Returns:
        dict mapping record id (str) to a CPU tensor with ``len(sequence)``
        rows. Records that share an id overwrite one another.

    Raises:
        ValueError: if ``model_select`` is not a supported model name or
            ``batch_size`` is not a positive number.
    """
    # Validate up front, against the table above rather than a notebook global,
    # so the function does not depend on cells that run later.
    if model_select not in _ESM2_MODEL_SPECS:
        raise ValueError(
            f"Invalid model selected: {model_select!r}. "
            f"Supported models: {', '.join(_ESM2_MODEL_SPECS)}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    loader_name, repr_layer = _ESM2_MODEL_SPECS[model_select]
    input_file = data_folder / input_seqs_fasta_file  # data path (fasta)
    output_tensor_file = data_folder / (output_file_name_header + model_select + "_embeddings_tensor.pt")

    model, alphabet = getattr(esm.pretrained, loader_name)()
    batch_converter = alphabet.get_batch_converter()

    # Move the model once; fall back to CPU when no GPU is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)

    # Read sequences from FASTA file as (id, sequence) pairs.
    data_set = [(str(seq_record.id), str(seq_record.seq)) for seq_record in SeqIO.parse(input_file, "fasta")]

    # Split into chunks; in this function the chunks only shape the progress message.
    chunk_size = 2500 if model_select == "ESM_2_650" else 2000
    data_set_list = [data_set[i:i + chunk_size] for i in range(0, len(data_set), chunk_size)]

    seq_embeddings_dict = {}
    for data_set_id, one_data_set in enumerate(data_set_list):
        for i in range(0, len(one_data_set), batch_size):
            print(i, "out of", len(one_data_set), "; ", data_set_id, "out of", len(data_set_list))
            batch = one_data_set[i:i + batch_size]  # slicing already clamps at the end
            _, _, batch_tokens = batch_converter(batch)
            batch_tokens = batch_tokens.to(device)
            with torch.no_grad():
                results = model(batch_tokens, repr_layers=[repr_layer])
            token_reprs = results["representations"][repr_layer].cpu()

            for j, (seq_id, seq) in enumerate(batch):
                # Skip position 0 (presumably the start token prepended by the
                # batch converter) and keep len(seq) positions, dropping padding.
                # clone() detaches the slice from the padded batch tensor: a plain
                # slice is a view, which would keep the whole batch alive in memory
                # and make torch.save serialize the full padded storage.
                seq_embeddings_dict[seq_id] = token_reprs[j, 1: len(seq) + 1].clone()

    # Serialize the dictionary to a file for persistent storage
    torch.save(seq_embeddings_dict, output_tensor_file)

    print("Embeddings tensor dictionary saved to", output_tensor_file)
    return seq_embeddings_dict

Main

In [None]:
# Run configuration: where the data lives and which dataset is processed.
data_directory_path = Path("/content/gdrive/MyDrive/function_predictor/GB1-Dataset-FewToMore")
input_fasta_filename = "low_vs_high.fasta"

# Known dataset labels; index 2 selects "GB1".
dataset_names = ["GFP", "PafAVariants", "GB1"]
selected_dataset_name = dataset_names[2]

# Tag used in the output file prefix and in the final "Done!" message.
embedding_step_code = "embedding_"

In [None]:
# Model choice and derived file names for this run.
available_models = ["ESM_2_650", "ESM_2_3B", "ESM_2_15B"]
selected_model = available_models[0]

# Numeric settings passed through to N03_embedding_LM.
batch_size = 100
model_memory_length = 512

# Names built from the dataset label chosen in the previous configuration cell.
pretrained_model_filename = f"pretrained_{selected_dataset_name}_epoch5.pt"
output_filename_prefix = f"emb_residue_level_{embedding_step_code}{selected_dataset_name}_embedding_"

In [None]:
# Compute (and save) the embeddings for the configured dataset and model.
embeddings = N03_embedding_LM(
    dataset_nme=selected_dataset_name,
    model_select=selected_model,
    data_folder=data_directory_path,
    input_seqs_fasta_file=input_fasta_filename,
    output_file_name_header=output_filename_prefix,
    pretraining_name=pretrained_model_filename,
    batch_size=batch_size,
    xlnet_mem_len=model_memory_length,
)
print("*" * 50)
print(f"{embedding_step_code}Done!")

In [None]:
# Read the saved dictionary back; the file name mirrors the one N03_embedding_LM
# builds for its output (prefix + model name + "_embeddings_tensor.pt").
loaded_embeddings = torch.load(
    data_directory_path / f"{output_filename_prefix}{selected_model}_embeddings_tensor.pt"
)

In [None]:
# Number of stored embeddings: one dictionary entry per distinct sequence id.
len(embeddings)

In [26]:
# Shape of the in-memory embedding for id 'Sequence0' (rows are the residue
# positions kept by N03_embedding_LM). .get() returns None for an unknown id,
# in which case .shape raises AttributeError.
embeddings.get('Sequence0').shape

torch.Size([265, 1280])

In [None]:
# Same id, taken from the dictionary reloaded from disk (None if the id is absent).
embedding_tensor = loaded_embeddings.get('Sequence0')

In [None]:
# Display the in-memory embedding for 'Sequence0'; unlike .get(), indexing
# raises KeyError for an unknown id.
embeddings['Sequence0']

In [None]:
# Display the value picked from the reloaded dictionary, for comparison with
# the in-memory one.
embedding_tensor

In [None]:
# Shape of the reloaded embedding.
embedding_tensor.shape