## **Notebook to create ProtT5 Embeddings**

Print information about the available GPU.

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Install the necessary requirements
(Necessary files need to be saved in the content directory!)
- install requirements.txt from the EAT repository
- install the bio-embeddings package

In [None]:
# Install the dependencies listed in requirements.txt (taken from the EAT repository).
!pip install -r requirements.txt
# Install bio-embeddings with its "all" extras; it provides the embedder imported below.
!pip install bio-embeddings[all]

Import the necessary packages...

In [None]:
import time
import h5py
from bio_embeddings.embed import ProtTransT5XLU50Embedder

Class Embedder from https://github.com/Rostlab/EAT.git

In [None]:
class Embedder():
    """Wrapper around ``ProtTransT5XLU50Embedder`` for embedding FASTA records.

    Adapted from https://github.com/Rostlab/EAT.git.
    """

    def __init__(self):
        # half_model=True is forwarded to bio_embeddings unchanged
        # (presumably selects half precision -- confirm in the library docs).
        self.embedder = ProtTransT5XLU50Embedder(half_model=True)

    def write_embeddings(self, emb_p, embds):
        """Write embeddings to an HDF5 file, one dataset per sequence ID.

        Args:
            emb_p: Output path; converted with ``str`` before opening. The
                file is opened in mode "w".
            embds: Mapping from sequence ID to the embedding to store.
        """
        with h5py.File(str(emb_p), "w") as hf:
            for sequence_id, embedding in embds.items():
                hf.create_dataset(sequence_id, data=embedding)

    def _embed(self, id2seq, transform=None):
        """Embed every sequence in ``id2seq`` and map each ID to its result.

        Args:
            id2seq: Dict mapping FASTA ID to sequence.
            transform: Optional callable applied to each embedding before it
                is stored; ``None`` stores the embedding unchanged.

        Returns:
            Dict mapping each FASTA ID to its (optionally transformed)
            embedding, in the iteration order of ``id2seq``. Empty input
            yields an empty dict without invoking the model.
        """
        if not id2seq:
            # Nothing to embed; avoids handing an empty batch to the model.
            return {}

        fasta_ids = list(id2seq.keys())
        seqs = list(id2seq.values())
        print("Start generating embeddings. This process might take a few minutes.")
        start = time.time()
        # Materialise the results so they can be matched to IDs by position.
        per_residue_embeddings = list(self.embedder.embed_many(seqs))
        id2embd = {}
        for idx, fasta_id in enumerate(fasta_ids):
            embedding = per_residue_embeddings[idx]
            id2embd[fasta_id] = (
                embedding if transform is None else transform(embedding)
            )
        print("Creating embeddings took: {:.4f}[s]".format(time.time()-start))
        return id2embd

    def get_per_residue_embeddings(self, id2seq):
        """Return a dict mapping each FASTA ID to its embedding as produced
        by the embedder's ``embed_many``.

        Args:
            id2seq: Dict mapping FASTA ID to sequence.
        """
        return self._embed(id2seq)

    def get_per_sequence_embeddings(self, id2seq):
        """Return a dict mapping each FASTA ID to its embedding averaged
        over axis 0.

        Axis 0 is presumably the residue axis, which would give one
        fixed-size vector per sequence -- confirm against bio_embeddings.

        Args:
            id2seq: Dict mapping FASTA ID to sequence.
        """
        return self._embed(id2seq, transform=lambda emb: emb.mean(axis=0))

Save all new domains from holdout-fasta into a dictionary

In [None]:
# Parse the hold-out FASTA file into a dict {fasta_id: sequence}.
fasta_id = ''
sequence = ''
domains = {}
# The context manager closes the file once parsing is done.
with open('holdout390.fasta', 'r') as fasta_file:
    for line in fasta_file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. from a trailing newline) must not overwrite
            # the sequence of the preceding record with an empty string.
            continue
        if line.startswith(">"):
            # Header line: start a new record.
            fasta_id = line.replace(">", "")
            sequence = ''
        else:
            # Sequences may be wrapped over several lines; concatenate them.
            sequence += line
            domains[fasta_id] = sequence

Create an instance of the Embedder class, create embeddings for the dictionary built above, and save them to an .h5 file.

In [None]:
# Instantiate the wrapper; its constructor builds the ProtTransT5XLU50Embedder.
my_embedder = Embedder()
# Compute one embedding per entry of `domains`, keyed by FASTA ID.
embeddings = my_embedder.get_per_residue_embeddings(domains)
# Store every embedding as its own dataset in the HDF5 file.
my_embedder.write_embeddings('temporal_holdout_set.h5', embeddings)

Start generating embeddings. This process might take a few minutes.
Creating embeddings took: 34.5840[s]


Print out information about the created embeddings to check outcome

In [None]:
filename = "temporal_holdout_set.h5"

# Open read-only inside a context manager so the file handle is released
# even if reading or printing an entry raises.
with h5py.File(filename, 'r') as h5:
    # Print each stored entry to check the outcome.
    for key in h5.keys():
        print(f"{h5[key]}")