In [3]:
# Relative paths to the four input FASTA sequence files, one name per protein.
protein1, protein2, protein3, protein4 = (
    "../data/proteins/105M_FPT_sequence.fasta",
    "../data/proteins/6VKV_GAG_sequence.fasta",
    "../data/proteins/7L5E_XPO1_sequence.fasta",
    "../data/proteins/8QYR_MYH7_sequence.fasta",
)

In [10]:
from Bio import SeqIO

def read_protein_sequence(protein_file):
    """Return the first sequence found in a FASTA file as a plain string.

    Args:
        protein_file: Path (or handle) passed straight to ``SeqIO.parse``
            with the ``"fasta"`` format.

    Returns:
        str: The ``seq`` of the first record, converted with ``str``.
            Any further records in the file are ignored.

    Raises:
        ValueError: If the parser yields no FASTA records. Without this
            check the function would fall through and return ``None``,
            which only fails later and far from the real cause.
    """
    # Only the first record is needed, so pull a single item from the parser
    # instead of looping over it.
    first_record = next(iter(SeqIO.parse(protein_file, "fasta")), None)
    if first_record is None:
        raise ValueError(f"No FASTA records found in {protein_file!r}")
    return str(first_record.seq)

In [11]:
# Read the first sequence of each protein's FASTA file, in order protein1..protein4.
seq1, seq2, seq3, seq4 = (
    read_protein_sequence(fasta_path)
    for fasta_path in (protein1, protein2, protein3, protein4)
)

In [16]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
# This cell only selects the device; the model is moved onto it where it is loaded.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the model so the two cannot drift apart.
ESM_CHECKPOINT = "facebook/esm2_t6_8M_UR50D"

tokenizer_protein = AutoTokenizer.from_pretrained(ESM_CHECKPOINT)
model_protein = AutoModel.from_pretrained(ESM_CHECKPOINT)

# Left as the cell's final expression so the notebook still displays the module summary.
model_protein.to(device)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 320, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1026, 320, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-5): 6 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=320, out_features=320, bias=True)
            (key): Linear(in_features=320, out_features=320, bias=True)
            (value): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
    

In [29]:
import numpy as np

In [31]:
# Tokenize seq1 into PyTorch tensors and place them on the same device as the model.
inputs = tokenizer_protein(
    seq1,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(device)

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model_protein(**inputs)

# Mean-pool the hidden states over dim 1 after dropping the first and last positions
# (presumably the tokenizer's start/end special tokens - confirm), then convert the
# result to a nested Python list.
embeddings = (
    outputs.last_hidden_state[:, 1:-1, :]
    .mean(dim=1)
    .cpu()
    .numpy()
    .tolist()
)

print(embeddings)
print(len(embeddings[0]))

# Persist the single pooled vector (first and only row) next to the input data.
np.save('../data/proteins/105M_FPT_embeddings.npy', embeddings[0])

[[-0.08440539985895157, -0.17810046672821045, 0.06381502002477646, -0.041964415460824966, 0.13670435547828674, -0.2565353810787201, -0.02754983864724636, -0.09023530036211014, -0.31054532527923584, 0.09387729316949844, 0.24869561195373535, 0.10899195820093155, 0.1580665558576584, 0.06920836865901947, 0.10093522071838379, -0.18664395809173584, 0.20787526667118073, 0.08122434467077255, -0.21663562953472137, -0.04636240378022194, -0.039435602724552155, 0.020030716434121132, 0.008536497130990028, -0.028127651661634445, -0.029853977262973785, 0.21462775766849518, -0.04094013571739197, 0.047568630427122116, 0.08032993227243423, 0.014495407231152058, 0.01587974838912487, 0.10621145367622375, 0.06469511985778809, -0.06898464262485504, -0.1402735412120819, -0.24367210268974304, -0.021571319550275803, 0.1391163170337677, 0.19756728410720825, 0.07100402563810349, 0.015409614890813828, 0.0403430201113224, 0.27035805583000183, 0.1771620362997055, -0.08293574303388596, 0.04407988488674164, -0.840522

In [34]:
# Tokenize seq2 into PyTorch tensors and place them on the same device as the model.
inputs = tokenizer_protein(
    seq2,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(device)

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model_protein(**inputs)

# Mean-pool the hidden states over dim 1 after dropping the first and last positions
# (presumably the tokenizer's start/end special tokens - confirm), then convert the
# result to a nested Python list.
embeddings = (
    outputs.last_hidden_state[:, 1:-1, :]
    .mean(dim=1)
    .cpu()
    .numpy()
    .tolist()
)

print(embeddings)
print(len(embeddings[0]))

# Persist the single pooled vector (first and only row) next to the input data.
np.save('../data/proteins/6VKV_GAG_embeddings.npy', embeddings[0])

[[0.057033587247133255, -0.052961867302656174, 0.11849178373813629, 0.04909713193774223, 0.05320388451218605, 0.023269565775990486, 0.034086450934410095, -0.0943056046962738, -0.016134075820446014, -0.09448009729385376, 0.01747436262667179, 0.020732011646032333, -0.09003052860498428, 0.1760595738887787, 0.0922747403383255, -0.11031150817871094, 0.0330948606133461, 0.009745934046804905, 0.08089030534029007, 0.14297211170196533, -0.008050686679780483, 0.07196313887834549, -0.08659180998802185, 0.04547104611992836, 0.06079425662755966, 0.0843755453824997, -0.08441665768623352, -0.1410040706396103, 0.04363306611776352, 0.2032342553138733, -0.21420352160930634, -0.08232537657022476, 0.05655408278107643, -0.14617222547531128, 0.1494748592376709, -0.07217957079410553, 0.033101268112659454, -0.0801798552274704, 0.2613004148006439, 0.07712537795305252, 0.05165214464068413, 0.11996807157993317, 0.03248513862490654, 0.05212220922112465, 0.027155928313732147, 0.1609126329421997, -0.950058698654174

In [35]:
# Tokenize seq3 into PyTorch tensors and place them on the same device as the model.
inputs = tokenizer_protein(
    seq3,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(device)

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model_protein(**inputs)

# Mean-pool the hidden states over dim 1 after dropping the first and last positions
# (presumably the tokenizer's start/end special tokens - confirm), then convert the
# result to a nested Python list.
embeddings = (
    outputs.last_hidden_state[:, 1:-1, :]
    .mean(dim=1)
    .cpu()
    .numpy()
    .tolist()
)

print(embeddings)
print(len(embeddings[0]))

# Persist the single pooled vector (first and only row) next to the input data.
np.save('../data/proteins/7L5E_XPO1_embeddings.npy', embeddings[0])

[[0.1504795104265213, 0.0744948536157608, 0.031085355207324028, -0.03423982113599777, -0.08956242352724075, -0.1054726392030716, -0.05492104962468147, -0.19486695528030396, 0.0008804689277894795, -0.18807920813560486, 0.13329090178012848, 0.10194965451955795, 0.11309730261564255, 0.2090551257133484, 0.07632901519536972, -0.17513759434223175, 0.053030095994472504, -0.052062295377254486, -0.024393141269683838, 0.17564861476421356, -0.10490615665912628, 0.06051918864250183, 0.07719157636165619, 0.00010038646723842248, 0.014809741638600826, 0.2645825147628784, 0.010496622882783413, 0.043604664504528046, 0.11695817857980728, 0.18813025951385498, -0.02419338747859001, 0.19531390070915222, 0.10198120027780533, 0.03476477414369583, 0.19729425013065338, -0.2088090032339096, 0.08767088502645493, -0.013847757130861282, 0.15709935128688812, 0.23742687702178955, -0.01250639371573925, 0.11528065800666809, 0.19581787288188934, -0.1301369071006775, -0.07344453036785126, 0.20823338627815247, -0.9620688

In [36]:
# Tokenize seq4 into PyTorch tensors and place them on the same device as the model.
inputs = tokenizer_protein(
    seq4,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(device)

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model_protein(**inputs)

# Mean-pool the hidden states over dim 1 after dropping the first and last positions
# (presumably the tokenizer's start/end special tokens - confirm), then convert the
# result to a nested Python list.
embeddings = (
    outputs.last_hidden_state[:, 1:-1, :]
    .mean(dim=1)
    .cpu()
    .numpy()
    .tolist()
)

print(embeddings)
print(len(embeddings[0]))

# Persist the single pooled vector (first and only row) next to the input data.
np.save('../data/proteins/8QYR_MYH7_embeddings.npy', embeddings[0])

[[0.05574580654501915, -0.04580986127257347, 0.1398656815290451, 0.10241352766752243, 0.15663672983646393, -0.02668864093720913, 0.049600400030612946, -0.14675508439540863, -0.06670176982879639, -0.17593850195407867, 0.061651550233364105, 0.10324995219707489, -0.08150521665811539, 0.13081581890583038, -0.0204180721193552, -0.15246888995170593, 0.08469201624393463, -0.016874069347977638, 0.08104796707630157, 0.21055012941360474, 0.04898565635085106, 0.0775671973824501, 0.026979777961969376, -0.08391032367944717, 0.08123581111431122, 0.19704313576221466, -0.11263865232467651, 0.025544488802552223, 0.10100516676902771, 0.2140166014432907, -0.15753021836280823, 0.1983373463153839, 0.07847511023283005, -0.16112825274467468, 0.10068850219249725, -0.19216962158679962, 0.028817737475037575, -0.08583217114210129, 0.25300517678260803, 0.1776425689458847, 0.07266074419021606, 0.13972468674182892, -0.09178066998720169, 0.01237425021827221, 0.17227321863174438, 0.21729616820812225, -1.2048189640045