In [3]:
from pyteomics import fasta
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import json
import os

In [4]:
#Data path
datadir = os.path.abspath("../../Data/")

# Data prep

In [6]:
import esm
import torch
import time
import gc
import os
%run "../scripts/data_processing.py"
%run "../scripts/node_edge_generation.py"
%run "../scripts/graph_functions.py"

#Data path
datadir = os.path.abspath("../../Data/")

In [7]:
#Start graph
startGraph("./pass_ent.txt", "./Results/")

#Submit query, retrieve sequences
uri = "bolt://localhost:7687" 
driver = GraphDatabase.driver(uri)
query_1 = [f'match (p:Protein)\n \
            where not p.subcellular_location contains "[]" \
            return p.uniprot as Uniprot, p.subcellular_location as Subcellular_location, p.seq as Sequence']
results_1 = run_query(query_1[0], driver)
display(results_1)

#Close connection
driver.close()

#Create subset with smaller sequences
smallseqs = results_1.loc[results_1["Sequence"].str.len() <= 1500]

Graph stopped
Database import successful!
Graph started


Unnamed: 0,Uniprot,Subcellular_location,Sequence
0,A2RRL7,[['membrane ; single-pass type i membrane prot...,MQRLPAATRATLILSLAFASLHSACSAEASSSNSSSLTAHHPDPGT...
1,A8MYZ6,"[['cytoplasm', 'nucleus']]",MAAKLRAHQVDVDPDFAPQSRPRSCTWPLPQPDLAGDEDGALGAGV...
2,O14804,[['cell membrane ; multi-pass membrane protein']],MRAVFIQGAEEHPAAFCYQVNGSCPRTVHTLGIQLVIYLACAAGML...
3,O75459,"[['nucleoli fibrillar center', 'mitochondria',...",MGFLRRLIYRRRPMIYVESSEESSDEQPDEVESPTQSQDSTPAEER...
4,O75912,"[['cell projection, axon', 'cell projection, d...",MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...
...,...,...,...
18417,Q02818,"[['golgi apparatus, cis-golgi network membrane...",MPPSGPRGTLLLLPLLLLLLLRAVLAVPLERGAPNKEETPATESPD...
18418,Q08752,"[['cytoplasm', 'nucleus, nucleolus', 'nucleus,...",MSHPSPQAKPSNPSNPRVFFDVDIGGERVGRIVLELFADIVPKTAE...
18419,Q13445,[['cell membrane ; single-pass type i membrane...,MMAAGAALALALWLLMPPVEVGGAGPPPIQDGEFTFLLPAGRKQCF...
18420,Q13889,"[['nucleus', 'nucleoplasm']]",MVSDEDELNLLVIVVDANPIWWGKQALKESQFTLSKCIDAVMVLGN...


In [8]:
#Subcellular localisation annotations
uniloc = {"Intracellular": [], "Extracellular": [], "Membrane": []}

for index, row in smallseqs.iterrows():
    if any(loc in row["Subcellular_location"] for loc in ["secreted", "secret", "extracellular"]):
        uniloc["Extracellular"].append(row["Uniprot"])
    else:
        if any(loc in row["Subcellular_location"] for loc in ["membrane", "wall", "lipid"]):
            uniloc["Membrane"].append(row["Uniprot"])
        else:
            uniloc["Intracellular"].append(row["Uniprot"])

#Remove duplicates
uniloc["Intracellular"] = list(set(uniloc["Intracellular"]))
uniloc["Extracellular"] = list(set(uniloc["Extracellular"]))
uniloc["Membrane"] = list(set(uniloc["Membrane"]))

#Add broad loc categories to df
smallseqs["Location"] = ""
for index, row in smallseqs.iterrows():
    for i in uniloc:
        if row["Uniprot"] in uniloc[i]:
            smallseqs.loc[index, "Location"] = i

for i in uniloc:
    print(i, len(set(uniloc[i])))    

print("\n", smallseqs.shape, len(uniloc["Intracellular"])+len(uniloc["Membrane"])+len(uniloc["Extracellular"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smallseqs["Location"] = ""


Intracellular 8203
Extracellular 2077
Membrane 7241

 (17521, 4) 17521


In [37]:
#Reset gpu resource usage
torch.cuda.empty_cache()
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

Allocated memory: 2927.60 MB
Cached memory: 3008.00 MB


In [18]:
querydf = smallseqs

# Load ESM-2 model
#https://github.com/facebookresearch/esm#available-models
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
#model, alphabet = esm.pretrained.esm2_t30_150M_UR50D() #smaller model
batch_converter = alphabet.get_batch_converter()

In [4]:
#Split to test + train datasets
from sklearn.model_selection import train_test_split
print(len(smallseqs["Sequence"].values), len(smallseqs["Uniprot"].values))
train_sequences, test_sequences, train_labels, test_labels = train_test_split(smallseqs["Sequence"].values, smallseqs["Uniprot"].values, test_size=0.25, shuffle=True)

#Prepare sequences for tokenisation
trainlist = []
for i, j in zip(train_labels, train_sequences):
    trainlist.append((i, j))
testlist = []
for i, j in zip(test_labels, test_sequences):
    testlist.append((i, j))

#Check test/train list match Uniprot <> seq labels
res = []
for i in trainlist+testlist:
    res.append(smallseqs.loc[smallseqs["Uniprot"] == i[0]]["Sequence"].values[0] == i[1])
if False in res:
    print("Mismatch!")
else:
    print("All matched!")

17521 17521
All matched!


In [37]:
#Batch submission
outdic = {}
embdic = {}
baduniprot = []

outdir = datadir+"/Seq_embeddings/"

if not os.path.exists(outdir):
    os.makedirs(outdir)

with torch.no_grad():
    #Load model to gpu
    if torch.cuda.is_available():
        model = model.cuda()
        
    batch_labels = None 
    batch_strs   = None
    batch_tokens = None
    
    start = time.time()
    #Tokenise + pad sequences
    batch_labels, batch_strs, batch_tokens = batch_converter(trainlist)
    
    #Dictionary for matching Uniprot <> Token
    comdic = dict(zip(batch_labels, batch_tokens))

    #Check output folder for precomputed embeddings
    #for i in batch_labels:
    #    if os.path.exists(f'{outdir+i}.pt') == True:
    #        print(comdic[i], batch_tokens)
            
            #print(i, os.path.exists(f'{outdir+i}.pt'), comdic[i])
    #        batch_labels.remove(i)
    #        batch_tokens.remove(comdic[i])
    #print(f"Time taken (Tokenisation) = {round(time.time()-start, 2)} seconds")
    
    #Split to managable batch 
    batch_size = 3
    #chunks = np.array_split(range(len(batch_tokens)), len(batch_tokens)/batch_size)
    num = 50
    chunks = np.array_split(range(num), num/batch_size)

    truestart = time.time()
    #Iterate over chunks of seqs
    for chunk in chunks:
        start = time.time()
        #Clear GPU memory
        out          = None
        batch_subset = None
        gc.collect()
        torch.cuda.empty_cache()
       # try:
        batch_subset = batch_tokens[chunk[0]:chunk[-1]].to(device="cuda", non_blocking=True)
        out = model(batch_subset, repr_layers=[33], return_contacts=False)["representations"][33]
        out = out.cpu()
        for i, j in zip(batch_labels[chunk[0]:chunk[-1]], out):
            torch.save(j, outdir+i+".pt")
      #  except:
        #    print(f"\nIssue! = {chunk, batch_labels[chunk[0]:chunk[-1]]}")
        #    print(f"Pre - Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        #    print(f"Pre - Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        #    baduniprot.append(batch_labels[chunk[0]:chunk[-1]])
        
        print(f"Time taken (Embedding (batch {chunk[0]}-{chunk[-1]})) = {round(time.time()-start, 2)} seconds | Total ({round(time.time()-truestart, 2)})")

Time taken (Embedding (batch 0-3)) = 3.49 seconds | Total (3.49)
Time taken (Embedding (batch 4-7)) = 3.25 seconds | Total (6.74)
Time taken (Embedding (batch 8-10)) = 2.01 seconds | Total (8.75)
Time taken (Embedding (batch 11-13)) = 2.04 seconds | Total (10.79)
Time taken (Embedding (batch 14-16)) = 1.95 seconds | Total (12.74)
Time taken (Embedding (batch 17-19)) = 1.93 seconds | Total (14.67)
Time taken (Embedding (batch 20-22)) = 1.94 seconds | Total (16.62)
Time taken (Embedding (batch 23-25)) = 1.93 seconds | Total (18.55)
Time taken (Embedding (batch 26-28)) = 1.94 seconds | Total (20.5)
Time taken (Embedding (batch 29-31)) = 1.94 seconds | Total (22.44)
Time taken (Embedding (batch 32-34)) = 1.92 seconds | Total (24.36)
Time taken (Embedding (batch 35-37)) = 1.94 seconds | Total (26.3)
Time taken (Embedding (batch 38-40)) = 1.93 seconds | Total (28.23)
Time taken (Embedding (batch 41-43)) = 1.99 seconds | Total (30.23)
Time taken (Embedding (batch 44-46)) = 1.94 seconds | Tota

In [19]:
#CPU - single sequence save


#outdir = datadir+"/Seq_embeddings/"
#outdir = datadir+"/Seq_embeddings_2/"

def EmbeddingGeneration(inputlist, outdir, model):
    embdic = {}
    baduniprot = []
    
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    
    #Save list of uniprots for test / train set
   # if not os.path.exists(outdir+"train_labels.csv"):
    #    tmpdf = pd.DataFrame(train_labels) 
   #     tmpdf.to_csv(outdir+"train_labels.csv")
   # if not os.path.exists(outdir+"test_labels.csv"):
   #     tmpdf = pd.DataFrame(test_labels) 
   #     tmpdf.to_csv(outdir+"test_labels.csv")
    
    with torch.no_grad():
        #Load model to gpu
        if torch.cuda.is_available():
            model = model.cuda()
            
        batch_labels = None 
        batch_strs   = None
        batch_tokens = None
        
        start = time.time()
        #Tokenise + pad sequences
        batch_labels, batch_strs, batch_tokens = batch_converter(inputlist)
        #Dictionary for matching Uniprot <> Token
        comdic = dict(zip(batch_labels, batch_tokens))
    
        truestart = time.time()
        #Iterate over chunks of seqs
        for ent in range(len(batch_tokens)):
            start = time.time()
            #Clear GPU memory
            out          = None
            batch_subset = None
            gc.collect()
            torch.cuda.empty_cache()
            if os.path.exists(outdir+batch_labels[ent]+".pt") == False:
                try:
                    batch_subset = batch_tokens[ent].to(device="cuda", non_blocking=True)
                    out = model(batch_subset.unsqueeze(0), repr_layers=[33], return_contacts=False)["representations"][33]
                    out = out.cpu()
                    torch.save(out, outdir+batch_labels[ent]+".pt")
                except:
                    print(f"\nIssue! = {chunk, batch_labels[ent]}")
                    print(f"Pre - Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                    print(f"Pre - Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
                    baduniprot.append(batch_labels[ent])
            else:
                print(f"Exists - {batch_labels[ent]+".pt"}")
            print(f"Time taken (Embedding (batch {ent})) = {round(time.time()-start, 2)} seconds \t| Total ({round(time.time()-truestart, 2)})")

    return embdic, baduniprot

testdic, baduniprot = EmbeddingGeneration(testlist, datadir+"/Seq_embed_test/", model)

Time taken (Embedding (batch 0)) = 1.09 seconds 	| Total (1.09)
Time taken (Embedding (batch 1)) = 1.08 seconds 	| Total (2.17)
Time taken (Embedding (batch 2)) = 1.08 seconds 	| Total (3.24)
Time taken (Embedding (batch 3)) = 1.08 seconds 	| Total (4.32)
Time taken (Embedding (batch 4)) = 1.09 seconds 	| Total (5.41)
Time taken (Embedding (batch 5)) = 1.08 seconds 	| Total (6.49)
Time taken (Embedding (batch 6)) = 1.09 seconds 	| Total (7.58)
Time taken (Embedding (batch 7)) = 1.09 seconds 	| Total (8.67)
Time taken (Embedding (batch 8)) = 1.08 seconds 	| Total (9.75)
Time taken (Embedding (batch 9)) = 1.09 seconds 	| Total (10.84)
Time taken (Embedding (batch 10)) = 1.11 seconds 	| Total (11.95)
Time taken (Embedding (batch 11)) = 1.08 seconds 	| Total (13.02)
Time taken (Embedding (batch 12)) = 1.1 seconds 	| Total (14.12)
Time taken (Embedding (batch 13)) = 1.1 seconds 	| Total (15.22)
Time taken (Embedding (batch 14)) = 1.08 seconds 	| Total (16.31)
Time taken (Embedding (batch 15

In [63]:
print(os.path.exists(outdir+batch_labels[ent]+".pt") == False)
print(batch_labels[ent])

True
Q9C0E2


In [8]:
print(batch_subset, batch_subset.ndim)
print(batch_subset.unsqueeze(0))

NameError: name 'batch_subset' is not defined

### Load data

In [10]:
embdir  = datadir+"/Seq_embeddings/"
testdir = datadir+"/Seq_embed_test/"
numlab  = {"Intracellular": 0, "Membrane": 1, "Extracellular": 2}


def loadEmbed(embdir, labs, maindf, limit):
    embdic = {}
    limit = limit
    count = 0
    for path, dirs, files in os.walk(embdir):
        for file in files:
            if file[-2:] == "pt" and count < limit:
                count += 1
                gc.collect()
                torch.cuda.empty_cache()
                lab = labs[maindf.loc[maindf["Uniprot"] == file[:-3]]["Location"].values[0]]
                print(file, f"\t|\t{count}")
                embdic.setdefault(file[:-3],  [lab, torch.load(embdir+file)])
    return embdic

start = time.time()
embdic = loadEmbed(embdir, numlab, smallseqs, 40)#, 4000)
print(f"Time taken (Load train embeddings) = {round(time.time()-start, 2)} seconds")
start = time.time()
testdic = loadEmbed(testdir, numlab, smallseqs, 20)#, 200)
print(f"Time taken (Load train embeddings) = {round(time.time()-start, 2)} seconds")

Q8N5W9.pt 	|	1
O14529.pt 	|	2
O15321.pt 	|	3
P35790.pt 	|	4
P13686.pt 	|	5


  embdic.setdefault(file[:-3],  [lab, torch.load(embdir+file)])


Q5QP82.pt 	|	6
Q9NU53.pt 	|	7
P26717.pt 	|	8
Q9Y573.pt 	|	9
P78556.pt 	|	10
P21580.pt 	|	11
Q7RTW8.pt 	|	12
Q8IUX7.pt 	|	13
Q9NWR8.pt 	|	14
Q8WUP2.pt 	|	15
Q9BZI1.pt 	|	16
P58511.pt 	|	17
Q08881.pt 	|	18
Q9Y5W5.pt 	|	19
Q7Z6J8.pt 	|	20
O60404.pt 	|	21
Q9NQZ5.pt 	|	22
Q969X2.pt 	|	23
Q6UXI9.pt 	|	24
Q8TE23.pt 	|	25
Q9Y2E6.pt 	|	26
A0A075B6I3.pt 	|	27
O95696.pt 	|	28
Q86V85.pt 	|	29
Q8WVP7.pt 	|	30
O43405.pt 	|	31
A0A087WTH1.pt 	|	32
Q7Z6J4.pt 	|	33
A6NHN6.pt 	|	34
Q9NYB5.pt 	|	35
Q9NYZ1.pt 	|	36
Q96DL1.pt 	|	37
Q96JX3.pt 	|	38
P0CG21.pt 	|	39
Q9BYX2.pt 	|	40
Time taken (Load train embeddings) = 1.83 seconds
Q8NGC4.pt 	|	1
Q9H4B4.pt 	|	2
O95716.pt 	|	3
Q9UIC8.pt 	|	4
Q9UMQ3.pt 	|	5
P01705.pt 	|	6
O75795.pt 	|	7
Q9UNW8.pt 	|	8
Q96ND0.pt 	|	9
Q494V2.pt 	|	10
O95992.pt 	|	11
Q9UI26.pt 	|	12
Q9NUQ9.pt 	|	13
Q1ED39.pt 	|	14
O60294.pt 	|	15
H7C350.pt 	|	16
Q16619.pt 	|	17
Q5T6X5.pt 	|	18
Q9BV87.pt 	|	19
A1A4G5.pt 	|	20
Time taken (Load train embeddings) = 13.03 seconds


In [12]:
embdic.keys()
embdic['O14529']

[0,
 tensor([[[ 0.0396,  0.0261,  0.0886,  ..., -0.2584,  0.1620,  0.0385],
          [ 0.0317, -0.0557, -0.0805,  ..., -0.0737, -0.0290,  0.0761],
          [-0.1347,  0.2843, -0.1159,  ..., -0.0779,  0.2602,  0.1921],
          ...,
          [-0.0603,  0.0413,  0.0062,  ..., -0.0508,  0.1348, -0.0044],
          [-0.0685,  0.0388, -0.0077,  ..., -0.0398,  0.1560, -0.0023],
          [-0.0468,  0.0337,  0.0079,  ..., -0.0521,  0.1448, -0.0278]]])]

In [44]:
import torch.nn as nn

class SequenceClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(SequenceClassifier, self).__init__()
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        #print(x)
        _, (hidden, _) = self.rnn(x)  # hidden: [1, batch_size, hidden_dim]
        hidden = hidden[-1]#.squeeze(0)    # Remove sequence dimension
        output = self.fc(hidden)
        return output


In [50]:
#Initialize Model, Loss, and Optimizer
embedding_dim = 1280
hidden_dim = 512
num_classes = 3
model = SequenceClassifier(embedding_dim, hidden_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

start = time.time()
#Training Loop with Debug Prints
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    for i in embdic:
        labels, embeddings = embdic[i]
        labels = torch.tensor([labels])
        optimizer.zero_grad()   
        outputs = model(embeddings)  # Forward pass
        loss = criterion(outputs, labels)
        print(f"{i} - Loss: {loss.item()}")  # Print loss
        loss.backward()
        optimizer.step()

print(f"Time taken (Load embeddings) = {round(time.time()-start, 2)} seconds")

Epoch 1/5
Q8N5W9 - Loss: 1.2564401626586914
O14529 - Loss: 1.3772069215774536
O15321 - Loss: 0.6901451349258423
P35790 - Loss: 0.8809043169021606
P13686 - Loss: 1.034956455230713
Q5QP82 - Loss: 0.5012123584747314
Q9NU53 - Loss: 0.9582467675209045
P26717 - Loss: 1.3644249439239502
Q9Y573 - Loss: 0.31995904445648193
P78556 - Loss: 2.959141254425049
P21580 - Loss: 0.12439962476491928
Q7RTW8 - Loss: 3.370375633239746
Q8IUX7 - Loss: 2.9179482460021973
Q9NWR8 - Loss: 1.2856578826904297
Q8WUP2 - Loss: 0.5502856969833374
Q9BZI1 - Loss: 0.7711750864982605
P58511 - Loss: 1.142417073249817
Q08881 - Loss: 0.9916801452636719
Q9Y5W5 - Loss: 1.5158343315124512
Q7Z6J8 - Loss: 0.3830983340740204
O60404 - Loss: 1.063103199005127
Q9NQZ5 - Loss: 0.7091901898384094
Q969X2 - Loss: 1.0196424722671509
Q6UXI9 - Loss: 1.1442625522613525
Q8TE23 - Loss: 0.8505480885505676
Q9Y2E6 - Loss: 1.188887357711792
A0A075B6I3 - Loss: 0.8265051245689392
O95696 - Loss: 1.233799695968628
Q86V85 - Loss: 1.0299988985061646
Q8WVP

In [53]:
subnum = {0: "Intracellular", 1: "Membrane", 2: "Extracellular"}

model.eval()
good = 0
bad = 0
start = time.time()
with torch.no_grad():
    for embd in testdic:
        test_embedding = testdic[embd][1]#.transpose(0, 1).unsqueeze(0)
        prediction = model(test_embedding)
        predicted_index = torch.argmax(prediction, dim=1).item()
        if subnum[predicted_index] == smallseqs.loc[smallseqs["Uniprot"] == embd]["Location"].values:
            good += 1
        else:
            bad +=1
        print(f"Prediction logits: {prediction}")
        print(f"Predicted subcellular localization for {embd}: {subnum[predicted_index]} \t| True = {smallseqs.loc[smallseqs["Uniprot"] == embd]["Location"].values[0]}")
        print(f"Correct rate = {good/(good+bad)*100}%")
print("\n", good, bad, f"Accuracy = {good/(good+bad)*100}%")
print(f"Time taken (Test model) = {round(time.time()-start, 2)} seconds")

Prediction logits: tensor([[-6.2714,  5.7574, -4.2724]])
Predicted subcellular localization for Q8NGC4: Membrane | True = Membrane
Correct rate = 100.0%
Prediction logits: tensor([[ 0.7827, -0.0549, -2.7895]])
Predicted subcellular localization for Q9H4B4: Intracellular | True = Intracellular
Correct rate = 100.0%
Prediction logits: tensor([[-1.4545,  1.3250, -1.3434]])
Predicted subcellular localization for O95716: Membrane | True = Membrane
Correct rate = 100.0%
Prediction logits: tensor([[ 1.3521, -0.2147, -2.9878]])
Predicted subcellular localization for Q9UIC8: Intracellular | True = Intracellular
Correct rate = 100.0%
Prediction logits: tensor([[ 2.6151, -0.9609, -4.1866]])
Predicted subcellular localization for Q9UMQ3: Intracellular | True = Intracellular
Correct rate = 100.0%
Prediction logits: tensor([[-2.9832,  1.4086,  0.5296]])
Predicted subcellular localization for P01705: Membrane | True = Extracellular
Correct rate = 83.33333333333334%
Prediction logits: tensor([[-4.4693

In [40]:
subnum = {0: "Intracellular", 1: "Membrane", 2: "Extracellular"}
count = 1
good = 0
bad = 0


with torch.no_grad():
    for index, row in unusedf[0:5].iterrows():
        model.eval()
        count += 1
        test_file = embdir+row["Uniprot"]+".pt"
        #test_path = os.path.join(embedding_dir, test_file)
        test_embedding = torch.load(test_file)#.unsqueeze(0)  # Add batch dimension
        test_embedding = test_embedding.transpose(0, 1).unsqueeze(0)
        print(test_embedding)
        
       # print(f"\nTest input shape: {test_embedding.shape}")
        prediction = model(test_embedding)
        predicted_index = torch.argmax(prediction, dim=1).item()
       # predicted_label = dataset.index_to_label[predicted_index]
        print(f"\nCur prot = {row["Uniprot"]} |\t {count} |\t {test_file}")
        print(f"Prediction logits: {prediction}")
        print(f"Predicted subcellular localization: {subnum[predicted_index]}")
        if subnum[predicted_index] == row["Location"]:
            good += 1
        else:
            bad +=1
print(f"Total = {unusedf.shape} \n\tGood = {good} \n\tBad = {bad}")

  test_embedding = torch.load(test_file)#.unsqueeze(0)  # Add batch dimension


tensor([[[ 0.0078,  0.0646,  0.0008,  ..., -0.0576,  0.0094, -0.0313],
         [-0.0509, -0.1000,  0.1732,  ..., -0.1109, -0.0745, -0.0571],
         [ 0.0310, -0.1329, -0.0325,  ..., -0.0802, -0.1086, -0.1992],
         ...,
         [-0.2478,  0.1345,  0.0648,  ...,  0.0488, -0.0058, -0.0426],
         [ 0.1100, -0.1011,  0.0558,  ..., -0.0176, -0.0196, -0.0504],
         [ 0.0149,  0.1096, -0.0737,  ..., -0.1707, -0.1174, -0.0382]]])

Cur prot = A2RRL7 |	 2 |	 /home/louie/Projects/Personal/GraphDB/Data/Seq_embeddings/A2RRL7.pt
Prediction logits: tensor([[ 0.5989,  0.4953, -0.9064]])
Predicted subcellular localization: Intracellular
tensor([[[ 0.0078,  0.0646,  0.0008,  ..., -0.0576,  0.0094, -0.0313],
         [-0.0509, -0.1000,  0.1732,  ..., -0.1109, -0.0745, -0.0571],
         [ 0.0310, -0.1329, -0.0325,  ..., -0.0802, -0.1086, -0.1992],
         ...,
         [-0.2478,  0.1345,  0.0648,  ...,  0.0488, -0.0058, -0.0426],
         [ 0.1100, -0.1011,  0.0558,  ..., -0.0176, -0.019

In [34]:
import torch.optim as optim

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i in embdic:
        label, embedding = embdic[i]
        print(i, label)
        optimizer.zero_grad()
        outputs = model(embedding)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Q8N5W9 1


RuntimeError: input.size(-1) must be equal to input_size. Expected 1502, got 1280

### Old code

In [54]:
count = 0
for i in batch_labels:
    if os.path.exists(f'{outdir+i}.pt') == True:
        print(i, os.path.exists(f'{outdir+i}.pt'), comdic[i])
        print(batch_tokens[comdic[i].nonzero(as_tuple=True)])

Q9UBM4 True tensor([ 0, 20, 10,  ...,  1,  1,  1])
tensor([[ 0, 20, 10,  ...,  1,  1,  1],
        [ 0, 20,  9,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        ...,
        [ 0, 20, 13,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        [ 0, 20,  8,  ...,  1,  1,  1]])
Q8N8Y5 True tensor([ 0, 20,  9,  ...,  1,  1,  1])
tensor([[ 0, 20, 10,  ...,  1,  1,  1],
        [ 0, 20,  9,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        ...,
        [ 0, 20, 13,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        [ 0, 20,  8,  ...,  1,  1,  1]])
Q86XA0 True tensor([ 0, 20, 19,  ...,  1,  1,  1])
tensor([[ 0, 20, 10,  ...,  1,  1,  1],
        [ 0, 20,  9,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        ...,
        [ 0, 20, 13,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        [ 0, 20,  8,  ...,  1,  1,  1]])
Q6ZMV9 True tensor([ 0, 20,  7,  ...,  1,  1,  1])
tensor([[ 0, 20, 10,  ...,  1,  1,

# Subcellular localisation prediction example

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
'''
uniprot_proteomics_dataset.csv

protein_id,sequence_embedding,subcellular_location
P12345,"[0.1, 0.2, 0.3, 0.4, 0.5]",cytoplasm
Q67890,"[0.5, 0.4, 0.3, 0.2, 0.1]",nucleus
A11223,"[0.2, 0.3, 0.4, 0.5, 0.6]",mitochondrion
B33445,"[0.3, 0.2, 0.1, 0.4, 0.5]",plasma membrane
C55678,"[0.6, 0.5, 0.4, 0.3, 0.2]",extracellular region
'''

In [None]:
#Step 2: Load and Preprocess the Dataset
# Load dataset
data = pd.read_csv("uniprot_proteomics_dataset.csv")

# Inspect dataset
print(data.head())

# Extract relevant features and labels
sequence_features = np.array(data["sequence_embedding"].apply(eval).tolist())  # Assuming embeddings are precomputed
labels = data["subcellular_location"]

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(sequence_features, encoded_labels, test_size=0.2, random_state=42)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [None]:
#Step 3: Define the Neural Network
class ProteinClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(ProteinClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
#Step 4: Train the Model
# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
num_classes = len(label_encoder.classes_)
model = ProteinClassifier(input_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = len(X_train_tensor) // batch_size

    for i in range(0, len(X_train_tensor), batch_size):
        # Get batch
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]
        
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/num_batches:.4f}")


In [None]:
#Step 5: Evaluate the Model
# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
#Step 6: Save and Load the Model
# Save the model
torch.save(model.state_dict(), "protein_classifier.pth")

# Load the model
loaded_model = ProteinClassifier(input_dim, num_classes)
loaded_model.load_state_dict(torch.load("protein_classifier.pth"))
loaded_model.eval()

# Tutorials

## Resources

In [None]:
Git
#https://github.com/liyu95/Deep_learning_examples/tree/master
#https://github.com/Moeinh77/blog_posts_codes/blob/main/pfam_esm.ipynb

Papers
#https://www.nature.com/articles/s41598-022-12201-9 | Prediction of protein–protein interaction using graph neural networks

Other
#https://medium.com/@moeinh77/protein-sequence-analyses-with-transformers-using-pytorch-and-huggingface-11a478bcc602
#https://huggingface.co/blog/AmelieSchreiber/protein-binding-partners-with-esm2

## Quickstart - PyTorch

In [None]:
#https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

In [None]:
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

In [None]:
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

In [None]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

In [None]:
tensor = torch.ones(4, 4)
tensor[:,1] = 0
print(tensor)
print(torch.cat([tensor, tensor, tensor], dim=0))

In [None]:
labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()