In [2]:
import esm
import torch
import time
import gc
import os
%run "../scripts/data_processing.py"
%run "../scripts/node_edge_generation.py"
%run "../scripts/graph_functions.py"

#Data path
datadir = os.path.abspath("../../Data/")

### Data retrieval

In [5]:
#Start graph
startGraph("./pass_ent.txt", "./Results/")

#Submit query, retrieve sequences with direct cutaneous leishmaniasis associations
uri = "bolt://localhost:7687" 
driver = GraphDatabase.driver(uri)

query_1 = [f'match (p:Protein)-[di:direct_evidence]-(d:Disease)\n \
            where  d.name contains "cutaneous Leishmaniasis" \
            return p.uniprot as Uniprot, p.name as Prot_nam, toFloat(di.score) as Score, p.seq as Sequence order by Score desc']

results_1 = run_query(query_1[0], driver)
display(results_1)

#Close connection
driver.close()

#Create subset with smaller sequences
smallseqs = results_1.loc[results_1["Sequence"].str.len() <= 1500]

Graph stopped
Database import successful!
Graph started


Unnamed: 0,Uniprot,Prot_nam,Score,Sequence
0,P01611,Immunoglobulin kappa variable 1D-12,0.586821,MDMMVPAQLLGLLLLWFPGSRCDIQMTQSPSSVSASVGDRVTITCR...
1,P01709,Immunoglobulin lambda variable 2-8,0.586821,MAWALLLLTLLTQGTGSWAQSALTQPPSASGSPGQSVTISCTGTSS...
2,P01814,Immunoglobulin heavy variable 2-70,0.586821,MDILCSTLLLLTVPSWVLSQVTLRESGPALVKPTQTLTLTCTFSGF...
3,P01768,Immunoglobulin heavy variable 3-30,0.586821,MEFGLSWVFLVALLRGVQCQVQLVESGGGVVQPGRSLRLSCAASGF...
4,A2NJV5,Immunoglobulin kappa variable 2-29,0.586821,MRLPAQLLGLLMLWIPGSSADIVMTQTPLSLSVTPGQPASISCKSS...
...,...,...,...,...
244,P11912,B-cell antigen receptor complex-associated pro...,0.001478,MPGGPGVLQALPATIFLLFLLSAVYLGPGCQALWMHKVPASLMVSL...
245,P27361,Mitogen-activated protein kinase 3,0.001478,MAAAAAQGGGGGEPRRTEGVGPGVPGEVEMVKGQPFDVGPRYTQLQ...
246,P29965,CD40 ligand,0.001478,MIETYNQTSPRSAATGLPISMKIFMYLLTVFLITQMIGSALFAVYL...
247,P35228,"Nitric oxide synthase, inducible",0.001478,MACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQY...


### Data preparation

In [4]:
#Reset gpu resource usage
torch.cuda.empty_cache()
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

querydf = smallseqs

# Load ESM-2 model
#https://github.com/facebookresearch/esm#available-models
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()

#Split to test + train datasets
from sklearn.model_selection import train_test_split
print(len(smallseqs["Sequence"].values), len(smallseqs["Uniprot"].values))
train_sequences, test_sequences, train_labels, test_labels = train_test_split(smallseqs["Sequence"].values, smallseqs["Uniprot"].values, test_size=0.25, shuffle=True)

#Prepare sequences for tokenisation
trainlist = []
for i, j in zip(train_labels, train_sequences):
    trainlist.append((i, j))
testlist = []
for i, j in zip(test_labels, test_sequences):
    testlist.append((i, j))

#Check test/train list match Uniprot <> seq labels
res = []
for i in trainlist+testlist:
    res.append(smallseqs.loc[smallseqs["Uniprot"] == i[0]]["Sequence"].values[0] == i[1])
if False in res:
    print("Mismatch!")
else:
    print("All matched!")

Allocated memory: 0.00 MB
Cached memory: 0.00 MB
243 243
All matched!


### Sequence embedding

In [37]:
#CPU - single sequence save
def EmbeddingGeneration(inputlist, outdir, model):
    embdic = {}
    baduniprot = []
    
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    
    with torch.no_grad():
        #Load model to gpu
        if torch.cuda.is_available():
            model = model.cuda()
            
        batch_labels = None 
        batch_strs   = None
        batch_tokens = None
        
        start = time.time()
        #Tokenise + pad sequences
        batch_labels, batch_strs, batch_tokens = batch_converter(inputlist)
        #Dictionary for matching Uniprot <> Token
        comdic = dict(zip(batch_labels, batch_tokens))
        truestart = time.time()
        #Iterate over chunks of seqs
        for ent in range(len(batch_tokens)):
            start = time.time()
            #Clear GPU memory
            out          = None
            batch_subset = None
            gc.collect()
            torch.cuda.empty_cache()
            #if os.path.exists(outdir+batch_labels[ent]+".pt") == False:
            try:
                batch_subset = batch_tokens[ent].to(device="cuda", non_blocking=True)
                out = model(batch_subset.unsqueeze(0), repr_layers=[33], return_contacts=False)["representations"][33]
                out = out.cpu()
                torch.save(out, outdir+batch_labels[ent]+".pt")
                embdic.setdefault(batch_labels[ent], out)
            except:
                print(f"\nIssue! = {chunk, batch_labels[ent]}")
                print(f"Pre - Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                print(f"Pre - Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
                baduniprot.append(batch_labels[ent])
          #  else:
                print(f"Exists - {batch_labels[ent]+".pt"}")
            print(f"Time taken (Embedding (batch {ent})) = {round(time.time()-start, 2)} seconds \t| Total ({round(time.time()-truestart, 2)})")

    return embdic, baduniprot

testdic, baduniprot = EmbeddingGeneration(testlist, datadir+"/Leishmaniasis_embed/test/", model)
traindic, baduniprot = EmbeddingGeneration(trainlist, datadir+"/Leishmaniasis_embed/train/", model)

for i in testdic:
    testdic[i] = [float(round(smallseqs.loc[smallseqs["Uniprot"] == i]["Score"].values[0], 3)), testdic[i]]

for i in traindic:
    traindic[i] = [float(round(smallseqs.loc[smallseqs["Uniprot"] == i]["Score"].values[0], 3)), traindic[i]]

print(len(testdic), len(traindic))

Time taken (Embedding (batch 0)) = 0.79 seconds 	| Total (0.79)
Time taken (Embedding (batch 1)) = 0.77 seconds 	| Total (1.56)
Time taken (Embedding (batch 2)) = 0.75 seconds 	| Total (2.31)
Time taken (Embedding (batch 3)) = 0.75 seconds 	| Total (3.06)
Time taken (Embedding (batch 4)) = 0.75 seconds 	| Total (3.82)
Time taken (Embedding (batch 5)) = 0.75 seconds 	| Total (4.57)
Time taken (Embedding (batch 6)) = 0.76 seconds 	| Total (5.32)
Time taken (Embedding (batch 7)) = 0.79 seconds 	| Total (6.11)
Time taken (Embedding (batch 8)) = 0.75 seconds 	| Total (6.86)
Time taken (Embedding (batch 9)) = 0.75 seconds 	| Total (7.61)
Time taken (Embedding (batch 10)) = 0.78 seconds 	| Total (8.39)
Time taken (Embedding (batch 11)) = 0.76 seconds 	| Total (9.14)
Time taken (Embedding (batch 12)) = 0.75 seconds 	| Total (9.89)
Time taken (Embedding (batch 13)) = 0.75 seconds 	| Total (10.64)
Time taken (Embedding (batch 14)) = 0.76 seconds 	| Total (11.4)
Time taken (Embedding (batch 15)) 

In [36]:
float(np.float64(0.001))

0.001

In [38]:
print(traindic["Q8N8Y2"])

[0.001, tensor([[[ 0.0351, -0.0081,  0.0275,  ..., -0.1981,  0.0471, -0.0036],
         [ 0.1278, -0.0170, -0.0547,  ...,  0.2933,  0.0029,  0.1559],
         [ 0.0441, -0.1781,  0.1449,  ...,  0.1955,  0.2921,  0.3168],
         ...,
         [ 0.0350,  0.1062,  0.0308,  ..., -0.2161,  0.1026,  0.0445],
         [ 0.0320,  0.1028,  0.0235,  ..., -0.2155,  0.0947,  0.0249],
         [ 0.0240,  0.0859,  0.0280,  ..., -0.2171,  0.1003,  0.0267]]])]


In [43]:
allflot = []
for i in traindic:
    if traindic[i][0] not in allflot:
        allflot.append(traindic[i][0])

print(len(allflot))

23


### ML analysis

In [39]:
import torch.nn as nn

class SequenceClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(SequenceClassifier, self).__init__()
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        #print(x)
        _, (hidden, _) = self.rnn(x) 
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [44]:
#Initialize Model, Loss, and Optimizer
embedding_dim = 1280
hidden_dim = 512
num_classes = 23
model = SequenceClassifier(embedding_dim, hidden_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

start = time.time()
#Training Loop with Debug Prints
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    for i in traindic:
        labels, embeddings = traindic[i]
        labels = torch.tensor([labels])
        optimizer.zero_grad()   
        outputs = model(embeddings)  # Forward pass
        loss = criterion(outputs, labels)
        print(f"{i} - Loss: {loss.item()}")  # Print loss
        loss.backward()
        optimizer.step()

print(f"Time taken (Load embeddings) = {round(time.time()-start, 2)} seconds")

Epoch 1/5


RuntimeError: expected scalar type Long but found Float

In [31]:
testdic

{'P01303': [np.float64(0.002),
  tensor([[[ 0.0519,  0.0064,  0.0787,  ..., -0.2606,  0.1738, -0.0135],
           [-0.0059,  0.0888, -0.0782,  ...,  0.1215, -0.0497, -0.0010],
           [-0.1509,  0.1993,  0.2442,  ...,  0.0825,  0.2186, -0.0241],
           ...,
           [ 0.0366, -0.0666,  0.1165,  ..., -0.2293,  0.1767,  0.0809],
           [ 0.0415, -0.0719,  0.1119,  ..., -0.2167,  0.1765,  0.0868],
           [ 0.0560, -0.0870,  0.1015,  ..., -0.1909,  0.2290,  0.1087]]])],
 'P01825': [np.float64(0.587),
  tensor([[[ 0.0619, -0.0027,  0.0128,  ..., -0.2464,  0.1543,  0.0157],
           [ 0.0181,  0.1384, -0.0895,  ...,  0.1380,  0.0264,  0.1064],
           [-0.1832,  0.1850, -0.2596,  ...,  0.0671,  0.0291, -0.1657],
           ...,
           [-0.1874, -0.4130, -0.0949,  ..., -0.2987, -0.0400,  0.0886],
           [-0.1365, -0.3596,  0.0068,  ..., -0.3444, -0.0401,  0.0616],
           [-0.0426, -0.2834,  0.0861,  ..., -0.1982, -0.0037,  0.1315]]])],
 'P01601': [np.float64