In [1]:
import import_ipynb

#import model
%cd ..
%cd "protein-ppi-encoding-module"
from protein2vec import *

#import dataset functions
%cd ..
%cd "datasets"
from dataset_manip import *
%cd ..

%cd "training-testing"
from training_helper import * 
%cd ..

import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import time
from torch.utils.tensorboard import SummaryWriter
import random
from torch.optim import lr_scheduler
from sklearn.metrics import roc_auc_score
import pickle
from random import shuffle

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import warnings
# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the class
# lives in `np.exceptions` (available since NumPy 1.25). Resolve it from the new
# location first and fall back to the old one so the cell runs on either version.
try:
    _VisibleDeprecationWarning = np.exceptions.VisibleDeprecationWarning
except AttributeError:
    _VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=_VisibleDeprecationWarning)

C:\Users\Ieremie\Desktop\TransformerGO
C:\Users\Ieremie\Desktop\TransformerGO\protein-ppi-encoding-module
importing Jupyter notebook from protein2vec.ipynb
C:\Users\Ieremie\Desktop\TransformerGO
C:\Users\Ieremie\Desktop\TransformerGO\datasets
importing Jupyter notebook from dataset_manip.ipynb
C:\Users\Ieremie\Desktop\TransformerGO
C:\Users\Ieremie\Desktop\TransformerGO\training-testing
importing Jupyter notebook from training_helper.ipynb
C:\Users\Ieremie\Desktop\TransformerGO


In [2]:
# --- GO-term resources (paths are relative to the repository root) ---
go_id_dict_pth = "term-encoding-module/go_id_dict"
go_embed_pth = "term-encoding-module/emb/go-terms-128.emd"

# --- Interaction data: annotation file plus positive / negative pair lists ---
protein_go_anno_pth = "datasets/jains-TCSS-datasets/yeast_data/gene_association.sgd"
poz_path = "datasets/jains-TCSS-datasets/yeast_data/iea+/positives.sgd.iea.f"
# NOTE: the tail of `neg_path` is sliced further down to build the result file
# names, so changing this path changes those names as well.
neg_path = "datasets/jains-TCSS-datasets/yeast_data/iea+/negatives.sgd.iea.f"

In [None]:
# Build the train / validation / test splits plus the un-split dataset.
# NOTE(review): `shuffle` is the name bound by `from random import shuffle` in the
# import cell, i.e. the function object itself - confirm that this is what
# get_dataset_split_stringDB expects for that positional argument.
# NOTE(review): the third ratio is 0, so the test split presumably ends up empty.
train_set, valid_set, test_set, full_dataset = get_dataset_split_stringDB(
    poz_path, neg_path, protein_go_anno_pth, go_id_dict_pth, go_embed_pth,
    shuffle, ratio = [0.8, 0.2, 0], stringDB = False)

# Computed once over the whole dataset and handed to the collate function.
MAX_LEN_SEQ = get_max_len_seq(full_dataset)

def helper_collate(batch):
    """Collate `batch` with `batch_padding_collate_fn` using this notebook's fixed settings."""
    return batch_padding_collate_fn(batch, MAX_LEN_SEQ, emb_dim = 128, global_padd = False)

# Keyword arguments shared by every DataLoader created in this notebook.
params = dict(batch_size = 16, collate_fn = helper_collate)
train_grt = data.DataLoader(train_set, shuffle = True, **params)
val_grt = data.DataLoader(valid_set, shuffle = True, **params)
test_grt = data.DataLoader(test_set, shuffle = False, **params)

Rejected interactions where at least one protein has no annotation:  0
Rejected interactions where go_filter=ALL and intr_set_size_filter=[0, 500]:  0
Number of interactions: 3858
Rejected interactions where at least one protein has no annotation:  0
Rejected interactions where go_filter=ALL and intr_set_size_filter=[0, 500]:  0
Number of interactions: 3858


In [4]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` and return the epoch metrics.

    Args:
        model: network called as `model(prots_A, prots_B, len_A, len_B)`; its
            output is squeezed on dim 1 before being passed to `criterion`.
        iterator: sized iterable of batches, each indexed as
            `(padded_pairs, labels, prots_A_len, prots_B_len)`.
        optimizer: optimizer whose gradients are zeroed and which is stepped
            once per batch.
        criterion: loss called as `criterion(predictions, labels)`.

    Returns:
        `(loss, acc, roc_auc)`: loss and accuracy are the per-batch values
        summed and divided by `len(iterator)`; the ROC AUC is computed with
        `roc_auc_score` over all labels and raw model outputs of the epoch.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    pred = []
    lab = []
    for batch in iterator:
        optimizer.zero_grad()

        #batch tensor of shape N * 2(protein pair) * L(longest seq) * Emb dim
        padded_pairs = batch[0].to(device)
        labels = batch[1].to(device)
        prots_A_len = batch[2]
        prots_B_len = batch[3]
        predictions = model(padded_pairs[:,0], padded_pairs[:,1], prots_A_len, prots_B_len).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        # extend() appends in place; the previous `pred = pred + list(...)`
        # copied the whole list on every batch. detach() replaces the legacy
        # `.data` attribute for leaving the autograd graph.
        pred.extend(predictions.detach().cpu().numpy())
        lab.extend(labels.detach().cpu().numpy())

    return epoch_loss / len(iterator), epoch_acc / len(iterator), roc_auc_score(lab,pred)

In [5]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating its weights.

    Args:
        model: network called as `model(prots_A, prots_B, len_A, len_B)`; its
            output is squeezed on dim 1 before being passed to `criterion`.
        iterator: sized iterable of batches, each indexed as
            `(padded_pairs, labels, prots_A_len, prots_B_len)`.
        criterion: loss called as `criterion(predictions, labels)`.

    Returns:
        `(loss, acc, roc_auc)` - note that this is a 3-tuple. Loss and accuracy
        are the per-batch values summed and divided by `len(iterator)`; the ROC
        AUC is computed with `roc_auc_score` over all labels and raw model
        outputs seen during the pass.
    """
    epoch_loss = 0
    epoch_acc = 0
    pred = []
    lab = []

    model.eval()
    with torch.no_grad():
        for batch in iterator:

            padded_pairs = batch[0].to(device)
            labels = batch[1].to(device)
            prots_A_len = batch[2]
            prots_B_len = batch[3]

            predictions = model(padded_pairs[:,0], padded_pairs[:,1], prots_A_len, prots_B_len).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

            # extend() appends in place instead of rebuilding the list on every
            # batch; inside no_grad() nothing is tracked by autograd, so the
            # legacy `.data` access is not needed before converting to NumPy.
            pred.extend(predictions.cpu().numpy())
            lab.extend(labels.cpu().numpy())

    return epoch_loss / len(iterator), epoch_acc / len(iterator), roc_auc_score(lab,pred)


In [6]:
# --- Optimisation settings ---
N_EPOCHS, DROPOUT, LR = 50, 0.2, 0.0005

# --- Model dimensions ---
INPUT_DIM = 128  #node2vec embbedings
HIDDEN_DIM = 64
# F_1..F_4 are passed positionally to PROTEIN2VEC_SHARED; presumably the widths
# of its fully connected layers - confirm against the model definition.
F_1, F_2, F_3, F_4 = 64, 16, 8, 1

# Build the network directly on the target device, then create the optimizer
# over its parameters and the logits-based binary cross-entropy loss.
model = PROTEIN2VEC_SHARED(INPUT_DIM, HIDDEN_DIM, F_1, F_2, F_3, F_4, DROPOUT).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss().to(device)

In [7]:
# Single train/validation run: log every epoch to TensorBoard and keep the
# weights of the epoch with the lowest validation loss in 'model.pt'.
writer = SummaryWriter(flush_secs=14)
N_EPOCHS = 50
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one full train + validation pass.
    start_time = time.time()
    train_loss, train_acc, roc_train = train(model, train_grt, optimizer, criterion)
    valid_loss, valid_acc, roc_val = evaluate(model, val_grt, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only on a strict improvement of the validation loss.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(),  'model.pt')

    print_status(
        epoch, epoch_mins, epoch_secs,
        train_loss, train_acc,
        valid_loss, valid_acc,
        roc_train, roc_val, optimizer,
    )
    write_scalars_tensorboard(writer, train_loss, valid_loss, train_acc, valid_acc, epoch)
    

Epoch: 50 | Epoch Time: 0m 14s 	Train Loss: 0.227 | Train Acc: 91.16% 	 Val. Loss: 0.373 |  Val. Acc: 86.24% 	 Roc Train: 0.968 	 Roc Valid: 0.931 ,   0.0005 --LR

In [None]:
# Restore the checkpoint written by the training loop ('model.pt').
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('model.pt', map_location=device))

if len(test_grt) == 0:
    # The split above uses a test ratio of 0; evaluate() divides by
    # len(iterator), so an empty loader would raise ZeroDivisionError.
    print('Test set is empty - skipping test evaluation.')
else:
    # evaluate() returns (loss, accuracy, roc_auc); unpacking only two values
    # raised "ValueError: too many values to unpack".
    test_loss, test_acc, test_roc = evaluate(model, test_grt, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    print(f'Test ROC AUC: {test_roc:.3f}')

In [None]:
# ---- C_FOLD-fold cross-validation over `full_dataset` ----
# A fresh model / optimizer / TensorBoard writer is created for every fold; the
# best validation ROC AUC and accuracy reached in any epoch of a fold are kept.
C_FOLD = 5
N_EPOCHS = 50
DROPOUT = 0.2
LR = 0.0005
INPUT_DIM = 128  #node2vec embbedings
HIDDEN_DIM = 64
F_1 = 64
F_2 = 16
F_3 = 8
F_4 = 1

sz = len(full_dataset)
# Integer fold size: when sz is not a multiple of C_FOLD, the trailing
# sz % C_FOLD samples never land in a validation fold (they are always trained on).
fold_size = int(sz/C_FOLD)
# [l, r) is the index window of the current validation fold.
l = 0
r = fold_size
# NOTE(review): folds are contiguous index ranges; this assumes `full_dataset`
# is already in random order - confirm against get_dataset_split_stringDB.
indexes = np.arange(sz)

val_accs = []
val_rocs = []
# The next two lists are not written to anywhere in this cell.
wrong_eval = []
wrong_eval_labels = []
for i in range(0, C_FOLD):
    print("Fold nr: ", i, end='\r')

    model = PROTEIN2VEC_SHARED(INPUT_DIM, HIDDEN_DIM, F_1, F_2, F_3, F_4, DROPOUT)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.BCEWithLogitsLoss().to(device)
    writer = SummaryWriter(flush_secs=14)

    # Validation fold = window [l, r); training data = everything outside it.
    val_subset = data.Subset(full_dataset, indexes[l:r])
    c_val_grt = data.DataLoader(val_subset, **params, shuffle = False)

    train_subset = data.Subset(full_dataset, np.concatenate([indexes[0:l], indexes[r:sz]]))
    c_train_grt = data.DataLoader(train_subset, **params, shuffle = True)

    # Slide the window to the next fold.
    l += fold_size
    r += fold_size

    best_valid_roc = float('-inf')
    best_valid_acc = float('-inf')
    temp_w_eval = []
    for epoch in range(N_EPOCHS):

        start_time = time.time()
        # Use the loaders built for THIS fold. The previous code passed the
        # fixed-split `train_grt` / `val_grt`, so every fold repeated the same
        # train/validation split and no cross-validation took place.
        train_loss, train_acc, roc_train = train(model, c_train_grt, optimizer, criterion)
        valid_loss, valid_acc, roc_val = evaluate(model, c_val_grt, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print_status(epoch, epoch_mins, epoch_secs, train_loss,
                     train_acc, valid_loss, valid_acc, roc_train, roc_val, optimizer)
        write_scalars_tensorboard(writer, train_loss, valid_loss, train_acc, valid_acc, epoch)

        best_valid_roc = max(best_valid_roc, roc_val)
        best_valid_acc = max(best_valid_acc, valid_acc)

    # Flush pending events and release this fold's event file before the next
    # fold opens a new writer.
    writer.close()

    val_rocs.append(best_valid_roc)
    val_accs.append(best_valid_acc)

Epoch: 03 | Epoch Time: 0m 26s 	Train Loss: 0.499 | Train Acc: 78.48% 	 Val. Loss: 0.468 |  Val. Acc: 79.48% 	 Roc Train: 0.837 	 Roc Valid: 0.877 ,   0.0005 --LR

In [61]:
# Persist the per-fold cross-validation scores, one pickle file per metric.
# The file tag is sliced out of `neg_path`: the first 4 of its last 24
# characters followed by its last 5 characters.
run_tag = neg_path[-24:][:4] + neg_path[-5:]
for prefix, scores in (("5cv_roc_", val_rocs), ("5cv_acc_", val_accs)):
    with open(prefix + run_tag + '.pkl', "wb") as fp:
        pickle.dump(scores, fp)

C:\Users\Ieremie\Desktop\ppi-phd\experiments-results
