In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import math
import os

from tokenizers import Tokenizer, normalizers, models, pre_tokenizers, decoders, trainers, processors

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from torchvision import transforms
from torchmetrics import AUROC, Accuracy
from sklearn.metrics import confusion_matrix
from torch.optim.lr_scheduler import LambdaLR
from sklearn.model_selection import train_test_split

from pytorch_lightning import Trainer
from pytorch_lightning.tuner.tuning import Tuner 
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping, StochasticWeightAveraging
from pytorch_lightning.loggers.neptune import NeptuneLogger
import neptune.new as neptune

from data_processing.utils import *

In [2]:
class Rbp24Dataset(Dataset):

    def __init__(self, df, tokenizer, longest_seq, transform=None):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.longest = longest_seq

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        df = self.df

        seq = df['seq'][idx].lower()
        label = df['label'][idx]

        tokenized_seq = self.tokenizer.encode(seq).ids
        padded_seq = np.pad(tokenized_seq, (0, self.longest - len(tokenized_seq)))

        sample = {'seq':padded_seq, 'label':label}

        if self.transform:
            sample = self.transform(sample)
        
        return sample

class ToTensor(object):
    "Convert both seq and label to Tensors"

    def __call__(self, sample):

        seq, label = sample['seq'], sample['label']

        sample = {'seq': torch.tensor(seq, dtype=torch.long),
                  'label': torch.tensor(label)}
        
        return sample


def collate_predict(batch):
    return [item['seq'] for item in batch]
        

In [3]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiheadAttention(pl.LightningModule):

    def __init__(self, input_dim, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Stack all weight matrices 1...h together for efficiency
        # Note that in many implementations you see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim//2)

        self._reset_parameters()

    def _reset_parameters(self):
        # Original Transformer initialization, see PyTorch documentation
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x, mask=None, return_attention=False):
        batch_size, seq_length, embed_dim = x.size()
        qkv = self.qkv_proj(x)

        # Separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)

        # Determine value outputs
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
        values = values.reshape(batch_size, seq_length, embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [4]:
class AttnCNN(pl.LightningModule):

    def __init__(self, config):
        super(AttnCNN, self).__init__() 

        self.save_hyperparameters(config)

        self.learning_rate = self.hparams.learning_rate
        self.decay_factor = self.hparams.decay_factor
        self.batch_size = self.hparams.batch_size
        
        self.auroc = AUROC(num_classes=1)
        self.acc = Accuracy()

        # Emmbedding
        self.embedd = nn.Embedding(self.hparams.vocab_size, self.hparams.embedding_dim, padding_idx=0)

        # Convolutional blocks
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=self.hparams.embedding_dim, out_channels=self.hparams.CONV1_out_channels, kernel_size=self.hparams.CONV_kernelsize, padding=1),
            nn.BatchNorm1d(self.hparams.CONV1_out_channels),
            nn.ReLU(), 
            nn.Dropout(0.3))
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=self.hparams.CONV1_out_channels, out_channels=self.hparams.CONV2_out_channels, kernel_size=self.hparams.CONV_kernelsize, padding=1),
            nn.BatchNorm1d(self.hparams.CONV2_out_channels),
            nn.ReLU(),
            nn.Dropout(0.3))
        
        self.pool = nn.MaxPool1d(5, stride=2, padding=1)
        #self.conv3 = nn.Conv1d(in_channels=self.hparams.num_channels, out_channels=self.hparams.num_channels // 2, kernel_size=1, padding=0, bias=True)
        
        # Biderectional LSTM
        self.lstm = torch.nn.LSTM(
            input_size=self.hparams.CONV2_out_channels,
            hidden_size=self.hparams.LSTM_num_features,
            num_layers=1,
            #dropout=0.25,
            bidirectional=True,
            batch_first=True,
            proj_size=self.hparams.LSTM_proj_size)

        # Multihead attention
        self.multihead_attn = MultiheadAttention(input_dim=self.hparams.LSTM_proj_size*2, embed_dim=self.hparams.LSTM_proj_size*2, num_heads=2) 

        self.flatten = nn.Flatten()

        if self.hparams.DIMRED:
            self.hparams.num_channels = self.hparams.num_channels // 2
      
        if self.hparams.LSTM:
            self.linear = nn.Sequential(
                nn.Linear((((self.hparams.seq_lenght - 3) // 2) - 2) * self.hparams.LSTM_proj_size, self.hparams.DENSE_kernelsize), #self.hparams.seq_lenght*self.hparams.LSTM_num_features*2
                nn.BatchNorm1d(self.hparams.DENSE_kernelsize),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(self.hparams.DENSE_kernelsize, 2))
        else:
            self.linear = nn.Sequential(
                nn.Linear(self.hparams.embedding_dim*2*128, self.hparams.DENSE_kernelsize),
                nn.ReLU(),
                nn.Dropout(0.25),
                nn.BatchNorm1d(self.hparams.DENSE_kernelsize),
                nn.Linear(self.hparams.DENSE_kernelsize, 2))


    def forward(self, inputs):

        #print(f"Input shape: {inputs.shape}")
        embeds = self.embedd(inputs).permute(0,2,1) #batch, embed_dim, seq_lenght
        #print(f"Embedding shape: {embeds.shape}")
        x = self.conv1(embeds) #batch, embed_dim, seq_lenght
        #print(f"Conv1 output: {x.shape}")
        if self.hparams.POOL:
            x = self.pool(x)
            #print(f"Pool output: {x.shape}")
        if self.hparams.CONV2:
            x = self.conv2(x) #batch, embed_dim, seq_lenght
            #print(f"Conv2 output: {x.shape}")
        if self.hparams.LSTM:
            x = x.permute(0,2,1) #batch, seq_lenght, embed_dim
            #print(f"LSTM input: {x.shape}")
            x,_ = self.lstm(x) #batch, seq_lenght , embed_dim
            #print(f"LSTM output: {x.shape}")
        if self.hparams.ATTN:
            x = self.multihead_attn(x) #batch, seq_lenght , embed_dim
            #print(f"ATTN output: {x.shape}")
            x = x.permute(0,2,1) #batch, embed_dim, seq_lenght
            #print(f"permute output: {x.shape}")
        if self.hparams.DIMRED:
            x = self.conv3(x)
            #print(f"DimReduction output: {x.shape}")
        x = self.flatten(x)
        #print(f"Flatten output: {x.shape}")
        x = self.linear(x)
        #print(x.shape)
  
        return F.log_softmax(x, dim=-1)


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = LambdaLR(optimizer, lambda epoch: self.decay_factor ** epoch)
        return [optimizer], [scheduler] 

    def training_step(self, batch, batch_idx):
        inputs, labels = batch['seq'], batch['label']
        outputs = self(inputs)
        preds = torch.max(outputs, 1)[1]

        criterion = nn.NLLLoss()
        loss = criterion(outputs, labels)

        labels = labels.cpu().detach().int()
        preds = preds.cpu().detach().int()

        train_acc = self.acc(preds, labels)
        train_auroc = self.auroc(preds, labels)
        
        return {"loss": loss,
                "train_acc": train_acc,
                "train_auroc": train_auroc}
                

    def training_epoch_end(self, train_step_outputs):
        loss = torch.stack([x["loss"] for x in train_step_outputs]).mean()
        train_acc_epoch = torch.stack([x["train_acc"] for x in train_step_outputs]).mean()
        train_auroc_epoch = torch.stack([x["train_auroc"] for x in train_step_outputs]).mean()
        
        self.log("train/epoch/loss", loss)
        self.log("train/epoch/acc", train_acc_epoch)
        self.log("train/epoch/auroc", train_auroc_epoch) 
    
    
    def validation_step(self, batch, batch_idx):
        inputs, labels = batch['seq'], batch['label']
        outputs = self(inputs)

        criterion = nn.NLLLoss()
        loss = criterion(outputs, labels)

        labels = labels.cpu().detach()
        preds = torch.max(outputs, 1)[1].cpu().detach()

        val_acc = self.acc(preds, labels)
        val_auroc = self.auroc(preds, labels)
        
        return {"loss": loss,
                "val_acc": val_acc,
                "val_auroc": val_auroc}


    def validation_epoch_end(self, val_step_outputs):
        loss = torch.stack([x["loss"] for x in val_step_outputs]).mean()
        val_acc_epoch = torch.stack([x["val_acc"] for x in val_step_outputs]).mean()
        val_auroc_epoch = torch.stack([x["val_auroc"] for x in val_step_outputs]).mean()
        
        self.log("val/epoch/loss", loss)
        self.log("val/epoch/acc", val_acc_epoch)
        self.log("val/epoch/auroc", val_auroc_epoch)

    
    def test_step(self, batch, batch_idx):
        inputs, labels = batch['seq'], batch['label']
        outputs = self(inputs)

        criterion = nn.NLLLoss()
        loss = criterion(outputs, labels)

        labels = labels.cpu().detach()
        preds = torch.max(outputs, 1)[1].cpu().detach()

        test_acc = self.acc(preds, labels)
        test_auroc = self.auroc(preds, labels)
        
        return {"loss": loss,
                "test_acc": test_acc,
                "test_auroc": test_auroc}

    def test_epoch_end(self, test_step_outputs):
        loss = torch.stack([x["loss"] for x in test_step_outputs]).mean()
        test_acc_epoch = torch.stack([x["test_acc"] for x in test_step_outputs]).mean()
        test_auroc_epoch = torch.stack([x["test_auroc"] for x in test_step_outputs]).mean()
        
        self.log("test/loss", loss)
        self.log("test/acc", test_acc_epoch)
        self.log("test/auroc", test_auroc_epoch)

        return test_auroc_epoch

    def predict_step(self, batch, batch_idx):
        inputs, labels = batch['seq'], batch['label']
        outputs = self(inputs)
        labels = labels.cpu().detach()
        preds = torch.max(outputs, 1)[1].cpu().detach()

        return inputs, labels, preds

In [5]:
192 // 2

96

In [6]:
def make_datasets(train_df, test_df, tokenizer, longest_seq, val_size):

    trainset = Rbp24Dataset(train_df, tokenizer, longest_seq, transform=transforms.Compose([ToTensor()]))
    testset = Rbp24Dataset(test_df, tokenizer, longest_seq, transform=transforms.Compose([ToTensor()]))

    train_labels = [int(trainset[i]['label']) for i in range(len(trainset)-1)]
    train_idx, val_idx= train_test_split(np.arange(len(train_labels)), test_size=val_size, shuffle=True, stratify=train_labels)

    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    return trainset, testset, train_sampler, val_sampler


In [7]:
def make_dataloaders(train_df, test_df, tokenizer, longest_seq, batch_size, num_workers, val_split):

    trainset, testset, train_sampler, val_sampler = make_datasets(train_df, test_df, tokenizer, longest_seq, val_split)

    trainloader = DataLoader(trainset, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers)
    valloader = DataLoader(trainset, batch_size=batch_size, num_workers=num_workers, sampler=val_sampler, shuffle=False)
    testloader = DataLoader(testset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

    return trainloader, valloader, testloader

In [8]:
def concatanate(dataframes):
    train_df = pd.concat([dataframes[0], dataframes[1]], ignore_index=True).sample(frac=1).reset_index(drop=True)
    test_df = pd.concat([dataframes[2], dataframes[3]], ignore_index=True).sample(frac=1).reset_index(drop=True)
    return train_df, test_df

In [9]:
"""
dataset_path = "/home/mrkvrbl/Diplomka/Data/rbp24/processed" #/home/mrkvrbl/Diplomka/Data/rbp31/
protein = "PARCLIP_ELAVL1A"
tokenizer = Tokenizer.from_file(f'/home/mrkvrbl/Diplomka/Data/tokenizers/transcriptome_hg19_{PARAMS["vocab_size"]}words_bpe.tokenizer.json')
PARAMS['name'] = protein

train_path = dataset_path + "/" + protein + "/train/original.tsv.gz"
test_path = dataset_path + "/" + protein + "/test/original.tsv.gz"

train_df = pd.read_csv(train_path, delimiter="\t", index_col=0, header=0, compression="gzip")
test_df = pd.read_csv(test_path, delimiter="\t", index_col=0, header=0, compression="gzip")

longest_seq = max(max([len(tokenizer.encode(seq).ids) for seq in train_df.seq]), max([len(tokenizer.encode(seq).ids) for seq in test_df.seq]))
PARAMS['seq_lenght'] = longest_seq

trainloader, valloader, testloader = make_dataloaders(train_df, test_df, tokenizer, longest_seq, PARAMS["batch_size"], PARAMS["num_workers"], PARAMS["val_split"])
"""

'\ndataset_path = "/home/mrkvrbl/Diplomka/Data/rbp24/processed" #/home/mrkvrbl/Diplomka/Data/rbp31/\nprotein = "PARCLIP_ELAVL1A"\ntokenizer = Tokenizer.from_file(f\'/home/mrkvrbl/Diplomka/Data/tokenizers/transcriptome_hg19_{PARAMS["vocab_size"]}words_bpe.tokenizer.json\')\nPARAMS[\'name\'] = protein\n\ntrain_path = dataset_path + "/" + protein + "/train/original.tsv.gz"\ntest_path = dataset_path + "/" + protein + "/test/original.tsv.gz"\n\ntrain_df = pd.read_csv(train_path, delimiter="\t", index_col=0, header=0, compression="gzip")\ntest_df = pd.read_csv(test_path, delimiter="\t", index_col=0, header=0, compression="gzip")\n\nlongest_seq = max(max([len(tokenizer.encode(seq).ids) for seq in train_df.seq]), max([len(tokenizer.encode(seq).ids) for seq in test_df.seq]))\nPARAMS[\'seq_lenght\'] = longest_seq\n\ntrainloader, valloader, testloader = make_dataloaders(train_df, test_df, tokenizer, longest_seq, PARAMS["batch_size"], PARAMS["num_workers"], PARAMS["val_split"])\n'

In [10]:
PARAMS = {
        "model": "ECLAD",
        "vocab_size": 9,
        "embedding_dim": 8,
        "CONV1_kernelsize": 9,
        "CONV1_out_channels": 12,
        "CONV2": False,
        "CONV2_out_channels": 16,
        "LSTM": True,
        "LSTM_num_features": 8,
        "ATTN": True,
        "DIMRED": False,
        "DENSE_kernelsize":64,
        "batch_size": 256,
        "learning_rate": 0.005,
        "decay_factor": 0.95,
        "max_epochs": 100,
        "num_workers": 16,
        "val_split": 0.1
    }

In [15]:
dataset_path = "/home/mrkvrbl/Diplomka/Data/rbp24/processed"
protein = "ICLIP_TIA1"
tokenizer = Tokenizer.from_file(f'/home/mrkvrbl/Diplomka/Data/tokenizers/transcriptome_hg19_{PARAMS["vocab_size"]}words_bpe.tokenizer.json')
PARAMS['name'] = protein

#rbp24
train_df = pd.read_csv(dataset_path + "/" + protein +  "/train/original.tsv.gz", delimiter="\t", index_col=0, header=0, compression="gzip")
test_df = pd.read_csv(dataset_path + "/" + protein +  "/test/original.tsv.gz", delimiter="\t", index_col=0, header=0, compression="gzip")

#rbp31
#train_df = pd.read_csv(dataset_path + "/" + protein +  "/train/original.tsv.gz", delimiter="\t", index_col=0, compression="gzip")
#test_df = pd.read_csv(dataset_path + "/" + protein +  "/test/original.tsv.gz", delimiter="\t", index_col=0, compression="gzip")

longest_seq = max(max([len(tokenizer.encode(seq).ids) for seq in train_df.seq]), max([len(tokenizer.encode(seq).ids) for seq in test_df.seq]))
PARAMS['seq_lenght'] = longest_seq

trainloader, valloader, testloader = make_dataloaders(train_df, test_df, tokenizer, longest_seq, PARAMS["batch_size"], PARAMS["num_workers"], PARAMS["val_split"])

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim
import multiprocessing

In [None]:
def gen_words(sequences,kmer_len,s):
	out=[]

	for i in sequences:
		kmer_list=[]
		for j in range(0,(len(i)-kmer_len)+1,s):
			kmer_list.append(i[j:j+kmer_len])

		out.append(kmer_list)

	return out

In [None]:
def convert_data_to_index(string_data, wv):
	index_data = []
	for word in string_data:
		if word in wv:
			index_data.append(wv.vocab[word].index)
	return index_data

In [None]:
words = gen_words(train_df.seq, 4, 1)
word2vec_model = "./wordtovec"

model = gensim.models.Word2Vec(words, window=int(12 / 1), min_count=0,workers=multiprocessing.cpu_count())
model.train(words,total_examples=len(words),epochs=100)
model.save(word2vec_model)

In [None]:
model1 = gensim.models.Word2Vec.load(word2vec_model)
weights = torch.FloatTensor(model1.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights, freeze=False)

In [None]:
x_data= [convert_data_to_index(el,model.wv) for el in words]

In [26]:
seq = train_df.seq[1]
tokens = torch.tensor(tokenizer.encode(seq).ids, dtype=torch.long).unsqueeze(0)
tokens

tensor([[5, 5, 6, 6, 9, 9, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 5, 6, 5, 5, 7, 6, 9, 9,
         7, 9, 9, 9, 5, 7, 5, 9, 7, 9, 9, 5, 6, 5, 5, 5, 5, 9, 9, 6, 5, 7, 9, 5,
         9, 5, 7, 5, 5, 9, 5, 7, 9, 7, 7, 9, 5, 5, 5, 7, 5, 5, 9, 7, 9, 5, 7, 7,
         6, 9, 6, 9, 7, 9, 5, 7, 6, 6, 9, 5, 6, 6, 9, 7, 6, 9, 9, 7, 5, 7, 9, 9,
         9, 7, 5, 5, 9, 5, 6, 5, 7, 7, 6, 9, 6, 9, 6, 5, 6, 6, 9, 5, 6, 9, 5, 7,
         6, 9, 7, 5, 7, 9, 7, 9]])

In [15]:
seq = train_df.seq[3]
tokens = torch.tensor(tokenizer.encode(seq).ids, dtype=torch.long).unsqueeze(0)
print(tokens.shape)
embed = nn.Embedding(128, 8)
conv1 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=5, padding=1)
pool = nn.MaxPool1d(3, stride=1, padding=1)
conv2 = nn.Conv1d(16,32,3)
lstm = nn.LSTM(32,32, batch_first=True, proj_size=16)
result = embed(tokens).permute(0,2,1)
print(result.shape)
result = conv1(result)
print(result.shape)
result = pool(result)
print(result.shape)
result = conv2(result)
print(result.shape)
result = lstm(result.permute(0,2,1))
print(result[0].shape)
result[0].flatten().shape

torch.Size([1, 54])
torch.Size([1, 8, 54])
torch.Size([1, 16, 52])
torch.Size([1, 16, 52])
torch.Size([1, 32, 50])
torch.Size([1, 50, 16])


torch.Size([800])

In [18]:
it = iter(trainloader)
batch = next(it)
seq, label = batch['seq'], batch['label']
print(f"seq shape: {seq.shape}\nlabel shape: {label.shape}")
seq[1]

seq shape: torch.Size([256, 128])
label shape: torch.Size([256])


tensor([15, 11, 58, 15, 15, 15, 15, 15, 15, 15, 43, 43, 43, 43, 43, 14, 32, 13,
        19, 62, 15, 15, 55, 63, 55, 45, 18, 10, 21, 13, 45, 33,  7, 29, 15, 36,
        15, 16, 21, 10, 12, 24, 49, 54, 22, 30,  5,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])

In [14]:
early_stopping = EarlyStopping('val/epoch/loss', patience=15, check_on_train_epoch_end=False, )

checkpoints_path = "checkpoints/" + PARAMS["name"]

model_checkpoint = ModelCheckpoint(
        dirpath=checkpoints_path,
        filename="{epoch:02d}",
        save_weights_only=True,
        save_top_k=-1,
        save_last=True,
        monitor="val/epoch/loss",
        every_n_epochs=1)

In [15]:
# create NeptuneLogger
#neptune_logger = NeptuneLogger(
#    api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxZDI3YzE1Yy0yYzllLTRjM2YtYjk2MS1jNzNiZmI3MzIyNWEifQ==",  # replace with your own
#    project="mrkvrbl/MasterThesis",  # "<WORKSPACE/PROJECT>"
#    name=PARAMS["name"])

In [16]:
trainer = Trainer(#logger=neptune_logger,
                callbacks=[model_checkpoint, early_stopping],
                max_epochs=PARAMS['max_epochs'],
                accumulate_grad_batches=1,
                gradient_clip_val=0.5,
                stochastic_weight_avg=True,
                gpus=1)
model = AttnCNN(PARAMS)

  "Setting `Trainer(stochastic_weight_avg=True)` is deprecated in v1.5 and will be removed in v1.7."
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [17]:
#neptune_logger.log_model_summary(model=model, max_depth=-1)
#neptune_logger.log_hyperparams(params=PARAMS)

In [18]:
trainer.fit(model, train_dataloaders=trainloader, val_dataloaders=valloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | auroc          | AUROC              | 0     
1 | acc            | Accuracy           | 0     
2 | embedd         | Embedding          | 512   
3 | conv1          | Sequential         | 688   
4 | conv2          | Sequential         | 2.7 K 
5 | pool           | MaxPool1d          | 0     
6 | lstm           | LSTM               | 11.3 K
7 | multihead_attn | MultiheadAttention | 952   
8 | flatten        | Flatten            | 0     
9 | linear         | Sequential         | 37.7 K
------------------------------------------------------
53.8 K    Trainable params
0         Non-trainable params
53.8 K    Total params
0.215     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]



                                                                      

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Epoch 12:   0%|          | 0/10 [00:00<?, ?it/s, loss=0.63, v_num=148]          

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Epoch 12:   0%|          | 0/10 [00:19<?, ?it/s, loss=0.63, v_num=148]

In [39]:
checkpoints = sorted(os.listdir(checkpoints_path))
best_auc = 0

for checkpoint in checkpoints:
    checkpoint_path = str(checkpoints_path) + "/" + checkpoint
    test_out = trainer.test(model, ckpt_path=checkpoint_path, dataloaders=testloader)[0]

    if test_out["test/auroc"] > best_auc:
        best_auc = test_out["test/auroc"]
        best_checkpoint_path = checkpoint_path
print(best_auc, best_checkpoint_path)

Restoring states from the checkpoint path at checkpoints/C22ORF28_Baltz2012/epoch=00-v1.ckpt


RuntimeError: Error(s) in loading state_dict for AttnCNN:
	size mismatch for embedd.weight: copying a param with shape torch.Size([128, 8]) from checkpoint, the shape in current model is torch.Size([512, 8]).

In [19]:
test_out = trainer.test(model, ckpt_path=checkpoint_path, dataloaders=testloader)

Restoring states from the checkpoint path at checkpoints/C22ORF28_Baltz2012/last.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at checkpoints/C22ORF28_Baltz2012/last.ckpt


Testing: 100%|██████████| 4/4 [00:00<00:00,  6.21it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/acc': 0.6973329782485962,
 'test/auroc': 0.6963504552841187,
 'test/loss': 0.5539897084236145}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 4/4 [00:00<00:00,  5.25it/s]


In [20]:
predict_out = trainer.predict(model, ckpt_path=checkpoint_path, dataloaders=testloader, return_predictions=True)

Restoring states from the checkpoint path at checkpoints/C22ORF28_Baltz2012/last.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at checkpoints/C22ORF28_Baltz2012/last.ckpt


Predicting: 66it [00:00, ?it/s]


In [52]:
torch.stack([x[2] for x in predict_out]).mean()

RuntimeError: stack expects each tensor to be equal size, but got [256] at entry 0 and [10] at entry 1

Run MST-348 received stop signal. Exiting


Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.


All 2 operations synced, thanks for waiting!


In [None]:
checkpoints = sorted(os.listdir(checkpoints_path))
best_auc = 0

for checkpoint in checkpoints:
    checkpoint_path = str(checkpoints_path) + "/" + checkpoint
    predict_out = trainer.predict(model, ckpt_path=checkpoint_path, dataloaders=testloader, return_predictions=True)

    inputs = []
    labels = []
    preds = []
    for outs in predict_out:
        inputs.append(outs[0])
        labels.append(outs[1])
        preds.append(outs[2])

    inputs = [item for sublist in inputs for item in sublist]
    labels = [item for sublist in labels for item in sublist]
    preds = [item for sublist in preds for item in sublist]

    auroc = AUROC(num_classes=1)
    auc = auroc(torch.tensor(preds), torch.tensor(labels))

    if auc > best_auc:
        best_auc = auc
    print(auc)
print(best_auc)

In [19]:
import shutil

dst_path = "/home/mrkvrbl/Diplomka/best_checkpoints/" + PARAMS['name'] +"/best.ckpt"
shutil.copyfile(best_checkpoint_path, dst_path)

'/home/mrkvrbl/Diplomka/best_checkpoints/CAPRIN1_Baltz2012/best.ckpt'

In [22]:
#test_model = AttnCNN(PARAMS)
#best_checkpoint = "/home/mrkvrbl/Diplomka/src/checkpoints/ALKBH5_Baltz2012/epoch=05.ckpt"
#test_model.load_from_checkpoint(best_checkpoint_path)

#test_trainer = Trainer()

predict_out = trainer.predict(model, ckpt_path=best_checkpoint_path, dataloaders=testloader, return_predictions=True)

Restoring states from the checkpoint path at checkpoints/ALKBH5_Baltz2012/epoch=43.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at checkpoints/ALKBH5_Baltz2012/epoch=43.ckpt


Predicting: 9it [00:00, ?it/s]


In [36]:
inputs = []
labels = []
preds = []
for outs in predict_out:
    inputs.append(outs[0])
    labels.append(outs[1])
    preds.append(outs[2])

inputs = [item for sublist in inputs for item in sublist]
labels = [item for sublist in labels for item in sublist]
preds = [item for sublist in preds for item in sublist]


In [40]:
FP, FN = [], []

for i in range(len(inputs)):
    if labels[i] < preds[i]:
        FP.append(inputs[i])
    elif labels[i] > preds[i]:
        FN.append(inputs[i])
    else:
        continue

FP = ["".join(tokenizer.decode(FP[i].numpy()).split(" ")) for i in range(len(FP))]
FN = ["".join(tokenizer.decode(FN[i].numpy()).split(" ")) for i in range(len(FN))]

tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
(tn, fp, fn, tp)

auroc = AUROC(num_classes=1)
auc = auroc(torch.tensor(preds), torch.tensor(labels))
print(auc, auroc)

tensor(0.6515) AUROC()




In [31]:
#save FP and FN
train_df[train_df.label == 1].seq

1        attagactataagtatcttgaagataaggtcaataaacactcatca...
4        tgctgtgccacacaatttactgagacaatcatatcttcctaagcat...
5        tatattgactatatctctttttattcccccttttttttttttaaag...
6        agcacatactgttgcctttggggggaaacaaatttcctggatatgt...
7        tattatattgtgtcttattttccctttgcaggctggtttaccatga...
                               ...                        
16025    aagaagaggaagaatagcaagaagaagaaccagccgggcaagtaca...
16031    agcacatccaatggagtcagttcaacctccaaggcagaagctgtag...
16034    tacatatatagtcactggcatactgagaatatacaatgatcctgga...
16035    tcctagtgccaaaggttcaacttaatgtatatacctgaaaacccat...
16037    atcttccattatcaaatgcagatacatagaaaggcagtacatcagg...
Name: seq, Length: 8140, dtype: object

In [32]:
#pos gc: 49.06997114591921	neg gc: 45.089285714285715
from utils.utils import get_gc_count_from_seq
pos_gc = get_gc_count_from_seq(train_df[train_df.label == 1].seq)
neg_gc = get_gc_count_from_seq(train_df[train_df.label == 0].seq)
fp_gc = get_gc_count_from_seq(FP)
fn_gc = get_gc_count_from_seq(FN)

print(f" Pos GC: {pos_gc}\n Neg GC: {neg_gc}\n fp_GC: {fp_gc}\n fn_GC: {fn_gc}")

 Pos gc: 43.728573309797575
 Neg GC: 44.49588494800405
 fp_gc: 45.34202383303707
 fn_gc: 41.51364338424575


In [None]:
#import shutil
#shutil.rmtree(checkpoints_path)

In [None]:
from sklearn.linear_model import LogisticRegression

def baseline_model_torch_metrics(X_train, X_test, y_train, y_test, max_iter=200):
    baseline = LogisticRegression(max_iter=max_iter, random_state=42)

    # flatten the data
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)

    baseline.fit(X_train_flat, y_train)

    y_train = torch.tensor(y_train.values.astype(int))
    y_test = torch.tensor(y_test.values.astype(int))


    baseline_pred_train = torch.from_numpy(baseline.predict(X_train_flat)).int()
    baseline_pred_test = torch.from_numpy(baseline.predict(X_test_flat)).int()

    auroc = AUROC(num_classes=1)
    acc = Accuracy()

    train_acc_score = acc(y_train, baseline_pred_train)
    test_acc_score = auroc(y_test, baseline_pred_test)

    train_auc_score = acc(y_train, baseline_pred_train)
    test_auc_score = auroc(y_test, baseline_pred_test)

    print(f"train_auc_score: {train_auc_score}\ntest_auc_score: {test_auc_score}\ntrain_acc_score: {train_acc_score}\ntest_acc_score: {test_acc_score}")

In [None]:
from utils.utils import get_X_y

X_train, y_train = get_X_y(train_df)
X_test, y_test = get_X_y(test_df)

result = baseline_model_torch_metrics(X_train, X_test, y_train, y_test, max_iter=200)

ValueError: all input arrays must have the same shape

In [None]:
# ADD EMBEDING AND CONFUSION TABLE

Experiencing connection interruptions. Will try to reestablish communication with Neptune. Internal exception was: ReadTimeout
Error occurred during asynchronous operation processing: Cannot upload file /home/mrkvrbl/Diplomka/src/checkpoints/PARCLIP_MOV10_Sievers/epoch=00.ckpt: Path not found or is a not a file.
Error occurred during asynchronous operation processing: Cannot upload file /home/mrkvrbl/Diplomka/src/checkpoints/PARCLIP_MOV10_Sievers/epoch=01.ckpt: Path not found or is a not a file.
Error occurred during asynchronous operation processing: Cannot upload file /home/mrkvrbl/Diplomka/src/checkpoints/PARCLIP_MOV10_Sievers/epoch=02.ckpt: Path not found or is a not a file.
Error occurred during asynchronous operation processing: Cannot upload file /home/mrkvrbl/Diplomka/src/checkpoints/PARCLIP_MOV10_Sievers/epoch=03.ckpt: Path not found or is a not a file.
Error occurred during asynchronous operation processing: Cannot upload file /home/mrkvrbl/Diplomka/src/checkpoints/PARCLIP_MO