In [1]:
# conda list --export > requirements_conda.txt

In [2]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

from genomic_benchmarks.dset_getters import demo_coding_vs_intergenomic_seqs_dset
from utils import simple_coll_factory, padding_coll_factory

## NN model

In [3]:
# A basic CNN model
class NeuralNetwork(nn.Module):
        
    def __init__(self, number_of_classes, vocab_size, embedding_dim, input_len):
        super(NeuralNetwork, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=8, bias=True)
        self.norm1 = nn.BatchNorm1d(16)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)
        
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=8, bias=True)
        self.norm2 = nn.BatchNorm1d(8)
        self.pool2 = nn.MaxPool1d(2)

        self.conv3 = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=8, bias=True)
        self.norm3 = nn.BatchNorm1d(4)
        self.pool3 = nn.MaxPool1d(2)

#         self.conv4 = nn.Conv1d(in_channels=4, out_channels=3, kernel_size=8, bias=True)
#         self.norm4 = nn.BatchNorm1d(3)
#         self.pool4 = nn.MaxPool1d(2)
    
#         compute output shape of conv layers   
        self.flatten = nn.Flatten()
        self.lin1 = nn.Linear(self.count_flatten_size(input_len), 512)
        self.lin2 = nn.Linear(512, number_of_classes)
        self.sigmoid = nn.Sigmoid()
        self.loss = torch.nn.functional.binary_cross_entropy_with_logits

    def count_flatten_size(self, input_len):        
        zeros = torch.zeros([1, input_len], dtype=torch.long)
        x = self.embeddings(zeros)
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = self.norm3(x)
        x = self.relu(x)
        x = self.pool3(x)

        x = self.flatten(x)
        print("flatten size ", x.size()[1])
        return x.size()[1]
        
    def forward(self, x):
#         print("start ", x.size())
        x = self.embeddings(x)
        x = x.transpose(1, 2)
#         print("emb ", x.size())
        x = self.conv1(x)
#         print("conv1 ", x.size())
        x = self.norm1(x)
        x = self.relu(x)
        x = self.pool1(x)
#         print("pool1 ", x.size())
        
        
        x = self.conv2(x)
#         print("conv2 ", x.size())
        x = self.norm2(x)
        x = self.relu(x)
        x = self.pool2(x)
#         print("pool2 ", x.size())
        
        x = self.conv3(x)
#         print("conv3 ", x.size())
        x = self.norm3(x)
        x = self.relu(x)
        x = self.pool3(x)
#         print("pool3 ", x.size())
        
#         x = self.conv4(x)
#         x = self.norm4(x) 
#         x = self.relu(x)
#         x = self.pool4(x)
        
        x = self.flatten(x)
#         print("flatten ", x.size())
        x = self.lin1(x) 
        x = self.lin2(x) 
        x = self.sigmoid(x)
        return x

        
    def train_loop(self, dataloader, optimizer):
        for x, y in dataloader:
            optimizer.zero_grad()
            pred = self(x)
#             print('pred: ', pred)
#             print('y: ', y)
            loss = self.loss(pred, y)
#             print(loss)
            loss.backward()
            optimizer.step()
        
#       train acc
# todo: optimize counting of acc
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        train_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                train_loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()

#         print('train_loss ', train_loss)
#         print('num_batches', num_batches)
#         print('correct', correct)
#         print('size', size)
        train_loss /= num_batches
        correct /= size
        print(f"Train metrics: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
            
            
    def train(self, dataloader, epochs):
        optimizer = torch.optim.Adam(self.parameters())
        for t in range(epochs):
            print(f"Epoch {t}")
            self.train_loop(dataloader, optimizer)

    def test(self, dataloader):
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        test_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                test_loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()

        print('test_loss ', test_loss)
        print('num_batches', num_batches)
        print('correct', correct)
        print('size', size)

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


### Choose the dataset

In [5]:
# choose the dataset
get_dataset_fn = demo_coding_vs_intergenomic_seqs_dset

# TODO download again
train_dset = get_dataset_fn('train', force_download=False)


Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.




  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/190000 [00:00<?, ?it/s]

## Tokenizer and vocab

In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, build_vocab_from_iterator
from collections import Counter 

class LetterTokenizer():
    def __init__(self, **kwargs):
        pass
    def __call__(self, items):
        if isinstance(items, str):
            return self.__tokenize_str(items)
        else:
            return (self.__tokenize_str(t) for t in items)
    def __tokenize_str(self, t):
        tokenized = list(t.replace("\n",""))
        tokenized.append('<eos>')
        tokenized.insert(0,'<bos>')
        return tokenized

tokenizer = get_tokenizer(LetterTokenizer())

def build_vocab(dataset, tokenizer):
    counter = Counter()
    for i in range(len(dataset)):
        counter.update(tokenizer(dataset[i][0]))
    print(counter.most_common())
    builded_voc = vocab(counter)
#     builded_voc.append_token('<pad>')
    return builded_voc

# todo: why build fn does not work as expected (iterator argument)
#     return build_vocab_from_iterator(
#         iterator = counter, 
#         specials = ['<unk>', '<pad>', '<bos>', '<eos>'],
#         special_first = True)

vocabulary = build_vocab(train_dset, tokenizer)
print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

[('A', 4076894), ('T', 3874778), ('G', 3557617), ('C', 3490711), ('<bos>', 75000), ('<eos>', 75000)]
vocab len: 6
{'C': 4, 'A': 3, '<eos>': 5, 'G': 2, 'T': 1, '<bos>': 0}


In [7]:
print(tokenizer(train_dset[0][0]))
print(train_dset[0][0])
print(train_dset[0][1])

['<bos>', 'T', 'G', 'G', 'A', 'G', 'T', 'A', 'A', 'T', 'G', 'G', 'T', 'C', 'C', 'A', 'T', 'G', 'G', 'A', 'G', 'T', 'C', 'T', 'A', 'G', 'A', 'C', 'T', 'G', 'C', 'C', 'T', 'A', 'G', 'G', 'G', 'A', 'T', 'T', 'G', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'G', 'A', 'T', 'G', 'G', 'G', 'G', 'T', 'T', 'G', 'A', 'T', 'A', 'A', 'C', 'G', 'G', 'A', 'G', 'A', 'A', 'T', 'G', 'T', 'A', 'G', 'A', 'G', 'G', 'A', 'G', 'G', 'C', 'A', 'A', 'G', 'G', 'G', 'A', 'C', 'T', 'G', 'A', 'A', 'G', 'G', 'T', 'C', 'C', 'A', 'T', 'G', 'A', 'A', 'G', 'G', 'C', 'T', 'G', 'T', 'C', 'T', 'G', 'A', 'A', 'C', 'T', 'T', 'G', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'A', 'A', 'G', 'A', 'A', 'T', 'G', 'G', 'A', 'G', 'C', 'A', 'A', 'C', 'T', 'G', 'G', 'G', 'G', 'A', 'G', 'A', 'A', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'C', 'A', 'G', 'G', 'G', 'T', 'A', 'T', 'G', 'A', 'C', 'C', 'T', 'T', 'T', 'G', 'T', 'C', 'T', 'G', 'T', 'T', 'G', 'G', 'G', 'A', 'C', 'A', 'A', 'A', 'G', 'G', 'T', 'C', 'A', 'C', 'C', 'T', 'C', 'A', 'G', 'T', 'T', 'G', 'T', 

### use collate with padding

In [8]:
# use collate with padding
input_len = max([len(train_dset[i][0]) for i in range(len(train_dset))])
print("input_len ", input_len)
# padding_coll_factory(longest_length, vocab, tokenizer):
collate = simple_coll_factory(vocabulary, tokenizer)

train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, collate_fn=collate)


input_len  200


In [9]:
print(next(iter(train_loader))[0][0])
print(next(iter(train_loader))[1][0])
print(next(iter(train_loader))[0][2])
print(next(iter(train_loader))[1][2])

  x = torch.tensor(tmp, dtype=torch.long)


tensor([0, 1, 4, 1, 2, 4, 4, 2, 2, 2, 1, 1, 4, 1, 4, 3, 2, 3, 4, 2, 4, 4, 4, 2,
        2, 4, 2, 4, 4, 2, 4, 1, 2, 3, 4, 1, 2, 4, 2, 2, 3, 2, 4, 2, 4, 2, 4, 1,
        2, 2, 2, 4, 1, 2, 4, 1, 2, 2, 3, 2, 2, 4, 1, 4, 4, 2, 4, 4, 3, 2, 2, 4,
        4, 4, 2, 1, 1, 4, 3, 4, 3, 2, 4, 4, 4, 4, 2, 3, 2, 2, 3, 4, 2, 4, 4, 2,
        2, 4, 1, 4, 2, 3, 2, 1, 1, 4, 4, 4, 1, 1, 4, 4, 3, 2, 1, 4, 4, 2, 4, 4,
        2, 2, 4, 2, 4, 4, 1, 2, 1, 4, 4, 2, 4, 2, 4, 1, 1, 2, 4, 4, 4, 2, 3, 4,
        1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 4, 4, 3, 4, 3, 4, 1, 1, 3, 2, 3, 2, 1, 1,
        1, 1, 3, 3, 3, 4, 1, 1, 3, 3, 4, 2, 3, 2, 4, 1, 1, 1, 1, 4, 3, 3, 3, 3,
        3, 3, 1, 3, 1, 1, 4, 1, 4, 5], device='cuda:0')
tensor([0.], device='cuda:0')
tensor([0, 3, 4, 1, 1, 4, 4, 3, 3, 3, 3, 3, 2, 1, 3, 2, 3, 3, 3, 1, 3, 2, 3, 4,
        1, 2, 3, 3, 1, 2, 4, 1, 1, 4, 4, 3, 3, 3, 4, 1, 4, 3, 2, 4, 4, 3, 2, 4,
        3, 4, 1, 3, 1, 1, 1, 2, 2, 3, 1, 3, 3, 4, 3, 3, 3, 2, 4, 4, 1, 2, 3, 4,
        3, 3, 3, 2, 2, 1, 3, 1, 4,

## Training

In [10]:
nn_input_len = input_len+2

# __init__(self, number_of_classes, vocab_size, embedding_dim, input_len):
model = NeuralNetwork(
    number_of_classes=1,
    vocab_size=vocabulary.__len__(),
    embedding_dim=100,
    input_len=nn_input_len
).cuda()

model.train(train_loader, epochs=10)
# model(torch.zeros([32, nn_input_len], dtype=torch.long).to('cuda'))

flatten size  76
Epoch 0


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


Train metrics: 
 Accuracy: 83.4%, Avg loss: 0.574151 

Epoch 1
Train metrics: 
 Accuracy: 85.2%, Avg loss: 0.568589 

Epoch 2
Train metrics: 
 Accuracy: 86.8%, Avg loss: 0.565124 

Epoch 3
Train metrics: 
 Accuracy: 87.0%, Avg loss: 0.561576 

Epoch 4
Train metrics: 
 Accuracy: 87.3%, Avg loss: 0.559768 

Epoch 5
Train metrics: 
 Accuracy: 87.5%, Avg loss: 0.560357 

Epoch 6
Train metrics: 
 Accuracy: 86.5%, Avg loss: 0.560485 

Epoch 7
Train metrics: 
 Accuracy: 87.7%, Avg loss: 0.557321 

Epoch 8
Train metrics: 
 Accuracy: 87.7%, Avg loss: 0.557491 

Epoch 9
Train metrics: 
 Accuracy: 87.5%, Avg loss: 0.557597 



## Testing

In [11]:
test_dset = get_dataset_fn('test', force_download=False)
test_loader = DataLoader(test_dset, batch_size=32, shuffle=True, collate_fn=collate)
model.test(test_loader)

Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.




  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/190000 [00:00<?, ?it/s]

test_loss  439.50119185447693
num_batches 782
correct 21672
size 25000
Test Error: 
 Accuracy: 86.7%, Avg loss: 0.562022 

