In [None]:
# conda list --export > requirements_conda.txt

In [2]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

from genomic_benchmarks.dset_getters import demo_mouse_enhancers_dset
from utils import simple_coll, padding_coll_factory

## NN model

In [3]:
# A basic CNN model
class NeuralNetwork(nn.Module):
        
    def __init__(self, number_of_classes, vocab_size, embedding_dim, context_size):
        super(NeuralNetwork, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=8, bias=True)
        self.norm1 = nn.BatchNorm1d(16)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)
        
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=8, bias=True)
        self.norm2 = nn.BatchNorm1d(8)
        self.pool2 = nn.MaxPool1d(2)

        self.conv3 = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=8, bias=True)
        self.norm3 = nn.BatchNorm1d(4)
        self.pool3 = nn.MaxPool1d(2)

        self.conv4 = nn.Conv1d(in_channels=4, out_channels=3, kernel_size=8, bias=True)
        self.norm4 = nn.BatchNorm1d(3)
        self.pool4 = nn.MaxPool1d(2)
        
        self.flatten = nn.Flatten()
        self.lin1 = nn.Linear(861, 512)
        self.lin2 = nn.Linear(512, number_of_classes)
        self.sigmoid = nn.Sigmoid()
        self.loss = torch.nn.functional.binary_cross_entropy_with_logits

        
    def forward(self, x):
        x = self.embeddings(x)
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.pool1(x)
        
        x = self.conv2(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.pool2(x)
        
        x = self.conv3(x)
        x = self.norm3(x)
        x = self.relu(x)
        x = self.pool3(x)
        
        x = self.conv4(x)
        x = self.norm4(x) 
        x = self.relu(x)
        x = self.pool4(x)
        
        x = self.flatten(x)
        x = self.lin1(x) 
        x = self.lin2(x) 
        x = self.sigmoid(x)
        return x

        
    def train_loop(self, dataloader, optimizer):
        for x, y in dataloader:
            optimizer.zero_grad()
            pred = self(x)
#             print('pred: ', pred)
#             print('y: ', y)
            loss = self.loss(pred, y)
#             print(loss)
            loss.backward()
            optimizer.step()
        
#       train acc
# todo: optimize counting of acc
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        train_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                train_loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()

#         print('train_loss ', train_loss)
#         print('num_batches', num_batches)
#         print('correct', correct)
#         print('size', size)
        train_loss /= num_batches
        correct /= size
        print(f"Train metrics: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
            
            
    def train(self, dataloader, epochs):
        optimizer = torch.optim.Adam(self.parameters())
        for t in range(epochs):
            print(f"Epoch {t}")
            self.train_loop(dataloader, optimizer)

    def test(self, dataloader):
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        test_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                test_loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()

        print('test_loss ', test_loss)
        print('num_batches', num_batches)
        print('correct', correct)
        print('size', size)

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


### Choose the dataset

In [5]:
# choose the dataset
get_dataset_fn = demo_mouse_enhancers_dset
train_dset = get_dataset_fn('train', force_download=False)


Reference /home/jupyter/.genomic_benchmarks/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa.gz already exists. Skipping.




  0%|          | 0/21 [00:00<?, ?it/s]

## Tokenizer and vocab

In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, build_vocab_from_iterator
from collections import Counter 

class LetterTokenizer():
    def __init__(self, **kwargs):
        pass
    def __call__(self, items):
        if isinstance(items, str):
            return self.__tokenize_str(items)
        else:
            return (self.__tokenize_str(t) for t in items)
    def __tokenize_str(self, t):
        tokenized = list(t.replace("\n",""))
        tokenized.append('<eos>')
        tokenized.insert(0,'<bos>')
        return tokenized

tokenizer = get_tokenizer(LetterTokenizer())

def build_vocab(dataset, tokenizer):
    counter = Counter()
    for i in range(len(dataset)):
        counter.update(tokenizer(dataset[i][0]))
    print(counter.most_common())
    builded_voc = vocab(counter)
    builded_voc.append_token('<pad>')
    return builded_voc

# todo: why build fn does not work as expected (iterator argument)
#     return build_vocab_from_iterator(
#         iterator = counter, 
#         specials = ['<unk>', '<pad>', '<bos>', '<eos>'],
#         special_first = True)

vocabulary = build_vocab(train_dset, tokenizer)
print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

[('N', 664278), ('T', 436517), ('A', 436144), ('C', 364382), ('G', 360709), ('<bos>', 968), ('<eos>', 968)]
vocab len: 8
{'N': 5, '<pad>': 7, 'T': 3, 'C': 4, 'A': 2, '<eos>': 6, 'G': 1, '<bos>': 0}


In [18]:
# tokenizer(train_dset[0][0])

# labels statistics
zeros, ones = 0, 0
for i in range(len(train_dset)):
    if train_dset[i][1] == 0: 
        zeros += 1
    if train_dset[i][1] == 1: 
        ones += 1
print("zeros, ", zeros)
print("ones, ", ones)

zeros,  484
ones,  484


### use collate with padding

In [8]:
# use collate with padding
input_len = max([len(train_dset[i][0]) for i in range(len(train_dset))])
print("input_len ", input_len)
# padding_coll_factory(longest_length, vocab, tokenizer):
collate = padding_coll_factory(input_len, vocabulary, tokenizer)

train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, collate_fn=collate)


input_len  4707


In [9]:
print(next(iter(train_loader))[0][0])
print(next(iter(train_loader))[1][0])
print(next(iter(train_loader))[0][2])
print(next(iter(train_loader))[1][2])

  x = torch.tensor(tmp, dtype=torch.long)


tensor([0, 3, 2,  ..., 7, 7, 7], device='cuda:0')
tensor([1.], device='cuda:0')
tensor([0, 3, 1,  ..., 7, 7, 7], device='cuda:0')
tensor([1.], device='cuda:0')


## Training

In [10]:
# __init__(self, number_of_classes, vocab_size, embedding_dim, context_size):
model = NeuralNetwork(
    number_of_classes=1,
    vocab_size=vocabulary.__len__(),
    embedding_dim=100,
    context_size=input_len
).cuda()
model.train(train_loader, epochs=10)

Epoch 0


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


Train metrics: 
 Accuracy: 56.8%, Avg loss: 0.669036 

Epoch 1
Train metrics: 
 Accuracy: 64.2%, Avg loss: 0.638177 

Epoch 2
Train metrics: 
 Accuracy: 70.2%, Avg loss: 0.617051 

Epoch 3
Train metrics: 
 Accuracy: 77.2%, Avg loss: 0.603379 

Epoch 4
Train metrics: 
 Accuracy: 78.6%, Avg loss: 0.599723 

Epoch 5
Train metrics: 
 Accuracy: 72.8%, Avg loss: 0.606703 

Epoch 6
Train metrics: 
 Accuracy: 79.6%, Avg loss: 0.597159 

Epoch 7
Train metrics: 
 Accuracy: 80.6%, Avg loss: 0.591696 

Epoch 8
Train metrics: 
 Accuracy: 79.4%, Avg loss: 0.583165 

Epoch 9
Train metrics: 
 Accuracy: 81.6%, Avg loss: 0.580159 



## Testing

In [11]:
test_dset = get_dataset_fn('test', force_download=False)
test_loader = DataLoader(test_dset, batch_size=32, shuffle=True, collate_fn=collate)
model.test(test_loader)

Reference /home/jupyter/.genomic_benchmarks/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa.gz already exists. Skipping.




  0%|          | 0/21 [00:00<?, ?it/s]

test_loss  4.867299497127533
num_batches 8
correct 179
size 242
Test Error: 
 Accuracy: 74.0%, Avg loss: 0.608412 

