In [1]:
# Load IPython's autoreload extension so that edits to the local helper
# modules imported below (utils, cnn_model) are picked up without a kernel restart.
%load_ext autoreload
# Mode 2: re-import all modules before each cell is executed.
%autoreload 2

In [3]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import DemoCodingVsIntergenomicSeqs
from utils import coll_factory, LetterTokenizer, build_vocab
from cnn_model import CNN

## Choose the dataset

In [4]:
# Single switch for the benchmark dataset: every split in this notebook is
# created through this constructor.
get_dataset_fn = DemoCodingVsIntergenomicSeqs

# Training split, version 0; force_download=False presumably reuses files that
# are already present locally (verify against genomic_benchmarks).
train_dset = get_dataset_fn(
    'train',
    force_download=False,
    version=0,
)

Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.


  0%|          | 0/190000 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

## Tokenizer and vocab

In [5]:
# Tokenizer: the project's LetterTokenizer passed through torchtext's get_tokenizer.
# NOTE(review): presumably splits a sequence into single nucleotide letters — confirm in utils.
tokenizer = get_tokenizer(LetterTokenizer())
# Vocabulary is built from the training split only, without a padding token.
vocabulary = build_vocab(train_dset, tokenizer, use_padding=False)

# Sanity check: vocabulary size and the token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 6
{'C': 4, 'A': 3, '<eos>': 5, 'G': 2, 'T': 1, '<bos>': 0}


## Batch preparation with collate

In [6]:
# Toggle: set to False to stay on the CPU even when a GPU is present.
RUN_ON_GPU = True

# Use CUDA only when it is both requested and actually available.
if RUN_ON_GPU and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

# Batch-building function created from the vocabulary, tokenizer and target device.
collate = coll_factory(vocabulary, tokenizer, device)

# Shuffled mini-batches of 32 training examples.
train_loader = DataLoader(
    train_dset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate,
)

Using cuda device


## Model

In [7]:
# Longest sequence in the training split (element 0 of each item is presumably
# the raw sequence string — confirm against the dataset class).
# A generator avoids materialising a list with one length per training example.
max_seq_len = max(len(train_dset[i][0]) for i in range(len(train_dset)))
print("max_seq_len ", max_seq_len)
# The tokenizer adds '<bos>' and '<eos>' to every sequence, so the network
# input is two tokens longer than the longest raw sequence.
nn_input_len = max_seq_len + 2

# Single-output CNN classifier over token embeddings, moved to the selected device.
model = CNN(
    number_of_classes=1,
    vocab_size=len(vocabulary),
    embedding_dim=100,
    input_len=nn_input_len,
).to(device)

max_seq_len  200


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


## Training

In [8]:
# Number of passes over the training loader.
N_EPOCHS = 10

# NOTE(review): CNN.train is called with a loader and an epoch count, so it appears
# to be a project-defined training loop rather than nn.Module.train(mode) — confirm in cnn_model.
model.train(train_loader, epochs=N_EPOCHS)

Epoch 0
Train metrics: 
 Accuracy: 85.6%, Avg loss: 0.567541 

Epoch 1
Train metrics: 
 Accuracy: 86.2%, Avg loss: 0.567203 

Epoch 2
Train metrics: 
 Accuracy: 86.2%, Avg loss: 0.563248 

Epoch 3
Train metrics: 
 Accuracy: 86.0%, Avg loss: 0.563567 

Epoch 4
Train metrics: 
 Accuracy: 86.2%, Avg loss: 0.562284 

Epoch 5
Train metrics: 
 Accuracy: 86.6%, Avg loss: 0.564464 

Epoch 6
Train metrics: 
 Accuracy: 87.9%, Avg loss: 0.559640 

Epoch 7
Train metrics: 
 Accuracy: 86.8%, Avg loss: 0.559113 

Epoch 8
Train metrics: 
 Accuracy: 88.0%, Avg loss: 0.557585 

Epoch 9
Train metrics: 
 Accuracy: 85.8%, Avg loss: 0.563001 



## Testing

In [9]:
# Held-out split, fetched with the same options as the training split.
test_dset = get_dataset_fn('test', force_download=False, version=0)
# Shuffling adds nothing to aggregate evaluation metrics; a fixed batch order
# keeps evaluation runs repeatable.
test_loader = DataLoader(test_dset, batch_size=32, shuffle=False, collate_fn=collate)

# Evaluate on the test loader; the model's test() prints loss and accuracy.
model.test(test_loader)

Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.


  0%|          | 0/190000 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

test_loss  444.18676272034645
num_batches 782
correct 21209
size 25000
Test Error: 
 Accuracy: 84.8%, Avg loss: 0.568014 

