In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import DemoCodingVsIntergenomicSeqs
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config 
from cnn_model import CNN

## Config

In [3]:
config = {
    "use_padding": False,
    "run_on_gpu": True,
    "dataset": DemoCodingVsIntergenomicSeqs,
    "number_of_classes": 2,
    "dataset_version": 0,
    "force_download": False,
    "epochs": 15,
    "embedding_dim": 100,
    "batch_size": 32,
#   vocabulary that is not present in the training set but is present in the test set
    "vocab_to_add": ["N"],
}
check_config(config)

config is correct


## Choose the dataset

In [4]:
get_dataset_fn = config["dataset"]
train_dset = get_dataset_fn('train', force_download=config["force_download"], version=config["dataset_version"])

Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.


  0%|          | 0/190000 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

## Tokenizer and vocab

In [5]:
tokenizer = get_tokenizer(LetterTokenizer())
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

vocab len: 6
{'C': 4, 'A': 3, '<eos>': 5, 'G': 2, 'T': 1, '<bos>': 0}


## Dataloader and batch preparation

In [6]:
# Run on GPU or CPU
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Data Loader
if(config["use_padding"]):
    collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = nn_input_len)
else:
    collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = None)

train_loader = DataLoader(train_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

Using cuda device
max_seq_len  200


## Model

In [7]:
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=vocabulary.__len__(),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len
).to(device)

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


## Training

In [8]:
model.train(train_loader, epochs=config["epochs"])

Epoch 0
Train metrics: 
 Accuracy: 85.5%, Avg loss: 0.569816 

Epoch 1
Train metrics: 
 Accuracy: 86.3%, Avg loss: 0.566543 

Epoch 2
Train metrics: 
 Accuracy: 85.8%, Avg loss: 0.563771 

Epoch 3
Train metrics: 
 Accuracy: 84.6%, Avg loss: 0.571128 

Epoch 4
Train metrics: 
 Accuracy: 87.1%, Avg loss: 0.564486 

Epoch 5
Train metrics: 
 Accuracy: 87.8%, Avg loss: 0.559059 

Epoch 6
Train metrics: 
 Accuracy: 87.6%, Avg loss: 0.558720 

Epoch 7
Train metrics: 
 Accuracy: 88.4%, Avg loss: 0.557034 

Epoch 8
Train metrics: 
 Accuracy: 86.6%, Avg loss: 0.559045 

Epoch 9
Train metrics: 
 Accuracy: 88.3%, Avg loss: 0.555232 

Epoch 10
Train metrics: 
 Accuracy: 87.3%, Avg loss: 0.557306 

Epoch 11
Train metrics: 
 Accuracy: 88.7%, Avg loss: 0.559108 

Epoch 12
Train metrics: 
 Accuracy: 88.1%, Avg loss: 0.555307 

Epoch 13
Train metrics: 
 Accuracy: 88.2%, Avg loss: 0.555081 

Epoch 14
Train metrics: 
 Accuracy: 88.3%, Avg loss: 0.553821 



## Testing

In [9]:
test_dset = get_dataset_fn('test', force_download=config["force_download"], version=config["dataset_version"])
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

model.test(test_loader)

Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.cdna.all.fa.gz already exists. Skipping.
Reference /home/jupyter/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.


  0%|          | 0/190000 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

test_loss  435.92515167593956
num_batches 782
correct 21907
size 25000
Test Error: 
 Accuracy: 87.6%, Avg loss: 0.557449 



In [17]:
def export_evaluation(model, dataloader):
    size = dataloader.dataset.__len__()
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += model.loss(pred, y).item()
            correct += (torch.round(pred) == y).sum().item()

    print('test_loss ', test_loss)
    print('num_batches', num_batches)
    print('correct', correct)
    print('size', size)

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [18]:
export_evaluation(model, test_loader)

test_loss  436.8054354786873
num_batches 782
correct 21834
size 25000
Test Error: 
 Accuracy: 87.3%, Avg loss: 0.558575 

