In [1]:
# Default run parameters for this notebook.
# DATASET is reassigned by the "# Parameters" cell that follows
# (presumably injected by a notebook parameterisation tool — TODO confirm).
DATASET = 'no_dataset'
# Stored in the config as "dataset_version".
VERSION = 0
# Stored in the config as "batch_size" and used by both DataLoaders.
BATCH_SIZE = 32
# Stored in the config as "epochs" and forwarded to model.train.
EPOCHS = 10

In [2]:
# Parameters
# Overrides the default DATASET assigned in the first cell for this run.
DATASET = "human_nontata_promoters"


In [3]:
# Echo the effective run parameters (after any override) as one space-separated line.
print(f"{DATASET} {VERSION} {BATCH_SIZE} {EPOCHS}")

human_nontata_promoters 0 32 10


## Config

In [4]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.models.torch_cnn import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS
from genomic_benchmarks.data_check import list_datasets, info


In [5]:
# Tokens the vocabulary needs beyond those seen in the training split:
# for human_nontata_promoters, "N" is present in the test set but not in the training set.
VOCAB_TO_ADD = ["N"] if DATASET == "human_nontata_promoters" else []

# Padding is enabled only for datasets listed in VARIABLE_LENGTH_DATASETS.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

# Single source of settings read by the cells below.
config = dict(
    use_padding=USE_PADDING,
    run_on_gpu=True,
    dataset=DATASET,
    number_of_classes=2,
    dataset_version=VERSION,
    force_download=False,
    epochs=EPOCHS,
    embedding_dim=100,
    batch_size=BATCH_SIZE,
    # vocabulary that is not present in the training set but is present in the test set
    vocab_to_add=VOCAB_TO_ADD,
)
# NOTE(review): config validation is left disabled, as in the original; the reason is not visible here.
# check_config(config)

## Choose the dataset

In [6]:
# Load the training split of the configured dataset (the captured output shows it being downloaded and unzipped).
# NOTE(review): config["dataset_version"] and config["force_download"] are not passed to
# get_dataset here — confirm whether this call should receive them.
train_dset = get_dataset(config["dataset"], 'train')



Downloading 1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4 into /home/jupyter/.genomic_benchmarks/translated_human_nontata_promoters/human_nontata_promoters.zip... 

Done.
Unzipping...

Done.


## Tokenizer and vocab

In [7]:
# Build the tokenizer and derive the vocabulary from the training split.
tokenizer = get_tokenizer(LetterTokenizer())
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Add the extra tokens requested in the config. Vocab.append_token raises a
# RuntimeError for a token that already exists, so tokens the vocabulary
# already knows are skipped instead of aborting the run.
for token in config["vocab_to_add"] or []:
    if token not in vocabulary:
        vocabulary.append_token(token)

print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 7
{'N': 6, 'T': 3, '<eos>': 5, 'G': 2, 'C': 4, 'A': 1, '<bos>': 0}


## Dataloader and batch preparation

In [8]:
# Pick the device: CUDA only when it is both requested in the config and available.
device = 'cpu'
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
print(f'Using {device} device')

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Collate function: pad to the network input length only when padding is enabled.
pad_to_length = nn_input_len if config["use_padding"] else None
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length=pad_to_length)

# Shuffled training batches, built with the collate function above.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cuda device


max_seq_len  251


## Model

In [9]:
# Instantiate the CNN from the config, the vocabulary size and the input length
# computed by check_seq_lengths, then move it to the selected device.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len,
)
model = model.to(device)

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


## Training

In [10]:
# Train on the training loader for the configured number of epochs; per-epoch metrics are printed.
# NOTE(review): train is called with a DataLoader and `epochs`, so the imported CNN class
# presumably defines its own train method rather than torch.nn.Module.train(mode) — confirm.
model.train(train_loader, epochs=config["epochs"])

Epoch 0


Train metrics: 
 Accuracy: 80.3%, Avg loss: 0.617644 

Epoch 1


Train metrics: 
 Accuracy: 82.5%, Avg loss: 0.603673 

Epoch 2


Train metrics: 
 Accuracy: 82.2%, Avg loss: 0.600597 

Epoch 3


Train metrics: 
 Accuracy: 84.2%, Avg loss: 0.595580 

Epoch 4


Train metrics: 
 Accuracy: 82.5%, Avg loss: 0.597363 

Epoch 5


Train metrics: 
 Accuracy: 79.5%, Avg loss: 0.606323 

Epoch 6


Train metrics: 
 Accuracy: 82.5%, Avg loss: 0.597279 

Epoch 7


Train metrics: 
 Accuracy: 83.4%, Avg loss: 0.594507 

Epoch 8


Train metrics: 
 Accuracy: 84.8%, Avg loss: 0.592432 

Epoch 9


Train metrics: 
 Accuracy: 85.6%, Avg loss: 0.587404 



## Testing

In [11]:
# Evaluate on the test split, reusing the collate function built for training.
test_dset = get_dataset(config["dataset"], 'test')
# Evaluation does not need shuffling; a fixed batch order keeps the run repeatable.
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

model.test(test_loader)

Downloading 1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4 into /home/jupyter/.genomic_benchmarks/translated_human_nontata_promoters/human_nontata_promoters.zip... 

Done.
Unzipping...

Done.


test_loss  169.81652423739433
num_batches 283
correct 7498
size 9034
Test Error: 
 Accuracy: 83.0%, Avg loss: 0.600058 

