# PYTORCH CNN Classifier

To run this notebook on another benchmark, use:

```
papermill utils/torch_cnn_classifier.ipynb torch_cnn_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [1]:
# Default run parameters. The papermill command shown at the top of this
# notebook overrides them with `-p NAME VALUE`.
# DATASET = 'no_dataset'
DATASET = 'demo_human_or_worm'  # benchmark name, stored in config["dataset"]
VERSION = 0  # stored in config["dataset_version"]
BATCH_SIZE = 32  # batch size used by the train and test DataLoaders
EPOCHS = 1  # number of epochs passed to model.train

In [2]:
# Parameters
# These assignments run after the defaults above and replace them; this looks
# like the cell papermill injects for `-p` arguments (TODO confirm).
DATASET = "demo_human_or_worm"
EPOCHS = 10


In [3]:
# Echo the effective parameter values (after any overrides) into the run log.
print(DATASET, VERSION, BATCH_SIZE, EPOCHS)

demo_human_or_worm 0 32 10


## Config

In [4]:
import os

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.models.torch import CNN

In [5]:
# Padding is enabled only for datasets listed as variable-length.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

# Settings that depend on the run parameters.
config = {
    "dataset": DATASET,
    "dataset_version": VERSION,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "use_padding": USE_PADDING,
}
# Settings that are fixed for this notebook.
config.update({
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
})
check_config(config)

## Choose the dataset

In [6]:
# Fetch the benchmark named in the config. In the recorded run the call
# returns the dataset's local directory as a Path, which is shown as the
# cell output.
# NOTE(review): config["dataset_version"] and config["force_download"] are not
# forwarded to this call — confirm whether download_dataset should receive them.
download_dataset(config["dataset"])



Reference /home/katarina/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.
Downloading http://ftp.ensembl.org/pub/release-104/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz


/home/katarina/.genomic_benchmarks/fasta/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz: 30.3MB [00:07, 4.13MB/s]                            
100%|██████████| 24/24 [00:31<00:00,  1.31s/it]
7it [00:01,  6.76it/s]


PosixPath('/home/katarina/.genomic_benchmarks/demo_human_or_worm')

In [7]:
# Load the 'train' split of the configured benchmark.
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [8]:
# Build the tokenizer and derive the vocabulary from the training split.
tokenizer = get_tokenizer(LetterTokenizer())
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Report the vocabulary size and the token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 9
{'<pad>': 8, 'T': 5, 'C': 4, 'A': 3, '<eos>': 6, 'G': 2, '<bos>': 1, 'N': 7, '<unk>': 0}


## Dataloader and batch preparation

In [9]:
# Select the device: CUDA only when it is both requested in the config and available.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Collate function: batches are padded to the network input length only when
# padding is enabled in the config; otherwise no padding length is given.
collate = coll_factory(
    vocabulary,
    tokenizer,
    device,
    pad_to_length=nn_input_len if config["use_padding"] else None,
)

# Training batches are drawn in shuffled order.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cpu device
max_seq_len  200
not all sequences are of the same length


## Model

In [10]:
# Instantiate the CNN from the config, the vocabulary size and the network
# input length, then move it to the selected device.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len,
).to(device)

## Training

In [11]:
# Train for the configured number of epochs; the recorded output shows
# accuracy and average loss printed once per epoch.
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 92.0%, Avg loss: 0.542888 

Epoch 1
Train metrics: 
 Accuracy: 91.9%, Avg loss: 0.544801 

Epoch 2
Train metrics: 
 Accuracy: 90.9%, Avg loss: 0.541808 

Epoch 3
Train metrics: 
 Accuracy: 91.5%, Avg loss: 0.542723 

Epoch 4
Train metrics: 
 Accuracy: 90.6%, Avg loss: 0.544790 

Epoch 5
Train metrics: 
 Accuracy: 93.0%, Avg loss: 0.536543 

Epoch 6
Train metrics: 
 Accuracy: 93.1%, Avg loss: 0.535183 

Epoch 7
Train metrics: 
 Accuracy: 93.7%, Avg loss: 0.533798 

Epoch 8
Train metrics: 
 Accuracy: 92.8%, Avg loss: 0.536554 

Epoch 9
Train metrics: 
 Accuracy: 93.1%, Avg loss: 0.535252 



## Testing

In [12]:
# Load the 'test' split and batch it with the same collate function used for training.
test_dset = get_dataset(config["dataset"], 'test')
# Evaluation does not need shuffling; a fixed batch order keeps repeated runs comparable.
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

# model.test returns the accuracy and F1 score; the trailing expression
# displays both as the cell output.
acc, f1 = model.test(test_loader)
acc, f1

p  12500 ; tp  11324.584869861603 ; fp  656.8692091630475
recall  0.9059667895889282 ; precision  0.9451761693672059
num_batches 782
correct 23184
size 25000
Test metrics: 
 Accuracy: 0.927360, F1 score: 0.925156, Avg loss: 0.537190 



(0.92736, 0.9251562291444396)