# PYTORCH CNN Classifier

To run this notebook on another benchmark, use:

```
papermill utils/torch_cnn_classifier.ipynb torch_cnn_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [1]:
# Run parameters. These are the values papermill overrides by name
# (e.g. `-p DATASET <name>`), so they are kept as plain literal assignments.
# DATASET = 'no_dataset'  # alternative placeholder value, currently disabled
# Benchmark name; passed to get_dataset and used in the checkpoint name.
DATASET = 'demo_human_or_worm'
# Stored in the config as "dataset_version".
VERSION = 0
# Batch size used for the train, validation and test DataLoaders.
BATCH_SIZE = 32
# Passed to model.train as the `epochs` argument.
EPOCHS = 10
# Run identifier embedded in the checkpoint / experiment name.
ITER = 999
# Passed to model.train as `patience` (presumably the early-stopping tolerance
# in epochs without validation improvement — TODO confirm against CNN.train).
PATIENCE = 1

In [2]:
# Echo the effective run parameters so they are visible in the executed notebook.
run_parameters = (DATASET, VERSION, BATCH_SIZE, EPOCHS, ITER, PATIENCE)
print(*run_parameters)

demo_human_or_worm 0 32 10 999 1


## Config

In [3]:
# must be imported first
from comet_ml import Experiment

import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.models.torch import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS

In [4]:
# Padding is switched on when the dataset is listed among the variable-length ones.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

# All run settings in one place; this dict is also what gets logged to Comet.
config = dict(
    dataset=DATASET,
    dataset_version=VERSION,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    use_padding=USE_PADDING,
    force_download=False,
    run_on_gpu=True,
    number_of_classes=2,
    embedding_dim=100,
    patience=PATIENCE,
)

# Name prefix shared by the saved checkpoint and the Comet experiment name.
checkpoint_name = config["dataset"] + f"_{ITER}_"
print(checkpoint_name)

# check_config(config)

demo_human_or_worm_999_


In [5]:
# Credentials must not be committed with the notebook: the Comet API key is read
# from the COMET_API_KEY environment variable. When the variable is unset, None is
# passed and comet_ml falls back to its own configuration lookup.
# NOTE(review): the key that was previously hardcoded here has been exposed and
# should be revoked in the Comet account settings.
experiment = Experiment(
    project_name="genomic-cnn",
    api_key=os.environ.get("COMET_API_KEY"),
)

# Record the full run configuration alongside the experiment.
experiment.log_parameters(config)

# Experiment name: checkpoint prefix plus the early-stopping / epoch settings.
experiment.set_name(
    checkpoint_name
    + "patience:" + str(config["patience"])
    + "_"
    + "epochs:" + str(config["epochs"])
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/davidcechak/genomic-cnn/fbb010b6be0b4c4585b3cca225a3ddca



## Choose the dataset

In [6]:
import torch.utils.data as data

# Fixed seed so the train/validation partition is identical on every run;
# without it random_split draws from the global RNG and the split changes
# each time the notebook is executed.
SPLIT_SEED = 42

train_dset = get_dataset(config["dataset"], 'train')

# 80 % of the training set is used for fitting, the remainder for validation.
t_size = int(len(train_dset)*0.8)
v_size = len(train_dset)-t_size

new_train_dset, valid_dset = data.random_split(
    train_dset,
    [t_size, v_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


## Tokenizer and vocab

In [7]:
# Wrap the project's LetterTokenizer with torchtext's get_tokenizer.
letter_tokenizer = LetterTokenizer()
tokenizer = get_tokenizer(letter_tokenizer)

# The vocabulary is built from the full (pre-split) training dataset.
vocabulary = build_vocab(
    train_dset,
    tokenizer,
    use_padding=config["use_padding"],
)

# Show the vocabulary size and the token -> index mapping.
print("vocab len:", len(vocabulary))
token_to_index = vocabulary.get_stoi()
print(token_to_index)

vocab len: 9
{'<pad>': 8, 'T': 5, 'A': 4, 'C': 3, '<eos>': 6, 'G': 2, '<bos>': 1, 'N': 7, '<unk>': 0}


## Dataloader and batch preparation

In [8]:
# Use the GPU only when the config asks for it and CUDA is actually available.
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Collate function: pad to the network input length only when padding is enabled,
# otherwise pass None so no target length is requested.
pad_to_length = nn_input_len if config["use_padding"] else None
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length=pad_to_length)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    new_train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)
# Validation is evaluation only: keep a fixed order so the batches (including the
# last, possibly smaller one) are the same in every epoch and across runs.
valid_loader = DataLoader(
    valid_dset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate,
)


Using cuda device
max_seq_len  200
not all sequences are of the same length


## Model

In [9]:
# Constructor arguments for the CNN, taken from the config, the vocabulary size
# and the input length reported by check_seq_lengths.
cnn_arguments = dict(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len,
)

# Build the network and move it to the selected device.
model = CNN(**cnn_arguments).to(device)

## Training

In [10]:
# Keyword settings for the training run: epoch budget, patience, the checkpoint
# name prefix and the Comet experiment that receives the metrics.
training_options = dict(
    epochs=config["epochs"],
    patience=config["patience"],
    checkpoint_name=checkpoint_name,
    experiment=experiment,
)

# Train on the training loader while evaluating on the validation loader.
model.train(train_loader, valid_loader, **training_options)

1.7976931348623157e+308
Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Valid metrics: 
 Accuracy: 91.9%, Avg loss: 0.545157 

Train metrics: 
 Accuracy: 91.9%, Avg loss: 0.541906 

new best valid loss in epoch 0 -> saving new checkpoint
Epoch 1
Valid metrics: 
 Accuracy: 90.3%, Avg loss: 0.547571 

Train metrics: 
 Accuracy: 90.1%, Avg loss: 0.544864 

not improved for epochs: 1
ending training
loading best model from epoch 0


## Testing

In [11]:
# Load the held-out test split of the same benchmark.
test_dset = get_dataset(config["dataset"], 'test')

# Evaluation only: no shuffling, so the batches are identical on every run and
# the reported metrics are reproducible.
test_loader = DataLoader(
    test_dset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# model.test returns the accuracy and the F1 score; results are also passed to
# the Comet experiment.
acc, f1 = model.test(test_loader, experiment = experiment)
acc, f1

p  12500 ; tp  11415.738284111023 ; fp  947.0352419841711
recall  0.9132590627288818 ; precision  0.923396215259854
num_batches 782
correct 22997
size 25000
Test metrics: 
 Accuracy: 0.919880, F1 score: 0.918300, Avg loss: 0.542281 



(0.91988, 0.918299663722506)

In [None]:
# Close the Comet experiment opened in the config section now that the run is finished.
experiment.end()