# PyTorch CNN Classifier

To run this notebook on another benchmark, use

```
papermill utils/torch_cnn_classifier.ipynb torch_cnn_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [1]:
# Run parameters. These are the defaults; the papermill command shown at the top
# of the notebook overrides them with `-p NAME VALUE` (presumably this is the
# cell tagged `parameters` — confirm in the notebook metadata).
#
# Benchmark to train on; the commented lines are alternatives for quick switching.
# DATASET = 'no_dataset'
DATASET = 'human_ensembl_regulatory'
# DATASET = 'human_enhancers_ensembl'
VERSION = 0  # stored in config as "dataset_version"
BATCH_SIZE = 32  # stored in config as "batch_size" and handed to the DataLoaders
EPOCHS = 1  # stored in config as "epochs" and handed to model.train

In [2]:
# Echo the effective run parameters so the executed notebook records them.
print(f"{DATASET} {VERSION} {BATCH_SIZE} {EPOCHS}")

human_ensembl_regulatory 0 32 1


In [3]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from pathlib import Path

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.models.torch import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS
from genomic_benchmarks.data_check import is_downloaded, info

## Config

In [4]:
# Padding is switched on only for benchmarks listed as variable-length.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

# Build the run configuration one key at a time (same keys, values and
# insertion order as a single literal would give).
config = {}

# Which data to load and how to batch it.
config["dataset"] = DATASET
config["dataset_version"] = VERSION
config["epochs"] = EPOCHS
config["batch_size"] = BATCH_SIZE
config["use_padding"] = USE_PADDING
config["force_download"] = False

# Hardware and model settings.
config["run_on_gpu"] = True
config["number_of_classes"] = 3
config["embedding_dim"] = 100

# Validate the assembled configuration before any later cell reads it.
check_config(config)

## Choose the dataset

In [5]:
# Print a summary of the selected benchmark: its classes, sequence-length
# statistics and the number of train/test sequences per class.
# NOTE(review): `local_repo=True` presumably makes the lookup use a local
# checkout of the benchmarks repository (the saved output prints a path under
# the user's home directory) — confirm against `data_check.info`.
info(config["dataset"], local_repo=True)

path exists  False
local_repo  True
DATASET_DIR_PATH exists  True
status code  False
_check_dataset_existence
local_repo  True
/home/davidcechak/genomic_benchmarks/datasets/human_ensembl_regulatory




Dataset `human_ensembl_regulatory` has 3 classes: enhancer, ocr, promoter.

The length of genomic intervals ranges from 71 to 802, with average 429.91753643694585 and median 401.0.

Totally 289061 sequences have been found, 231348 for training and 57713 for testing.


Unnamed: 0,train,test
enhancer,85512,21378
ocr,69902,17476
promoter,75934,18859


In [6]:
# Load the training split of the configured benchmark.
# The download policy comes from `config["force_download"]` rather than a
# hard-coded `True`, so the configuration is honoured and an existing copy
# is not fetched again on every run.
train_dset = get_dataset(
    config["dataset"],
    'train',
    force_download=config["force_download"],
    local_repo=True,
)
train_dset

<genomic_benchmarks.dataset_getters.pytorch_datasets.GenomicClfDataset at 0x7fa0fe724b90>

In [7]:
# Debugging shortcut: keep only the first MAX_TRAIN_SAMPLES training examples so
# the notebook runs quickly. Set MAX_TRAIN_SAMPLES = None to train on everything.
# NOTE(review): a prefix is presumably not a class-balanced sample (it depends on
# how `all_paths` is ordered) — verify before trusting metrics from a truncated run.
MAX_TRAIN_SAMPLES = 100

if MAX_TRAIN_SAMPLES is not None:
    # Slice paths and labels together so they stay aligned.
    train_dset.all_paths = train_dset.all_paths[:MAX_TRAIN_SAMPLES]
    train_dset.all_labels = train_dset.all_labels[:MAX_TRAIN_SAMPLES]

# Show how many training examples remain.
len(train_dset)

100

## Tokenizer and vocab

In [8]:
# Tokenizer built from the project's LetterTokenizer.
tokenizer = get_tokenizer(LetterTokenizer())

# Vocabulary built from the training set; padding is enabled only when the
# configuration asks for it.
vocabulary = build_vocab(
    train_dset,
    tokenizer,
    use_padding=config["use_padding"],
)

# Report the vocabulary size and the token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 8
{'<pad>': 7, 'T': 5, '<eos>': 6, 'G': 4, 'A': 3, 'C': 2, '<bos>': 1, '<unk>': 0}


## Dataloader and batch preparation

In [9]:
# Use CUDA only when the config requests it and a GPU is actually available.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

# Sequence-length check on the training set; the second value is the input
# length later passed to the network and to the collate function.
max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Collate function for batching: pad to the network input length when padding
# is enabled, otherwise pass None.
collate = coll_factory(
    vocabulary,
    tokenizer,
    device,
    pad_to_length=nn_input_len if config["use_padding"] else None,
)

# Shuffled loader over the training set.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cuda device
max_seq_len  572
not all sequences are of the same length


## Model

In [10]:
# Instantiate the CNN from the run configuration and the vocabulary built earlier.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len,
)
# Move the model to the selected device.
model = model.to(device)

number_of_classes > 2
None
<function cross_entropy at 0x7fa10229ad40>


## Training

In [11]:
# Train the model on `train_loader` for the configured number of epochs.
# NOTE(review): `train` is called with a dataloader here, so it is presumably the
# project's own training loop on the CNN class rather than `nn.Module.train(mode)`
# — confirm in genomic_benchmarks.models.torch.
model.train(train_loader, epochs=config["epochs"])

Epoch 0
train  1
32
32
tensor(1.4044, device='cuda:0', grad_fn=<NllLossBackward0>)
train  2
32
32
tensor(0.3582, device='cuda:0', grad_fn=<NllLossBackward0>)
train  3
32
32
tensor(0.0650, device='cuda:0', grad_fn=<NllLossBackward0>)
train  4
4
continue
count  5
32
32
count  6
32
32
count  7
32
32
count  8
continue
Train metrics: 
 Accuracy: 0.0%, Avg loss: 0.008288 



  x = torch.tensor(pad(x), dtype=torch.long)


## Testing

In [12]:
# Load the held-out split from the local repository, like the training split,
# and take the download policy from the configuration.
test_dset = get_dataset(
    config["dataset"],
    'test',
    force_download=config["force_download"],
    local_repo=True,
)

# Evaluation does not need shuffling; a fixed order keeps runs reproducible.
test_loader = DataLoader(
    test_dset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# NOTE(review): the saved run of this cell failed with
# "RuntimeError: 0D or 1D target tensor expected, multi-target not supported".
# Presumably the labels reach the cross-entropy loss inside `model.test` with an
# extra dimension; the cause is in the model's test loop, not in this cell —
# investigate there.
acc, f1 = model.test(test_loader)
acc, f1

RuntimeError: 0D or 1D target tensor expected, multi-target not supported