# PYTORCH CNN Classifier

To run this notebook on another benchmark, use

```
papermill utils/torch_cnn_classifier.ipynb torch_cnn_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [1]:
# Default run parameters. The papermill command shown above overrides DATASET
# via `-p DATASET [DATASET NAME]`.
# DATASET = 'no_dataset'
DATASET = 'demo_human_or_worm'  # name of the benchmark dataset to train on
VERSION = 0  # only echoed in the summary print below; presumably a dataset version - TODO confirm
BATCH_SIZE = 64  # batch size passed to both DataLoaders
EPOCHS = 1  # number of epochs passed to model.fit

In [2]:
# Parameters
# These assignments replace the defaults set in the previous cell
# (presumably injected by papermill from `-p` options - confirm against the run command).
DATASET = "human_ensembl_regulatory"
EPOCHS = 10


In [3]:
# Echo the effective settings so the executed notebook records what it was run with.
print(DATASET, VERSION, BATCH_SIZE, EPOCHS)

human_ensembl_regulatory 0 64 10


## Config

In [4]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.data_check import is_downloaded, info
from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.models.torch import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Enable padding only when the chosen dataset is listed in VARIABLE_LENGTH_DATASETS.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

## Choose the dataset

In [6]:
# Download the dataset only when is_downloaded() reports it is not available yet,
# so re-running this cell does not fetch it again.
if not is_downloaded(DATASET):
    download_dataset(DATASET, local_repo=True)

Downloading...
From: https://drive.google.com/uc?id=1GefcGAM-tklnmzfzZ9RC91tGDmOzrAXU
To: /home/jovyan/.genomic_benchmarks/human_ensembl_regulatory.zip
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110M/110M [00:01<00:00, 87.8MB/s]


In [7]:
# Print a summary of the dataset: class names, interval length statistics,
# and per-class train/test counts.
info(DATASET, local_repo=True)

Dataset `human_ensembl_regulatory` has 3 classes: enhancer, ocr, promoter.

The length of genomic intervals ranges from 71 to 802, with average 429.91753643694585 and median 401.0.

Totally 289061 sequences have been found, 231348 for training and 57713 for testing.


Unnamed: 0,train,test
enhancer,85512,21378
ocr,69902,17476
promoter,75934,18859


In [8]:
# Load the training split; the result is indexed by position (train_dset[i]) in the next cell.
train_dset = get_dataset(DATASET, 'train')

In [9]:
# Count the distinct values found in the second field of every training item
# (that field is treated here as the class label).
NUM_CLASSES = len(
    {train_dset[idx][1] for idx in range(len(train_dset))}
)
NUM_CLASSES

3

## Tokenizer and vocab

In [10]:
# Build the tokenizer from LetterTokenizer, then derive the vocabulary from the
# training split (special padding handling is controlled by USE_PADDING).
tokenizer = get_tokenizer(LetterTokenizer())
vocabulary = build_vocab(train_dset, tokenizer, use_padding=USE_PADDING)

# Show the vocabulary size and the token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 9
{'<pad>': 8, 'T': 5, '<eos>': 6, 'G': 3, 'C': 4, 'A': 2, '<bos>': 1, 'N': 7, '<unk>': 0}


## Dataloader and batch preparation

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, use_padding=USE_PADDING)

# Collate function: batches are padded to nn_input_len only when padding is enabled;
# otherwise no target length is passed.
collate = coll_factory(
    vocabulary,
    tokenizer,
    device,
    pad_to_length=nn_input_len if USE_PADDING else None,
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate,
)

Using cuda device
max_seq_len  802
not all sequences are of the same length


## Model

In [12]:
# Instantiate the CNN classifier for the detected number of classes and the
# vocabulary built above, then move it to the selected device.
model = CNN(
    number_of_classes=NUM_CLASSES,
    vocab_size=len(vocabulary),
    embedding_dim=100,
    input_len=nn_input_len,
    device=device,
)
model = model.to(device)

## Training

In [13]:
# Train on train_loader for EPOCHS epochs; fit() prints accuracy and average loss per epoch.
model.fit(train_loader, epochs=EPOCHS)

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 92.5%, Avg loss: 0.186597 

Epoch 1
Train metrics: 
 Accuracy: 93.5%, Avg loss: 0.163755 

Epoch 2
Train metrics: 
 Accuracy: 93.7%, Avg loss: 0.158953 

Epoch 3
Train metrics: 
 Accuracy: 93.8%, Avg loss: 0.156300 

Epoch 4
Train metrics: 
 Accuracy: 93.9%, Avg loss: 0.154156 

Epoch 5
Train metrics: 
 Accuracy: 93.9%, Avg loss: 0.152681 

Epoch 6
Train metrics: 
 Accuracy: 94.0%, Avg loss: 0.151357 

Epoch 7
Train metrics: 
 Accuracy: 94.0%, Avg loss: 0.150706 

Epoch 8
Train metrics: 
 Accuracy: 94.0%, Avg loss: 0.149611 

Epoch 9
Train metrics: 
 Accuracy: 94.0%, Avg loss: 0.148992 



## Testing

In [14]:
# Evaluate on the test split, reusing the collate function built for training so
# test batches are tokenised (and padded, if enabled) the same way.
test_dset = get_dataset(DATASET, 'test')
# Do not shuffle at evaluation time: ordering brings no benefit here, and a fixed
# order keeps the batch composition identical across runs.
test_loader = DataLoader(test_dset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

# model.test() prints the test metrics and returns (accuracy, F1 score).
acc, f1 = model.test(test_loader)
acc, f1

Test metrics: 
 Accuracy: 0.933395, F1 score: 0.933395, Avg loss: 0.163150 



(0.9333945558193129, 0.9333945558193129)