# TF CNN Classifier

To run this notebook on another benchmark, use:

```
papermill utils/tf_cnn_classifier.ipynb tf_cnn_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [5]:
# Default benchmark name; the papermill command in the header overrides it via `-p DATASET ...`.
DATASET = 'demo_coding_vs_intergenomic_seqs'
# Only echoed by the print cell in the cells visible here.
VERSION = 0
# Batch size passed to text_dataset_from_directory for the train and test splits.
BATCH_SIZE = 64
# Number of epochs passed to model.fit.
EPOCHS = 10

In [6]:
# Parameters
# Overrides the default DATASET assigned in the configuration cell.
# NOTE(review): presumably the cell papermill injects for `-p DATASET ...` — confirm
# before editing it by hand.
DATASET = "human_ensembl_regulatory"


In [7]:
# Echo the effective run configuration so it is recorded in the executed notebook.
run_config = (DATASET, VERSION, BATCH_SIZE, EPOCHS)
print(*run_config)

human_ensembl_regulatory 0 64 10


## Data download

In [14]:
from pathlib import Path
import tensorflow as tf
import tensorflow_addons as tfa

import numpy as np
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check import is_downloaded, info
from genomic_benchmarks.models.tf import vectorize_layer, binary_f1_score
from genomic_benchmarks.models.tf import basic_cnn_model_v0 as model

# Fetch the benchmark only when is_downloaded reports that it is missing.
already_downloaded = is_downloaded(DATASET)
if not already_downloaded:
    download_dataset(DATASET, local_repo=True)

In [9]:
# Summarise the selected benchmark; the call's return value is the cell output,
# so it must stay the last expression of the cell.
info(DATASET, local_repo=True)

Dataset `human_ensembl_regulatory` has 3 classes: enhancer, ocr, promoter.

The length of genomic intervals ranges from 71 to 802, with average 429.91753643694585 and median 401.0.

Totally 289061 sequences have been found, 231348 for training and 57713 for testing.


Unnamed: 0,train,test
enhancer,85512,21378
ocr,69902,17476
promoter,75934,18859


## TF Dataset object

In [43]:
# Directory the sequences are read from; each split holds one sub-directory per class.
SEQ_PATH = Path.home() / '.genomic_benchmarks' / DATASET
# Use the full directory name (`.name`, not `.stem`) so a class folder containing a
# dot is not truncated, and sort because iterdir() yields entries in arbitrary order:
# sorting keeps the label-index -> class mapping reproducible across runs and machines.
CLASSES = sorted(x.name for x in (SEQ_PATH / 'train').iterdir() if x.is_dir())

# Batched (text, integer label) dataset; label i corresponds to CLASSES[i].
train_dset = tf.keras.preprocessing.text_dataset_from_directory(
    SEQ_PATH / 'train',
    batch_size=BATCH_SIZE,
    class_names=CLASSES)

Found 231348 files belonging to 3 classes.


In [36]:
# Peek at the labels of a single training batch.
sample_batch = next(iter(train_dset))
print(sample_batch[1])

tf.Tensor(
[1 0 2 1 1 1 0 1 0 0 0 1 1 0 2 0 2 2 2 2 2 2 1 2 0 0 0 1 2 1 1 1 2 1 0 2 1
 0 1 1 2 2 1 2 1 1 2 2 0 2 1 1 2 1 1 1 0 1 0 0 0 2 0 0], shape=(64,), dtype=int32)


In [44]:
# One-hot encode the integer labels. The depth follows the number of class
# directories instead of a hard-coded 3, so the notebook also works when it is
# re-run on a benchmark with a different number of classes.
# NOTE: this rebinds `train_dset`, so the cell is not idempotent — re-create the
# dataset before running it a second time.
train_dset = train_dset.map(lambda x, y: (x, tf.one_hot(y, depth=len(CLASSES))))

In [45]:
# Show the labels of one batch again; after the one-hot mapping they are float rows.
first_batch = next(iter(train_dset))
print(first_batch[1])

tf.Tensor(
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]], shape=(64, 3), dtype=float32)


## Text vectorization

In [46]:
# Fit the vectorizer's vocabulary on the raw sequences only (labels are dropped).
sequences_only = train_dset.map(lambda seqs, labels: seqs)
vectorize_layer.adapt(sequences_only)
# Disabled alternative, kept for reference:
# vectorize_layer.set_vocabulary(vocabulary=np.asarray(['a', 'c', 't', 'g', 'n']))
vectorize_layer.get_vocabulary()

['', '[UNK]', 't', 'a', 'g', 'c', 'n']

In [47]:
def vectorize_text(text, label):
    """Turn a batch of raw sequences into shifted token ids; labels pass through.

    A trailing axis is added to `text` before it is fed to `vectorize_layer`.
    The ids are then shifted down by 2: with the vocabulary printed by
    `get_vocabulary()` in this notebook, indices 0 and 1 are '' and '[UNK]',
    so the first real token ends up at 0.
    """
    column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(column)
    return token_ids - 2, label


# Vectorized training set that is fed to model.fit.
train_ds = train_dset.map(vectorize_text)

## Model training

In [57]:
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    GlobalAveragePooling1D,
    MaxPooling1D,
)

# One-hot encode the shifted token ids into 4 channels. tf.one_hot emits an
# all-zero vector for ids outside [0, 4), which covers the negative ids left
# by the -2 shift in vectorize_text as well as any id >= 4.
onehot_layer = tf.keras.layers.Lambda(lambda x: tf.one_hot(tf.cast(x, "int64"), 4))

# NOTE: this rebinds `model`, shadowing the `basic_cnn_model_v0 as model` import
# from the import cell; the later compile/fit/evaluate cells use this Sequential model.
model = tf.keras.Sequential(
    [
        onehot_layer,
        Conv1D(32, kernel_size=8, data_format="channels_last", activation="relu"),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(16, kernel_size=8, data_format="channels_last", activation="relu"),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(4, kernel_size=8, data_format="channels_last", activation="relu"),
        BatchNormalization(),
        MaxPooling1D(),
        Dropout(0.3),
        GlobalAveragePooling1D(),
        # One output unit per class directory instead of a hard-coded 3, so the
        # head matches the one-hot labels when the notebook runs on another benchmark.
        Dense(len(CLASSES), activation="softmax"),
    ]
)

In [58]:
# Multi-class setup: one-hot labels with a softmax output, hence categorical
# cross-entropy. The F1 metric is sized from CLASSES rather than a hard-coded 3
# so it stays consistent with the labels on benchmarks with another class count.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        tf.metrics.CategoricalAccuracy(name='acc'),
        tfa.metrics.F1Score(num_classes=len(CLASSES), average="micro"),
    ],
)

In [59]:
# Train on the vectorized training set for the configured number of epochs;
# the returned object is kept in `history`.
history = model.fit(train_ds, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluation on the test set

In [60]:
# Build the test split with the same class order (CLASSES) as the training split
# so label indices mean the same thing in both.
test_dset = tf.keras.preprocessing.text_dataset_from_directory(
    SEQ_PATH / 'test',
    batch_size=BATCH_SIZE,
    class_names=CLASSES)

# One-hot depth follows the number of classes instead of a hard-coded 3, then the
# sequences go through the same vectorization as the training data.
test_dset = test_dset.map(lambda x, y: (x, tf.one_hot(y, depth=len(CLASSES))))
test_ds = test_dset.map(vectorize_text)

Found 57713 files belonging to 3 classes.


In [61]:
# Score the trained model on the test split. The displayed list is the loss
# followed by the metrics passed to model.compile (categorical accuracy, then micro F1).
model.evaluate(test_ds)



[0.48824387788772583, 0.7956092953681946, 0.7956093549728394]