<a href="https://colab.research.google.com/github/ML-Bioinfo-CEITEC/ECCB2021/blob/main/notebooks/Practise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reading data

In [1]:
import urllib.request
import zipfile
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Fetch the zipped dataset from the course repository, then unpack it into the
# current working directory.
archive_url = "https://github.com/ML-Bioinfo-CEITEC/ECCB2021/raw/main/data/intergenic_vs_coding_20000_seqs.zip"
archive_path = "intergenic_vs_coding_20000_seqs.zip"

urllib.request.urlretrieve(archive_url, archive_path)

with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall()

In [3]:
# Number of sequences per batch; shared by the training and validation datasets
# so that changing it here actually changes both loaders.
batch_size = 64

# NOTE(review): the archive is named "..._20000_seqs" but the directories read
# here are "..._10000_seqs"; the recorded output finds the files, so this seems
# to be the name used inside the zip -- confirm if the dataset is re-packaged.

# Training split: each class lives in its own sub-directory, listed explicitly
# in `class_names`.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'intergenic_vs_coding_10000_seqs/train/',
    batch_size=batch_size,
    class_names=["intergenomic", "transcripts"])

# Validation split: same layout and the same class list as the training split.
raw_valid_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'intergenic_vs_coding_10000_seqs/valid/',
    batch_size=batch_size,
    class_names=["intergenomic", "transcripts"])

Found 18335 files belonging to 2 classes.
Found 1665 files belonging to 2 classes.


In [4]:
# Pull a single batch from the training set and show its first
# (sequence, label) pair as a sanity check.
first_batch = next(iter(raw_train_ds))
text_batch, label_batch = first_batch
(text_batch[0], label_batch[0])

(<tf.Tensor: shape=(), dtype=string, numpy=b'TGCCCTAGAGTTCTGGGGGTGGCTACAGACACAACAAACACTGAGACGATGAAACTGACAGTAATTGATTAGTTACTCATACTCAGCCCAGGGGAGGATGCTGCTGCATCTCTCCAGATCTATGTGGGGGTTGCACTCAGGAACAGAAAGAACAAGCAGAGGGTGTGGGAGGCAGGCTTGGGAGTCATCAGAGGGTGGGT'>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [5]:
def char_split(input_data):
    """Split each input string into its individual UTF-8 characters.

    Passed to ``TextVectorization`` as the ``split`` callable so that every
    nucleotide letter becomes its own token.
    """
    return tf.strings.unicode_split(input_data, input_encoding='UTF-8')


# Character-level tokenizer producing integer token ids. No `standardize`
# argument is given, so the layer's default standardization applies.
vectorize_layer = TextVectorization(split=char_split, output_mode='int')

In [6]:
# Keep only the sequences (drop the labels) for fitting the vectorizer.
train_text = raw_train_ds.map(lambda seq, _label: seq)
vectorize_layer.adapt(train_text)

# Pin the token order explicitly so indices are stable across runs.
# NOTE(review): this explicit vocabulary appears to supersede whatever adapt()
# learned above, which would make the adapt() pass redundant -- confirm against
# the TensorFlow version in use before removing it.
nucleotide_vocab = np.asarray(['a', 'c', 't', 'g', 'n'])
vectorize_layer.set_vocabulary(vocabulary=nucleotide_vocab)

In [7]:
# Display the layer's vocabulary to check the token order set in the previous cell.
vectorize_layer.get_vocabulary()

['', '[UNK]', 'a', 'c', 't', 'g', 'n']

In [8]:
def vectorize_text(text, label):
    """Turn raw sequence text into a one-hot encoding with 4 channels.

    Args:
        text: String tensor holding the sequence(s); a trailing axis is added
            before it is passed to ``vectorize_layer``.
        label: Class label(s); returned unchanged.

    Returns:
        A ``(onehot_text, label)`` tuple. For a single scalar string the
        recorded output has shape ``(1, sequence_length, 4)``.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    # The vocabulary is ['', '[UNK]', 'a', 'c', 't', 'g', 'n'], so subtracting 2
    # maps a, c, t, g to 0..3. Padding, unknown and 'n' end up outside 0..3.
    # NOTE(review): tf.one_hot is expected to emit all-zero rows for those
    # out-of-range ids -- confirm this is the intended handling of 'n'.
    shifted_ids = tf.cast(token_ids - 2, 'int64')
    return tf.one_hot(shifted_ids, 4), label

In [9]:
# Show the raw first sample again, for comparison with its encoded form in the next cell.
text_batch[0], label_batch[0]

(<tf.Tensor: shape=(), dtype=string, numpy=b'TGCCCTAGAGTTCTGGGGGTGGCTACAGACACAACAAACACTGAGACGATGAAACTGACAGTAATTGATTAGTTACTCATACTCAGCCCAGGGGAGGATGCTGCTGCATCTCTCCAGATCTATGTGGGGGTTGCACTCAGGAACAGAAAGAACAAGCAGAGGGTGTGGGAGGCAGGCTTGGGAGTCATCAGAGGGTGGGT'>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [10]:
# Encode the single sample to inspect the one-hot output produced by
# vectorize_text.
first_seq = text_batch[0]
first_label = label_batch[0]
vectorize_text(text=first_seq, label=first_label)

(<tf.Tensor: shape=(1, 200, 4), dtype=float32, numpy=
 array([[[0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
 

In [11]:
# Apply the one-hot encoding to every batch of both splits.
train_ds, valid_ds = (
    raw_ds.map(vectorize_text) for raw_ds in (raw_train_ds, raw_valid_ds)
)

## Model & Training

In [12]:
## YOUR WORK