inspuired by
https://www.tensorflow.org/tutorials/keras/text_classification_with_hub
https://www.tensorflow.org/tutorials/keras/text_classification
https://developers.google.com/machine-learning/guides/text-classification

In [37]:
import logging
import argparse 
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger()
%config Completer.use_jedi = False # make autocompletion works in jupyter

args = argparse.Namespace()
args.data_folder = './data-ignored/imdb/'
args.val_fraction = 0.25
args.vocab_size = 2500
args.small_vocab_size = 250
args.epochs = 50
args.batch_size = 32

Path(args.data_folder).mkdir(parents=True, exist_ok=True)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

ds, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True, data_dir=args.data_folder)
train_ds_len= tf.data.experimental.cardinality(ds['train']).numpy()
test_ds_len= tf.data.experimental.cardinality(ds['test']).numpy() 
print(train_ds_len)
print(test_ds_len)
for d in ds['train'].take(1):
    print(d)
    
# train_dataset = ds['train'].batch(args.batch_size)
train_dataset = ds['train']
val_dataset = ds['test'].take(int(args.val_fraction * (train_ds_len + test_ds_len)))
test_dataset = ds['test'].skip(int(args.val_fraction * (train_ds_len + test_ds_len)))

2023-07-26 14:08:48,122 : INFO : No config specified, defaulting to config: imdb_reviews/plain_text
2023-07-26 14:08:48,126 : INFO : Load dataset info from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0
2023-07-26 14:08:48,129 : INFO : Reusing dataset imdb_reviews (./data-ignored/imdb/imdb_reviews/plain_text/1.0.0)
2023-07-26 14:08:48,193 : INFO : Constructing tf.data.Dataset imdb_reviews for split None, from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0


Version:  2.13.0
Eager mode:  True
GPU is NOT AVAILABLE
25000
25000
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


2023-07-26 14:08:48.214533: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### 1. Bag of words from scratch

In [50]:
class OneTextPreprocessing(tf.keras.layers.Layer):
    def __init__(self):
        super(OneTextPreprocessing, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        print(f"Type inputs: {type(inputs)}")
        print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            byte_string = string_tensor.numpy()
            print(f"byte_string: {byte_string}")
            processed = ''.join(c for c in byte_string.decode('UTF-8').lower() if c.isalnum() or c == " ")
            print(f"type(processed): {type(processed)}")
            outputs = outputs.write(i, processed)
        stacked_output = outputs.stack()
        print(f"type(stacked_output): {type(stacked_output)}")
        print(f"tf.shape(stacked_output): {tf.shape(stacked_output)}")
        # outputs shape: B, Ragged texts
        return stacked_output

one_text_preprocessing = OneTextPreprocessing()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    one_processed = one_text_preprocessing(d[0])
    print(f"one_processed layer: {one_processed}")

one_model = keras.models.Sequential()
one_model.add(one_text_preprocessing)
for d in train_dataset.batch(2).take(1):
    one_processed = one_model(d[0])
    print(f"one_processed model: {one_processed}")

Type inputs: <class 'tensorflow.python.framework.ops.EagerTensor'>
Inputs shape: (2,)
byte_string: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
type(processed): <class 'str'>
byte_string: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comf

2023-07-26 15:14:13.603376: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-07-26 15:14:13.627926: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [53]:
class TwoTokenizer(tf.keras.layers.Layer):
    def __init__(self):
        super(TwoTokenizer, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        print(f"Type inputs: {type(inputs)}")
        print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        tokens_flat_tensor_array = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        str_len_tensor_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        start_len = 0
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            byte_string = string_tensor.numpy()
            print(f"byte_string: {byte_string}")
            string = byte_string.decode('UTF-8')
            tokens = string.split()
            for iw in tf.range(len(tokens)):
                tokens_flat_tensor_array = tokens_flat_tensor_array.write(start_len + iw, tokens[iw])
            str_len_tensor_array = str_len_tensor_array.write(i, len(tokens))
            start_len += len(tokens)
        
        ragged_tensor = tf.RaggedTensor.from_row_lengths(
                values=tokens_flat_tensor_array.stack(),
                row_lengths=str_len_tensor_array.stack())
        print(f"type(ragged_tensor): {type(ragged_tensor)}")
        print(f"tf.shape(ragged_tensor): {tf.shape(ragged_tensor)}")
        # 
        return ragged_tensor
        
two_tokenizer = TwoTokenizer()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    two_processed = two_tokenizer(d[0])
    print(f"two_processed layer: {two_processed}")

two_model = keras.models.Sequential()
two_model.add(one_text_preprocessing)
two_model.add(two_tokenizer)
for d in train_dataset.batch(2).take(1):
    two_processed = two_model(d[0])
    print(f"two_processed model: {two_processed}")

Type inputs: <class 'tensorflow.python.framework.ops.EagerTensor'>
Inputs shape: (2,)
byte_string: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
byte_string: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having

2023-07-26 15:24:13.599171: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-07-26 15:24:13.655012: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [48]:
class TwoTokenizer(tf.keras.layers.Layer):
    def __init__(self):
        super(TwoTokenizer, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        print(f"Type inputs: {type(inputs)}")
        print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        # str_len_tensor_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            byte_string = string_tensor.numpy()
            print(f"byte_string: {byte_string}")
            string = byte_string.decode('UTF-8')
            tokens = string.split()
            # str_len_tensor_array.write(i, len(tokens))
            tokens_tensor_array = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
            for iw in tf.range(len(tokens)):
                tokens_tensor_array = tokens_tensor_array.write(iw, tokens[iw])
            outputs = outputs.write(i, tokens_tensor_array.stack())
        print(f"type(outputs.stack()): {type(outputs.stack())}")
        return outputs.stack()
        
two_tokenizer = TwoTokenizer()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    two_processed = two_tokenizer(d[0])
    print(f"two_processed layer: {two_processed}")

Type inputs: <class 'tensorflow.python.framework.ops.EagerTensor'>
Inputs shape: (2,)
byte_string: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
byte_string: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having

2023-07-26 14:43:37.394727: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


ValueError: Exception encountered when calling layer 'two_tokenizer_1' (type TwoTokenizer).

Incompatible shape for value ((112,)), expected ((116,))

Call arguments received by layer 'two_tokenizer_1' (type TwoTokenizer):
  • inputs=tf.Tensor(shape=(2,), dtype=string)