# Bag of words implementation

Try to implement a simple sentiment classifier the way that you would do it in pytorch and see how tf is compared to pytorch. 

inspuired by
https://www.tensorflow.org/tutorials/keras/text_classification_with_hub
https://www.tensorflow.org/tutorials/keras/text_classification
https://developers.google.com/machine-learning/guides/text-classification

In [1]:
# Todo control logging with debug
import logging
import argparse 
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger()
%config Completer.use_jedi = False # make autocompletion works in jupyter

args = argparse.Namespace()
args.data_folder = './data-ignored/imdb/'
args.val_fraction = 0.25
args.vocab_size = 2500
args.small_vocab_size = 250
args.epochs = 50
args.batch_size = 32

Path(args.data_folder).mkdir(parents=True, exist_ok=True)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

ds, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True, data_dir=args.data_folder)
train_ds_len= tf.data.experimental.cardinality(ds['train']).numpy()
test_ds_len= tf.data.experimental.cardinality(ds['test']).numpy() 
print(train_ds_len)
print(test_ds_len)
for d in ds['train'].take(1):
    print(d)
    
# train_dataset = ds['train'].batch(args.batch_size)
train_dataset = ds['train']
val_dataset = ds['test'].take(int(args.val_fraction * (train_ds_len + test_ds_len)))
test_dataset = ds['test'].skip(int(args.val_fraction * (train_ds_len + test_ds_len)))

2023-12-11 17:13:10,822 : INFO : No config specified, defaulting to config: imdb_reviews/plain_text
2023-12-11 17:13:10,824 : INFO : Load dataset info from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0
2023-12-11 17:13:10,826 : INFO : Reusing dataset imdb_reviews (./data-ignored/imdb/imdb_reviews/plain_text/1.0.0)
2023-12-11 17:13:10,875 : INFO : Constructing tf.data.Dataset imdb_reviews for split None, from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0


Version:  2.13.0
Eager mode:  True
GPU is NOT AVAILABLE
25000
25000
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


2023-12-11 17:13:10.896311: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## 1. Bag of words from scratch

In [2]:
class OneTextPreprocessing(tf.keras.layers.Layer):
    def __init__(self):
        super(OneTextPreprocessing, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        # print(f"Type inputs: {type(inputs)}")
        # print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            byte_string = string_tensor.numpy()
            # print(f"byte_string: {byte_string}")
            processed = ''.join(c for c in byte_string.decode('UTF-8').lower() if c.isalnum() or c == " ")
            # print(f"type(processed): {type(processed)}")
            outputs = outputs.write(i, processed)
        stacked_output = outputs.stack()
        # print(f"type(stacked_output): {type(stacked_output)}")
        # print(f"tf.shape(stacked_output): {tf.shape(stacked_output)}")
        # outputs shape: B, Ragged texts
        return stacked_output

one_text_preprocessing = OneTextPreprocessing()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    one_processed = one_text_preprocessing(d[0])
    print(f"one_processed layer: {one_processed}")

one_model = keras.models.Sequential()
one_model.add(one_text_preprocessing)
for d in train_dataset.batch(2).take(1):
    one_processed = one_model(d[0])
    print(f"one_processed model: {one_processed}")

one_processed layer: [b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the columbian rebels were making their cases for revolutions maria conchita alonso appeared phony and her pseudolove affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actors like christopher walkens good name i could barely sit through it'
 b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the sette and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development wa

2023-12-11 17:13:10.943827: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:13:10.967843: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [3]:
class TwoTokenizer(tf.keras.layers.Layer):
    def __init__(self):
        super(TwoTokenizer, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        # print(f"Type inputs: {type(inputs)}")
        # print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        tokens_flat_tensor_array = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        str_len_tensor_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        start_len = 0
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            byte_string = string_tensor.numpy()
            # print(f"byte_string: {byte_string}")
            string = byte_string.decode('UTF-8')
            tokens = string.split()
            for iw in tf.range(len(tokens)):
                tokens_flat_tensor_array = tokens_flat_tensor_array.write(start_len + iw, tokens[iw])
            str_len_tensor_array = str_len_tensor_array.write(i, len(tokens))
            start_len += len(tokens)
        
        ragged_tensor = tf.RaggedTensor.from_row_lengths(
                values=tokens_flat_tensor_array.stack(),
                row_lengths=str_len_tensor_array.stack())
        # print(f"type(ragged_tensor): {type(ragged_tensor)}")
        # print(f"tf.shape(ragged_tensor): {tf.shape(ragged_tensor)}")
        # outputs shape: Ragged tensor: B, Tokens
        return ragged_tensor
        
two_tokenizer = TwoTokenizer()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    two_processed = two_tokenizer(d[0])
    print(f"two_processed layer: {two_processed}")

two_model = keras.models.Sequential()
two_model.add(one_text_preprocessing)
two_model.add(two_tokenizer)
for d in train_dataset.batch(2).take(1):
    two_processed = two_model(d[0])
    print(f"two_processed model: {two_processed}")

2023-12-11 17:13:11.046598: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


two_processed layer: <tf.RaggedTensor [[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie.', b"Don't",
  b'be', b'lured', b'in', b'by', b'Christopher', b'Walken', b'or',
  b'Michael', b'Ironside.', b'Both', b'are', b'great', b'actors,', b'but',
  b'this', b'must', b'simply', b'be', b'their', b'worst', b'role', b'in',
  b'history.', b'Even', b'their', b'great', b'acting', b'could', b'not',
  b'redeem', b'this', b"movie's", b'ridiculous', b'storyline.', b'This',
  b'movie', b'is', b'an', b'early', b'nineties', b'US', b'propaganda',
  b'piece.', b'The', b'most', b'pathetic', b'scenes', b'were', b'those',
  b'when', b'the', b'Columbian', b'rebels', b'were', b'making', b'their',
  b'cases', b'for', b'revolutions.', b'Maria', b'Conchita', b'Alonso',
  b'appeared', b'phony,', b'and', b'her', b'pseudo-love', b'affair',
  b'with', b'Walken', b'was', b'nothing', b'but', b'a', b'pathetic',
  b'emotional', b'plug', b'in', b'a', b'movie', b'that', b'was', b'devoid',
  b'of', b'any', b'rea

2023-12-11 17:13:11.102266: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [4]:
from collections import Counter 
import functools

class ThreeVectorizer(tf.keras.layers.Layer):
    def __init__(self, token_to_int: dict):
        super(ThreeVectorizer, self).__init__()
        self._token_to_int = token_to_int
        self._vocab_size = len(self._token_to_int) + 1
    
    def call(self, inputs):  
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            tokens = inputs[i]
            array_string = tf.TensorArray(dtype=tf.float32, size=self._vocab_size)
            array_string.unstack(tf.zeros(self._vocab_size))
            for token in tokens:
                idx = self._token_to_int.get(token.numpy(), tf.constant(0))
                array_string = array_string.write(idx, array_string.read(idx) + 1.0)
            outputs = outputs.write(i, array_string.stack())
        return outputs.stack()

    @classmethod
    @functools.lru_cache(maxsize=10)
    def from_train_data(cls, train_ds, vocab_size=args.small_vocab_size, batch_size=args.batch_size, take=5):
        _one_text_preprocessing = OneTextPreprocessing()
        _two_tokenizer = TwoTokenizer()
        _preproc_mdl = keras.models.Sequential()
        _preproc_mdl.add(_one_text_preprocessing)
        _preproc_mdl.add(_two_tokenizer)

        _counter = Counter()
        for d in train_ds.batch(batch_size).take(take):
            _processed = _preproc_mdl(d[0])
            for b in _processed:
                _counter.update(b.numpy().tolist())

        # +-1 cause we need space for unknown tokens with 0 index
        _token_dict = {k: tf.cast(i + 1, tf.int32) for i, (k, _) in enumerate(_counter.most_common(vocab_size - 1))}
        return cls(_token_dict)

three_vectorizer = ThreeVectorizer.from_train_data(train_dataset)

three_model = keras.models.Sequential()
three_model.add(one_text_preprocessing)
three_model.add(two_tokenizer)
three_model.add(three_vectorizer)
for d in train_dataset.batch(2).take(1):
    three_processed = three_model(d[0])
    print(f"three_processed model: {three_processed}")

three_model.add(tf.keras.layers.BatchNormalization(axis=1))
for d in train_dataset.batch(2).take(1):
    three_processed = three_model(d[0], training=True) # batch normalization works while training only
    print(f"three_processed model with batch normalization: {three_processed}")

three_model.pop()
three_model.add(tf.keras.layers.LayerNormalization(axis=1))
for d in train_dataset.batch(2).take(1):
    three_processed = three_model(d[0])
    print(f"three_processed model with layer normalization: {three_processed}")

2023-12-11 17:13:14.823473: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:13:15.794995: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:13:15.885284: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

three_processed model: [[43.  2.  1.  2.  1.  0.  1.  3.  5.  1.  2.  2.  0.  1.  0.  1.  3.  0.
   3.  2.  0.  0.  2.  0.  0.  2.  0.  1.  0.  2.  0.  0.  1.  0.  0.  0.
   1.  0.  0.  0.  0.  1.  2.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.
   3.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.
   0.  1.  0.  0.  2.  2.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  2.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  

2023-12-11 17:13:16.026205: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [5]:
four_model = keras.models.Sequential()
four_model.add(one_text_preprocessing)
four_model.add(two_tokenizer)
four_model.add(three_vectorizer)
four_model.add(tf.keras.layers.BatchNormalization(axis=-1))
four_model.add(tf.keras.layers.Dense(64, activation='relu'))
four_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

for d in train_dataset.batch(2).take(1):
    four_processed = four_model(d[0], training=True) # batch normalization works while training only
    print(f"four_processed model with batch normalization: {four_processed}")

four_processed model with batch normalization: [[0.21752222]
 [0.7006044 ]]


2023-12-11 17:13:16.113415: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [6]:
five_model = keras.models.Sequential()
five_model.add(one_text_preprocessing)
five_model.add(two_tokenizer)
five_model.add(three_vectorizer)
five_model.add(tf.keras.layers.BatchNormalization(axis=-1))
five_model.add(tf.keras.layers.Dense(64, activation='relu'))
five_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# instead of model.build() we can just call the model on some data
for d in train_dataset.batch(args.batch_size).take(1):
    five_model(d[0])
five_model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3), loss='binary_crossentropy', 
                   metrics=['accuracy'], run_eagerly=True)
five_model.summary()

ds_train = train_dataset.shuffle(args.batch_size * 10).batch(args.batch_size).prefetch(1)
ds_val = val_dataset.batch(args.batch_size).prefetch(1)
monitor='val_loss'
early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=3, mode='auto', restore_best_weights=True, verbose=1)
reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=2, min_delta=1e-4, mode='auto', verbose=1)

history = five_model.fit(ds_train, validation_data=ds_val, epochs=args.epochs, callbacks=[early_stopping, reduce_lr_on_plateau])

print('Val_accuracy:', max(history.history['val_accuracy']))
print('Val_loss:', min(history.history['val_loss']))
print('Accuracy:', max(history.history['accuracy']))

# 782/782 [==============================] - 2417s 3s/step - loss: 0.4725 - accuracy: 0.7740 - val_loss: 0.4740 - val_accuracy: 0.7734 - lr: 0.0010

2023-12-11 17:13:16.214471: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 one_text_preprocessing (On  multiple                  0         
 eTextPreprocessing)                                             
                                                                 
 two_tokenizer (TwoTokenize  multiple                  0         
 r)                                                              
                                                                 
 three_vectorizer (ThreeVec  multiple                  0         
 torizer)                                                        
                                                                 
 batch_normalization_2 (Bat  multiple                  1000      
 chNormalization)                                                
                                                                 
 dense_2 (Dense)             multiple                 

KeyboardInterrupt: 

### Problems occurred with implementing it
- Using TensorArray and other special TF types to support different sizes of tensors
- When trying to train (model.fit()):
  AttributeError: 'Tensor' object has no attribute 'numpy'
    
    
    Call arguments received by layer 'one_text_preprocessing' (type OneTextPreprocessing):
      • inputs=tf.Tensor(shape=(None,), dtype=string)
    on calling numpy() on a Tensor when model.fit().  During just __call__() on the layers and the model, this didn't happen

  Solution from https://stackoverflow.com/questions/52357542/attributeerror-tensor-object-has-no-attribute-numpy: This can also happen in TF2.0 if your code is wrapped in a @tf.function or inside a Keras layer (my case). Both of those run in graph mode. There's a lot of secretly broken code out of there because behavior differs between eager and graph modes and people are not aware that they're switching contexts, so be careful!
  The issue seems to be that for certain functions during the fitting model.fit() the @tf.function decorator prohibits the execution of functions like tensor.numpy() for performance reasons.
  
    The solution for me was to pass the flag run_eagerly=True to the model.compile() like this model.compile(..., run_eagerly=True)


In [7]:
tl = tf.keras.layers.BatchNormalization(axis=1)

tl(tf.constant([[1, 3], [2, 4]]), training=True)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-0.998006  , -0.99800634],
       [ 0.9980061 ,  0.99800587]], dtype=float32)>

## 2. Bag of words from scratch in graph mode

In [8]:
tf.executing_eagerly()

True

In [9]:
class SixTextPreprocessing(tf.keras.layers.Layer):
    def __init__(self):
        super(SixTextPreprocessing, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        # print(f"Type inputs: {type(inputs)}")
        # print(f"Inputs shape: {inputs.shape}")
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            lower_string = tf.strings.lower(string_tensor)
            # print(lower_string)
            processed = tf.strings.regex_replace(lower_string, "[^\w\s]", "", replace_global=True, name=None)
            # print(processed)
            outputs = outputs.write(i, processed)
        stacked_output = outputs.stack()
        # print(f"type(stacked_output): {type(stacked_output)}")
        # print(f"tf.shape(stacked_output): {tf.shape(stacked_output)}")
        # outputs shape: B, Ragged texts
        return stacked_output

six_text_preprocessing = SixTextPreprocessing()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    six_processed = six_text_preprocessing(d[0])
    print(f"six_processed layer: {six_processed};")

six_model = keras.models.Sequential()
six_model.add(six_text_preprocessing)
for d in train_dataset.batch(2).take(1):
    six_processed = six_model(d[0])
    print(f"six_processed model: {six_processed};")

@tf.function
def run_six_graph(inp):
    six_processed_graph = six_model(inp)
    tf.print(f"six_processed model graph inside tf function: {six_processed_graph};")
    return six_processed_graph

for d in train_dataset.batch(2).take(1):
    six_processed_graph = run_six_graph(d[0])
    print(f"six_processed model graph outside of tf function: {six_processed_graph};")

six_processed layer: [b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the columbian rebels were making their cases for revolutions maria conchita alonso appeared phony and her pseudolove affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actors like christopher walkens good name i could barely sit through it'
 b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the sette and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development wa

2023-12-11 17:14:05.828507: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:14:05.855769: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:14:05.921972: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

In [10]:
class SevenTokenizer(tf.keras.layers.Layer):
    def __init__(self):
        super(SevenTokenizer, self).__init__()
    
    def call(self, inputs):
        # inputs shape: B, Ragged texts
        batch_size = tf.shape(inputs)[0]
        tokens_flat_tensor_array = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)
        str_len_tensor_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        start_len = 0
        for i in tf.range(batch_size):
            string_tensor = inputs[i]
            tokens_ragged_tensor = tf.strings.split(string_tensor, " ")
            for iw in tf.range(tf.size(tokens_ragged_tensor)):
                tokens_flat_tensor_array = tokens_flat_tensor_array.write(start_len + iw, tokens_ragged_tensor[iw])
            str_len_tensor_array = str_len_tensor_array.write(i, tf.size(tokens_ragged_tensor))
            start_len += tf.size(tokens_ragged_tensor)
        
        ragged_tensor = tf.RaggedTensor.from_row_lengths(
                values=tokens_flat_tensor_array.stack(),
                row_lengths=str_len_tensor_array.stack())
        # print(f"type(ragged_tensor): {type(ragged_tensor)}")
        # print(f"tf.shape(ragged_tensor): {tf.shape(ragged_tensor)}")
        # outputs shape: Ragged tensor: B, Tokens
        return ragged_tensor
        
seven_tokenizer = SevenTokenizer()

# for d in train_dataset.batch(args.batch_size).take(1):
for d in train_dataset.batch(2).take(1):
    seven_processed = seven_tokenizer(d[0])
    print(f"seven_processed layer: {seven_processed};")

seven_model = keras.models.Sequential()
seven_model.add(six_text_preprocessing)
seven_model.add(seven_tokenizer)
for d in train_dataset.batch(2).take(1):
    seven_processed = seven_model(d[0])
    print(f"seven_processed model: {seven_processed};")

@tf.function
def run_seven_graph(inp):
    seven_processed_graph = seven_model(inp)
    tf.print(f"seven_processed model graph inside tf function: {seven_processed_graph};")
    return seven_processed_graph

for d in train_dataset.batch(2).take(1):
    seven_processed_graph = run_seven_graph(d[0])
    print(f"seven_processed model graph outside of tf function: {seven_processed_graph};")

2023-12-11 17:14:08.952405: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:14:09.037886: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


seven_processed layer: <tf.RaggedTensor [[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie.', b"Don't",
  b'be', b'lured', b'in', b'by', b'Christopher', b'Walken', b'or',
  b'Michael', b'Ironside.', b'Both', b'are', b'great', b'actors,', b'but',
  b'this', b'must', b'simply', b'be', b'their', b'worst', b'role', b'in',
  b'history.', b'Even', b'their', b'great', b'acting', b'could', b'not',
  b'redeem', b'this', b"movie's", b'ridiculous', b'storyline.', b'This',
  b'movie', b'is', b'an', b'early', b'nineties', b'US', b'propaganda',
  b'piece.', b'The', b'most', b'pathetic', b'scenes', b'were', b'those',
  b'when', b'the', b'Columbian', b'rebels', b'were', b'making', b'their',
  b'cases', b'for', b'revolutions.', b'Maria', b'Conchita', b'Alonso',
  b'appeared', b'phony,', b'and', b'her', b'pseudo-love', b'affair',
  b'with', b'Walken', b'was', b'nothing', b'but', b'a', b'pathetic',
  b'emotional', b'plug', b'in', b'a', b'movie', b'that', b'was', b'devoid',
  b'of', b'any', b'r

2023-12-11 17:14:09.287497: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [11]:
from collections import Counter 
import functools
from tensorflow.lookup import KeyValueTensorInitializer, StaticHashTable

class EightVectorizer(tf.keras.layers.Layer):
    # +-1 cause we need space for unknown tokens with 0 index
    def __init__(self, token_to_int: StaticHashTable):
        super(EightVectorizer, self).__init__()
        self._token_to_int = token_to_int
        self._vocab_size = tf.cast(self._token_to_int.size() + 1, tf.int32)
    
    def call(self, inputs):  
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in tf.range(batch_size):
            tokens = inputs[i]
            array_string = tf.TensorArray(dtype=tf.float32, size=self._vocab_size)
            # array_string = tf.TensorArray(dtype=tf.float32, size=250)
            array_string.unstack(tf.zeros(self._vocab_size))
            # array_string.unstack(tf.zeros(250))
            # for itoken in tf.range(tf.size(
            indexes = self._token_to_int.lookup(tokens)
            for idx in indexes:
                array_string = array_string.write(idx, array_string.read(idx) + 1.0)
            outputs = outputs.write(i, array_string.stack())
        return outputs.stack()

    @classmethod
    @functools.lru_cache(maxsize=10)
    def from_train_data(cls, train_ds, vocab_size=args.small_vocab_size, batch_size=args.batch_size, take=5):
        _six_text_preprocessing = SixTextPreprocessing()
        _seven_tokenizer = SevenTokenizer()
        _preproc_mdl = keras.models.Sequential()
        _preproc_mdl.add(_six_text_preprocessing)
        _preproc_mdl.add(_seven_tokenizer)

        _counter = Counter()
        for d in train_ds.batch(batch_size).take(take):
            _processed = _preproc_mdl(d[0])
            for ragged_tensor in _processed:
                _counter.update(ragged_tensor.numpy().tolist())
        keys_tensor = tf.constant([t for t, _ in _counter.most_common(vocab_size - 1)])
        # print(f"keys_tensor: {keys_tensor}")
        vals_tensor = tf.constant([i for i in range(1, vocab_size)])
        # print(f"vals_tensor: {vals_tensor}")
        _token_table = StaticHashTable(KeyValueTensorInitializer(keys_tensor, vals_tensor), default_value=0)
        return cls(_token_table)

eight_vectorizer = EightVectorizer.from_train_data(train_dataset)
print(eight_vectorizer._vocab_size)

eight_model = keras.models.Sequential()
eight_model.add(six_text_preprocessing)
eight_model.add(seven_tokenizer)
eight_model.add(eight_vectorizer)
for d in train_dataset.batch(2).take(1):
    eight_processed = eight_model(d[0])
    print(f"eight_processed model: {eight_processed};")

eight_model.add(tf.keras.layers.BatchNormalization(axis=1))
for d in train_dataset.batch(2).take(1):
    eight_processed = eight_model(d[0], training=True) # batch normalization works while training only
    print(f"eight_processed model with batch normalization: {eight_processed};")

eight_model.pop()
eight_model.add(tf.keras.layers.LayerNormalization(axis=1))
for d in train_dataset.batch(2).take(1):
    eight_processed = eight_model(d[0])
    print(f"eight_processed model with layer normalization: {eight_processed};")

eight_model.pop()
@tf.function
def run_eight_graph(inp):
    eight_processed_graph = eight_model(inp)
    tf.print(f"eight_processed model graph inside tf function: {eight_processed_graph};")
    return eight_processed_graph

for d in train_dataset.batch(2).take(1):
    eight_processed_graph = run_eight_graph(d[0])
    print(f"eight_processed model graph outside of tf function: {eight_processed_graph};")

2023-12-11 17:14:30.154661: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


tf.Tensor(250, shape=(), dtype=int32)
eight_processed model: [[43.  2.  1.  2.  1.  0.  1.  3.  5.  1.  2.  2.  0.  1.  0.  1.  3.  0.
   3.  2.  0.  0.  2.  0.  0.  2.  0.  1.  0.  0.  2.  0.  0.  1.  0.  0.
   0.  1.  0.  0.  0.  0.  1.  2.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.
   0.  3.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.
   0.  0.  1.  0.  0.  2.  2.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   2.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

2023-12-11 17:14:31.670292: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:14:31.870528: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


eight_processed model with batch normalization: [[ 0.9999199  -0.9997778  -0.9997779   0.         -0.99987507 -0.99987507
   0.          0.99977785  0.9995005   0.         -0.99987507  0.99950033
   0.          0.          0.          0.          0.         -0.99977785
   0.99977785 -0.99800587  0.          0.          0.99950033 -0.99950033
   0.          0.99950033 -0.99950033  0.998006    0.         -0.998006
   0.99950033  0.         -0.998006    0.998006    0.         -0.99950033
   0.          0.998006   -0.998006    0.          0.          0.
   0.          0.99950033  0.          0.          0.         -0.998006
   0.998006   -0.998006    0.          0.998006    0.998006    0.
   0.          0.99977785  0.          0.          0.         -0.998006
   0.          0.998006    0.          0.          0.          0.
   0.          0.          0.          0.          0.99950033  0.
   0.          0.          0.998006    0.          0.          0.99950033
   0.99950033  0.          0

2023-12-11 17:14:31.969658: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-11 17:14:32.059611: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


eight_processed model graph inside tf function: Tensor("sequential_9/eight_vectorizer/TensorArrayV2Stack/TensorListStack:0", shape=(None, 250), dtype=float32);
eight_processed model graph outside of tf function: [[43.  2.  1.  2.  1.  0.  1.  3.  5.  1.  2.  2.  0.  1.  0.  1.  3.  0.
   3.  2.  0.  0.  2.  0.  0.  2.  0.  1.  0.  0.  2.  0.  0.  1.  0.  0.
   0.  1.  0.  0.  0.  0.  1.  2.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.
   0.  3.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.
   0.  0.  1.  0.  0.  2.  2.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   2.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

## Problems faced when implementing the  in graph mode
1. Can use only tf methods and functions, no numpy or python func. If you lack some function in tf, it might be difficult to implement something.
2. To run it graph mode, you need to wrap a model in a tf fucntion and run a tf.function. Can't just enable graph mode to be usef everywhere.
3. Couldn't make it print a tensor value inside tf.function neither with print() nor with tf.print() it returns something like Tensor("sequential_4/six_text_preprocessing_4/TensorArrayV2Stack/TensorListStack:0", shape=(None,), dtype=string) instead of a real value
4. AttributeError: 'Tensor' object has no attribute 'numpy'
5. When self._vocab_size = self._token_to_int.size() + 1 (Tensor int64) got an error AssertionError: Unreachable
    
    
    Call arguments received by layer 'eight_vectorizer_16' (type EightVectorizer):
      • inputs=tf.RaggedTensor(values=Tensor("sequential_39/seven_tokenizer_2/TensorArrayV2Stack/TensorListStack:0", shape=(None,), dtype=string), row_splits=Tensor("sequential_39/seven_tokenizer_2/RaggedFromRowLengths/control_dependency:0", shape=(None,), dtype=int32))
   needed to cast:
   self._vocab_size = tf.cast(self._token_to_int.size() + 1, tf.int32)