In [1]:
import logging
import argparse 
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger()
%config Completer.use_jedi = False # make autocompletion works in jupyter

args = argparse.Namespace()
args.data_folder = './data-ignored/imdb/'
args.val_fraction = 0.25
args.vocab_size = 2500
args.small_vocab_size = 250
args.epochs = 50
args.batch_size = 32

Path(args.data_folder).mkdir(parents=True, exist_ok=True)

ds, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True, data_dir=args.data_folder)
train_ds_len= tf.data.experimental.cardinality(ds['train']).numpy()
test_ds_len= tf.data.experimental.cardinality(ds['test']).numpy() 
print(train_ds_len)
for d in ds['train'].take(1):
    print(d)
    
# train_dataset = ds['train'].batch(args.batch_size)
train_dataset = ds['train']
val_dataset = ds['test'].take(int(args.val_fraction * (train_ds_len + test_ds_len)))
test_dataset = ds['test'].skip(int(args.val_fraction * (train_ds_len + test_ds_len)))

2021-12-18 14:26:37,249 : INFO : No config specified, defaulting to first: imdb_reviews/plain_text
2021-12-18 14:26:37,251 : INFO : Load dataset info from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0
2021-12-18 14:26:37,255 : INFO : Reusing dataset imdb_reviews (./data-ignored/imdb/imdb_reviews/plain_text/1.0.0)
2021-12-18 14:26:37,256 : INFO : Constructing tf.data.Dataset imdb_reviews for split None, from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0


25000
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


2021-12-18 14:26:37.258703: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-18 14:26:37.391021: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Baseline

In [2]:
import functools

@functools.lru_cache(maxsize=10)
def get_encoder(vocab_size=args.vocab_size):
    encoder = TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder

In [3]:
print('fff', max([1, 2]))

fff 2


In [None]:
### Baseline. Bag of words. Preprocessing in dataset creation step.

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def baseline_bag_of_words():
    encoder = get_encoder()
    
    # declaring outputs as an input cause you have to declare tf.variable outside of tf function 
    def build_bag_of_words(tokens, label, outputs=tf.Variable(tf.zeros(args.small_vocab_size))):
        # without it, tf saves the last state of the tensor
        outputs.assign(tf.zeros_like(outputs))
        for i in range(len(tokens)):
            output_idx = tokens[i]
            if output_idx >= tf.constant(args.small_vocab_size, dtype=tf.int64):
                output_idx = tf.constant(1, dtype=tf.int64)
            outputs[output_idx].assign(outputs[output_idx] + 1)
        return outputs, label

    ds_train = train_dataset.map(lambda sent, l: (get_encoder()(sent), l)).map(build_bag_of_words).cache().shuffle(10000).batch(args.batch_size)
    ds_val = val_dataset.map(lambda sent, l: (get_encoder()(sent), l)).map(build_bag_of_words).cache().shuffle(10000).batch(args.batch_size)
    # for d in ds_train.take(10):
    #     print(d)
        
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=(250,)))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    model.summary()

    
    monitor='val_loss'
    early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=3, mode='auto', restore_best_weights=True, verbose=1)
    reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=2, min_delta=1e-4, mode='auto', verbose=1)
    
    history = model.fit(ds_train, validation_data=ds_val, epochs=args.epochs, callbacks=[early_stopping, reduce_lr_on_plateau])
    
    print('Val_accuracy:', max(history.history['val_accuracy']))
    print('Val_loss:', min(history.history['val_loss']))
    print('Accuracy:', max(history.history['accuracy']))
    
baseline_bag_of_words()

# val_accuracy: 0.785; val_loss 0.456; accuracy: 0.803

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                16064     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 16,129
Trainable params: 16,129
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50


2021-12-18 14:26:58.443117: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1361 of 10000
2021-12-18 14:27:08.438771: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2704 of 10000
2021-12-18 14:27:18.438015: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 4027 of 10000
2021-12-18 14:27:28.434088: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 5397 of 10000
2021-12-18 14:27:38.447168: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 6733 of 10000
2021-12-18 14:27:48.436308: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 8051 of 10000
2021-12-18 14:27:58.435892: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling 



2021-12-18 14:30:07.129295: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1390 of 10000
2021-12-18 14:30:17.122027: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2832 of 10000
2021-12-18 14:30:27.125266: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 4198 of 10000
2021-12-18 14:30:37.120423: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 5567 of 10000
2021-12-18 14:30:47.121443: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 6924 of 10000
2021-12-18 14:30:57.132319: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 8320 of 10000
2021-12-18 14:31:07.125631: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling 



2021-12-18 14:31:27.507590: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/50
Epoch 00008: early stopping
Val_accuracy: 0.7851999998092651
Val_loss: 0.46305057406425476
Accuracy: 0.8302800059318542


In [9]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def baseline_bag_of_words1():
    
    class BagOfWords(tf.keras.layers.Layer):
        def __init__(self, vocab_size=args.small_vocab_size, batch_size=args.batch_size):
            super(BagOfWords, self).__init__()
            self.vocab_size = vocab_size
            self.batch_size = batch_size

        def build(self, input_shape):
            super().build(input_shape)

        def call(self, inputs):
            print('trace', tf.shape(inputs))
            tf.print('exec', tf.shape(inputs))
            if inputs.shape[-1] == None:
                return tf.constant(np.zeros([32, 250]))
                # return tf.shape(inputs)
            outputs_np = np.zeros([self.batch_size, self.vocab_size])
            if inputs.shape[-1] != None:
                for i in range(inputs.shape[0]):
                    for ii in range(inputs.shape[-1]):
                        ouput_idx = inputs[i][ii]
                        outputs_np[i][ouput_idx] = outputs_np[i][ouput_idx] + 1
            return tf.constant(outputs_np)

    encoder = get_encoder(args.small_vocab_size)
    bag_of_words = BagOfWords(args.small_vocab_size, args.batch_size)
    
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=(None,), dtype=tf.string))
    model.add(encoder)
    model.add(bag_of_words)
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    monitor='val_loss'
    early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=10, mode='auto', restore_best_weights=True, verbose=1)
    reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=3, min_delta=1e-4, mode='auto', verbose=1)
    
    model.fit(train_dataset.batch(32), epochs=args.epochs)

baseline_bag_of_words1()

trace Tensor("bag_of_words_3/Shape:0", shape=(2,), dtype=int32)
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 bag_of_words_3 (BagOfWords)  (32, 250)                0         
                                                                 
 dense_6 (Dense)             (32, 64)                  16064     
                                                                 
 dense_7 (Dense)             (32, 1)                   65        
                                                                 
Total params: 16,129
Trainable params: 16,129
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50




trace Tensor("sequential_4/bag_of_words_3/Shape:0", shape=(2,), dtype=int32)




trace Tensor("sequential_4/bag_of_words_3/Shape:0", shape=(2,), dtype=int32)
exec [32 552]
  1/782 [..............................] - ETA: 6:18 - loss: 0.6931 - accuracy: 0.5312exec [32 707]
exec [32 861]
exec [32 699]
exec [32 626]
exec [32 477]
exec [32 650]
exec [32 424]
exec [32 790]
exec [32 801]
exec [32 645]
exec [32 860]
exec [32 926]
exec [32 495]
exec [32 880]
exec [32 613]
exec [32 582]
exec [32 509]
 18/782 [..............................] - ETA: 2s - loss: 0.6932 - accuracy: 0.5052  exec [32 765]
exec [32 763]
exec [32 550]
exec [32 956]
exec [32 774]
exec [32 426]
exec [32 682]
exec [32 832]
exec [32 768]
exec [32 671]
exec [32 988]
exec [32 883]
exec [32 774]
exec [32 898]
exec [32 950]
exec [32 713]
exec [32 805]
 35/782 [>.............................] - ETA: 2s - loss: 0.6932 - accuracy: 0.4991exec [32 688]
exec [32 620]
exec [32 986]
exec [32 840]
exec [32 807]
exec [32 603]
exec [32 384]
exec [32 760]
exec [32 888]
exec [32 1146]
exec [32 533]
exec [32 857]
exec [32

InvalidArgumentError:  Incompatible shapes: [8,1] vs. [32,1]
	 [[node gradient_tape/binary_crossentropy/logistic_loss/mul/Mul
 (defined at /Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py:464)
]] [Op:__inference_train_function_140635]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/binary_crossentropy/logistic_loss/mul/Mul:
In[0] gradient_tape/binary_crossentropy/logistic_loss/sub/Neg:	
In[1] binary_crossentropy/Cast (defined at /Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/losses.py:1797)

Operation defined at: (most recent call last)
>>>   File "/Users/mkhokhlush/.pyenv/versions/3.8.6/lib/python3.8/runpy.py", line 194, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "/Users/mkhokhlush/.pyenv/versions/3.8.6/lib/python3.8/runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
>>>     app.start()
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
>>>     self.io_loop.start()
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "/Users/mkhokhlush/.pyenv/versions/3.8.6/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
>>>     self._run_once()
>>> 
>>>   File "/Users/mkhokhlush/.pyenv/versions/3.8.6/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
>>>     handle._run()
>>> 
>>>   File "/Users/mkhokhlush/.pyenv/versions/3.8.6/lib/python3.8/asyncio/events.py", line 81, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
>>>     await self.process_one()
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 446, in process_one
>>>     await dispatch(*args)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
>>>     await result
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
>>>     reply_content = await reply_content
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2901, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2947, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3172, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3364, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "/var/folders/g9/6qklj4h53bv0c1rjnffg7bmw0000gp/T/ipykernel_49794/2480421604.py", line 49, in <module>
>>>     baseline_bag_of_words1()
>>> 
>>>   File "/var/folders/g9/6qklj4h53bv0c1rjnffg7bmw0000gp/T/ipykernel_49794/2480421604.py", line 47, in baseline_bag_of_words1
>>>     model.fit(train_dataset.batch(32), epochs=args.epochs)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 816, in train_step
>>>     self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 530, in minimize
>>>     grads_and_vars = self._compute_gradients(
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 583, in _compute_gradients
>>>     grads_and_vars = self._get_gradients(tape, loss, var_list, grad_loss)
>>> 
>>>   File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 464, in _get_gradients
>>>     grads = tape.gradient(loss, var_list, grad_loss)
>>> 

### Rnn with embedding from scratch

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def rnn_with_embedding():
    encoder = get_encoder()
    
    model = keras.models.Sequential()
    model.add(encoder)
    model.add(keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True))
    model.add(keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    model.summary()
    
    monitor='val_loss'
    early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=10, mode='auto', restore_best_weights=True, verbose=1)
    reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=3, min_delta=1e-4, mode='auto', verbose=1)
    
    model.fit(train_dataset, epochs=args.epochs, validation_data=val_dataset, callbacks=[early_stopping, reduce_lr_on_plateau])

if False:
    rnn_with_embedding()

# Epoch 3/50
# 782/782 [======] - 314s 401ms/step - loss: 0.2752 - accuracy: 0.8867 - val_loss: 0.3107 - val_accuracy: 0.8667 - lr: 0.0010

### Different embeddings, glove, bert, transformer. 