In [19]:
import logging
import argparse 
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger()
%config Completer.use_jedi = False # make autocompletion works in jupyter

args = argparse.Namespace()
args.data_folder = './data-ignored/imdb/'
args.val_fraction = 0.25
args.vocab_size = 2500
args.small_vocab_size = 250
args.epochs = 50
args.batch_size = 32

Path(args.data_folder).mkdir(parents=True, exist_ok=True)

ds, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True, data_dir=args.data_folder)
train_ds_len= tf.data.experimental.cardinality(ds['train']).numpy()
test_ds_len= tf.data.experimental.cardinality(ds['test']).numpy() 
print(train_ds_len)
for d in ds['train'].take(1):
    print(d)
    
train_dataset = ds['train'].batch(args.batch_size)
val_dataset = ds['test'].batch(args.batch_size).take(int(args.val_fraction * (train_ds_len + test_ds_len)))
test_dataset = ds['test'].batch(args.batch_size).skip(int(args.val_fraction * (train_ds_len + test_ds_len)))

2021-11-17 14:29:41,605 : INFO : No config specified, defaulting to first: imdb_reviews/plain_text
2021-11-17 14:29:41,612 : INFO : Load dataset info from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0
2021-11-17 14:29:41,623 : INFO : Reusing dataset imdb_reviews (./data-ignored/imdb/imdb_reviews/plain_text/1.0.0)
2021-11-17 14:29:41,624 : INFO : Constructing tf.data.Dataset imdb_reviews for split None, from ./data-ignored/imdb/imdb_reviews/plain_text/1.0.0


25000
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


2021-11-17 14:29:41.866263: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Baseline

In [16]:
import functools

@functools.lru_cache(maxsize=10)
def get_encoder(vocab_size=args.vocab_size):
    encoder = TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder

In [60]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def rnn_with_embedding():
    encoder = get_encoder()
    
    model = keras.models.Sequential()
    model.add(encoder)
    model.add(keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True))
    model.add(keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    model.summary()
    
    monitor='val_loss'
    early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=10, mode='auto', restore_best_weights=True, verbose=1)
    reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=3, min_delta=1e-4, mode='auto', verbose=1)
    
    model.fit(train_dataset, epochs=args.epochs, validation_data=val_dataset, callbacks=[early_stopping, reduce_lr_on_plateau])

rnn_with_embedding()

# Epoch 3/50
# 782/782 [======] - 314s 401ms/step - loss: 0.2752 - accuracy: 0.8867 - val_loss: 0.3107 - val_accuracy: 0.8667 - lr: 0.0010

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_6 (Embedding)     (None, None, 64)          160000    
                                                                 
 bidirectional_6 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dense_13 (Dense)            (None, 1)                 65        
                                                                 
Total params: 234,369
Trainable params: 234,369
Non-t

KeyboardInterrupt: 

### Different embeddings, glove, bert, transformer.

### Baseline. Bag of words

In [89]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def baseline_bag_of_words():
    
    class BagOfWords(tf.keras.layers.Layer):
        def __init__(self, vocab_size=args.small_vocab_size):
            super(BagOfWords, self).__init__()
            self.vocab_size = vocab_size
            self.initial_value = tf.zeros(vocab_size)

        def build(self, input_shape):
            super().build(input_shape)

        def call(self, inputs):
            print(inputs)
            print(self.input_shape)
            outputs = tf.Variable(self.initial_value)
            # # for i in inputs:
            # #     print(i)
            # for i in range(self.input_shape[-1]):
            #     ouput_idx = inputs[i]
            #     outputs[ouput_idx].assign(outputs[ouput_idx] + 1) 
            return outputs

    encoder = get_encoder(args.small_vocab_size)
    bag_of_words = BagOfWords(args.small_vocab_size)
    
    model = keras.models.Sequential()
    model.add(encoder)
    model.add(bag_of_words)
    model.summary()
    
    for d in ds['train'].take(1):
        print(model(d[0]))
    # print(encoder.get_vocabulary())
#     model.add(keras.layers.Embedding(
#         input_dim=len(encoder.get_vocabulary()),
#         output_dim=64,
#         # Use masking to handle the variable sequence lengths
#         mask_zero=True))
#     model.add(keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
#     model.add(keras.layers.Dense(64, activation='relu'))
#     model.add(keras.layers.Dense(1, activation='sigmoid'))
    
#     model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
    
#     monitor='val_loss'
#     early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, patience=10, mode='auto', restore_best_weights=True, verbose=1)
#     reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=3, min_delta=1e-4, mode='auto', verbose=1)
    
#     model.fit(train_dataset, epochs=args.epochs, validation_data=val_dataset, callbacks=[early_stopping, reduce_lr_on_plateau])

baseline_bag_of_words()

# Epoch 3/50
# 782/782 [======] - 314s 401ms/step - loss: 0.2752 - accuracy: 0.8867 - val_loss: 0.3107 - val_accuracy: 0.8667 - lr: 0.0010

Tensor("Placeholder:0", shape=(None, None), dtype=int64)


AttributeError: Exception encountered when calling layer "bag_of_words_42" (type BagOfWords).

in user code:

    File "/var/folders/g9/6qklj4h53bv0c1rjnffg7bmw0000gp/T/ipykernel_36013/61537670.py", line 16, in call  *
        print(self.input_shape)
    File "/Users/mkhokhlush/github/ml-experiments/.venv/lib/python3.8/site-packages/keras/engine/base_layer.py", line 2174, in input_shape
        raise AttributeError(f'The layer "{self.name}" has never been called '

    AttributeError: The layer "bag_of_words_42" has never been called and thus has no defined input shape.


Call arguments received:
  • inputs=tf.Tensor(shape=(None, None), dtype=int64)

In [7]:
import tensorflow as tf
# for i in tf.zeros(3):
#     i += 1
#     print(i)
    
t = tf.Variable(tf.zeros(3))
for i in range(3):
    print(t[i])
    t[i].assign(t[i] + 1)
# tt = tf.unstack(t)
# for i in tt:
#     print(t[i])
# t[1].assign(12)
t

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


<tf.Variable 'Variable:0' shape=(3,) dtype=float32, numpy=array([1., 1., 1.], dtype=float32)>

### Bert