# To do:
1. Experiment with `output_mode`, `ngrams`
2. Tune `EMBEDDING_DIM` hyperparameter
3. https://www.tensorflow.org/text/tutorials/classify_text_with_bert


from: https://www.tensorflow.org/tutorials/keras/text_classification#prepare_the_dataset_for_training

# imports

In [160]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses

import pandas as pd

import re
import string

# Download the IMDB movie database

In [6]:
import tensorflow_datasets as tfds

In [13]:
# Source archive for the IMDB movie-review sentiment dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the archive (get_file caches it locally) and unpack the tarball.
dataset = tf.keras.utils.get_file(fname="aclImdb_v1", origin=url, untar=True)

# The extracted 'aclImdb' folder is expected to sit next to the returned path.
archive_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(archive_parent, 'aclImdb')

In [8]:
# Show the top-level entries of the extracted dataset folder.
os.listdir(dataset_dir)

['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [10]:
# Path of the training split; list it to see which sub-folders it contains.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [43]:
# Path of the held-out test split; list it to see which sub-folders it contains.
test_dir = os.path.join(dataset_dir, 'test')
os.listdir(test_dir)

['neg', 'urls_pos.txt', 'urls_neg.txt', 'pos', 'labeledBow.feat']

In [14]:
# 'unsup' is not one of the two sentiment classes ('pos' / 'neg'); remove it so
# that text_dataset_from_directory does not treat it as a third class folder.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the delete so that re-running this cell after the folder is already
# gone does not raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

# Create train, validation, test datasets

In [None]:
batch_size = 32
seed = 42

# Training subset: 80% of the labeled reviews under train_dir.
# The same seed must be used for both subsets so the split does not overlap.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

# Validation subset: the remaining 20% of train_dir.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

# Test set: read from test_dir. The original read train_dir here, which made
# the "test" set identical to the training data and any evaluation on it
# meaningless.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

Look at one batch

In [103]:
# Look at one element (one batch) of the raw_train_ds dataset.
# Converting to a list shows the raw data together with the tensor metadata.
list(raw_train_ds.take(1))

[(<tf.Tensor: shape=(32,), dtype=string, numpy=
  array([b"As far as parody films go, there are few that are worth time and energy. but with a recent resurgence of horrid parodies such as Date Movie and The Comebacks, it is a breath of fresh air to come back and rediscover a truly funny farce like Johnny Dangerously.<br /><br />After his mother has no end of medical problems, little Johnny goes to work for the mob. What fallows is a series of gags, most of which work, there are, however, the occasional flops. But a foreign gangster who can't master the American language (profanity wise, at least), a rival gangster with a penchant for shooting his mouth off (...once!), a younger brother with the D.A. who is out to get Johhny Dangerously, and a hot young starlet hot for his affections have Johnny busy.<br /><br />And the viewer will be busy laughing, for the most part, as every gangster-movie clich\xc3\xa9 is skewered by a talented cast and decent writing.<br /><br />Not perfect by a lon

In [102]:
# Take one element and iterate it as NumPy values: shows only the raw data,
# without the tf.Tensor wrapper/metadata.
list(raw_train_ds.take(1).as_numpy_iterator())

[(array([b"It's boggles the mind how this movie was nominated for seven Oscars and won one. Not because it's abysmal or because given the collective credentials of the creative team behind it really ought to deserve them but because in every category it was nominated Prizzi's Honor disappoints. Some would argue that old Hollywood pioneer John Huston had lost it by this point in his career but I don't buy it. Only the previous year he signed the superb UNDER THE VOLCANO, a dark character study set in Mexico, that ranks among the finest he ever did. Prizzi's Honor on the other hand, a film loaded with star power, good intentions and a decent script, proves to be a major letdown.<br /><br />The overall tone and plot of a gangster falling in love with a female hit-man prefigures the quirky crimedies that caught Hollywood by storm in the early 90's but the script is too convoluted for its own sake, the motivations are off and on the whole the story seems unsure of what exactly it's trying t

# Apply TextVectorization to data

https://keras.io/guides/preprocessing_layers/

In [141]:
# Vocabulary size cap; passed as `max_tokens` to the TextVectorization layer.
MAX_FEATURES = 10000
# Fixed number of token ids per review; passed as `output_sequence_length`.
SEQUENCE_LENGTH = 250

`\s = whitespace (spaces, tabs, newlines)`

`* = zero or more of the preceding character`

`\\s* = zero or more whitespace characters`

`/? = match / 0 or 1 times (i.e. the slash is optional).`

`<br\\s*/?> = <br /> tags`

`[^something]	matches any character except those that [something] denotes; that is, immediately after the leading “[”, the circumflex “^” means “not” applied to all of the rest`

`[x-y]	matches any of the characters from x to y (inclusively) in the ASCII code`

need `\\s` because of this: `.` means any character in regex, but we want to match a literal `"."` - how do we do this? We introduce an escape character `\` that goes with `.` like so `\.` -  `\.` means literally ".". However, regex patterns are written as strings, where `\` itself is an escape character. So we have to escape twice - "\\." means "\." in regex. So whenever you use "\" in a regex written as an ordinary (non-raw) string, you need to actually write "\\". Matching a literal "\" therefore means writing "\\\\" (or `r"\\"` as a raw string). `\` "escapes" the normal function of whatever comes after it. So in Python when you have `\t`, it is interpreted to mean a tab character, not the letter t. The normal `t` interpretation is "escaped". 

In [184]:
def custom_standardization(input_data):
    """Normalize raw review text before it is split into tokens.

    Steps, in order: keep only the first 300 bytes of each string, lowercase,
    replace HTML `<br>` / `<br />` tags with a space, then replace every
    character that is not an ASCII letter or an apostrophe with a space.

    Args:
        input_data: string tensor holding the raw review text.

    Returns:
        A string tensor with the cleaned text.
    """
    text = tf.strings.substr(input_data, 0, 300)  # truncate; default unit is bytes
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, b"<br\\s*/?>", b" ")  # <br /> tags -> space
    text = tf.strings.regex_replace(text, b"[^a-zA-Z']", b" ")  # keep letters and apostrophes only
    # Fix: the original returned the undefined name `final_string`, which
    # raised NameError as soon as the function was called.
    return text

In [185]:
# Bind the layer to `vectorize_layer`: the following cells call
# vectorize_layer.adapt(...) and vectorize_layer(...), but the original cell
# only constructed the layer and discarded it.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=MAX_FEATURES,
    standardize=custom_standardization,
    split='whitespace',
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH, # Only valid in INT mode. Output has shape (batch_size, output_sequence_length) 
)
# Echo the layer so the cell still displays its repr.
vectorize_layer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f9ba90b86d0>

In [186]:
# adapt() builds the vocabulary from text only, so drop the labels first.
train_text = raw_train_ds.map(lambda review, _label: review)
vectorize_layer.adapt(train_text)

## Look at the result of textVectorization layer

In [187]:
def vectorize_text(text, label):
    """Convert review text to token ids, passing the label through unchanged.

    A trailing axis is appended to `text` before it is handed to
    `vectorize_layer`.

    Returns:
        A `(token_ids, label)` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Pull a single batch (32 reviews with their labels) from the dataset and
# inspect its first example before and after vectorization.
text_batch, label_batch = next(iter(raw_train_ds))
first_review = text_batch[0]
first_label = label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"Billy Crystal normally brings the crowd to laughter, but in this movie he and all the rest of them cannot bring any smile on my face.... or perhaps just one. They call it comedy, I say it's a waste of my time.", shape=(), dtype=string)
Label neg
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[1518, 5282, 1659,  907,    2, 2264,    6, 2040,   18,    8,   11,
          17,   27,    3,   30,    2,  351,    5,   93,  541,  717,   97,
        1706,   20,   54,  398,   41,  373,   40,   28,   34,  675,    9,
         220,   10,  131,   29,    4,  408,    5,   54,   58,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
  

Look at an example output. 

In [188]:
# Shape of the token-id tensor for one review ([0] selects the ids, dropping the label).
vectorize_text(first_review, first_label)[0].shape

TensorShape([1, 250])

In [189]:
# Map each token id of the first review back to its vocabulary word.
# Fetch the vocabulary once: the original called get_vocabulary() inside the
# comprehension, rebuilding the whole vocabulary list for every token.
vocabulary = vectorize_layer.get_vocabulary()
token_ids = vectorize_text(first_review, first_label)[0][0]
sample = [vocabulary[token_id] for token_id in token_ids]
sample[:]

['billy',
 'crystal',
 'normally',
 'brings',
 'the',
 'crowd',
 'to',
 'laughter',
 'but',
 'in',
 'this',
 'movie',
 'he',
 'and',
 'all',
 'the',
 'rest',
 'of',
 'them',
 'cannot',
 'bring',
 'any',
 'smile',
 'on',
 'my',
 'face',
 'or',
 'perhaps',
 'just',
 'one',
 'they',
 'call',
 'it',
 'comedy',
 'i',
 'say',
 'its',
 'a',
 'waste',
 'of',
 'my',
 'time',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 

In [190]:
# Vectorize all three splits: each (text, label) batch becomes (token ids, label).
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

# Configure for performance

`.cache()` - keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files. This will save some operations (like file opening and data reading) from being executed during each epoch. IF YOU HAVE MANY SMALL FILES then the file open and read operations stack up, whereas if you only have one file on-disk, then you save yourself reading/opening/mapping.


`.prefetch()` -  overlaps data reading and data processing while training. While training, the next batch is loaded into the input buffer.

In [195]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split after its first pass and prefetch upcoming batches,
# letting tf.data pick the prefetch buffer size.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

# Create model with embedding layer

The Embedding layer can be understood as a lookup table that maps from integer indices (which stand for specific words) to dense vectors (their embeddings).

Once trained, the learned word embeddings will roughly encode similarities between words (as they were learned for the specific problem your model is trained on).

In [213]:
# Length of each learned word vector; used as `output_dim` of the Embedding layer.
embedding_dim = 100 # Hyper-parameter

In [283]:
# Classifier: embed token ids, average the embeddings over the sequence,
# then a single sigmoid unit outputs one value per review.
classifier_layers = [
    layers.Embedding(
        input_dim=MAX_FEATURES + 1,    # vocabulary size, i.e. maximum integer index + 1
        output_dim=embedding_dim,      # dimension of the dense embedding
        input_length=SEQUENCE_LENGTH,
        mask_zero=True,                # inputs are padded with 0s (see the example above)
    ),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid'),
]
model = keras.models.Sequential(classifier_layers)

model.summary()

Model: "sequential_57"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_55 (Embedding)    (None, 250, 100)          1000100   
                                                                 
 dropout_18 (Dropout)        (None, 250, 100)          0         
                                                                 
 global_average_pooling1d_14  (None, 100)              0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_19 (Dropout)        (None, 100)               0         
                                                                 
 dense_104 (Dense)           (None, 1)                 101       
                                                                 
Total params: 1,000,201
Trainable params: 1,000,201
Non-trainable params: 0
___________________________________________

In [284]:
# The model ends in a sigmoid, so the loss receives probabilities
# (from_logits=False).
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=[tf.metrics.BinaryAccuracy()],
)

epochs = 10
# `batch_size` is not passed to fit(): train_ds / val_ds are tf.data datasets
# that were already batched when they were created, and Keras documents that
# batch_size must not be specified for dataset inputs.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
