# Sentiment Analysis

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow_datasets as tfds
from collections import Counter
import time
import os

## Loading the dataset

In [2]:
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data()

In [3]:
x_train.shape # 25,000 movie reviews; each review is a list of integers, and every integer references a word

(25000,)

In [4]:
x_train[0][:5] # First 5 words of the first review

[1, 14, 22, 16, 43]

In [5]:
word_index = keras.datasets.imdb.get_word_index() # Lookup table for the vocabulary: the keys are the words and the values are their integer ids

In [6]:
# Invert the lookup table: the ids become the keys and the words the values.
# Every id is shifted by 3 -- presumably to match load_data(), whose default
# index_from=3 leaves the lowest ids free for special tokens.
id_to_word = {word_id + 3: word for word, word_id in word_index.items()}

In [7]:
list(id_to_word.items())[:3] # First three key value pairs

[(34704, 'fawn'), (52009, 'tsukino'), (52010, 'nunnery')]

In [8]:
# The dataset reserves the ids 0, 1 and 2 for the special NLP tokens <pad>, <sos> and <unk>

In [9]:
# Register the special tokens under ids 0, 1 and 2:
# enumerate() yields the (id, token) pairs that dict.update() consumes.
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [10]:
id_to_word[0]

'<pad>'

In [11]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

2021-09-19 13:35:24.831479: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 13:35:24.837952: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 13:35:24.838560: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 13:35:24.839920: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [12]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/home/daniel/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitI

In [13]:
train_size = info.splits["train"].num_examples

## Reading the Datasets

In [14]:
# Peek at the first four raw examples. The dataset is not batched at this
# point, so every element is a single (review text, label) pair.
for review_text, review_label in datasets["train"].take(4):
    print("Text: {}".format(review_text))
    print("Label: {}".format(review_label))
    print("________")

2021-09-19 13:35:25.447426: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Text: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Label: 0
________
Text: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rub

2021-09-19 13:35:25.500791: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Some Preprocessing (copied and pasted)

In [15]:
def preprocess(X_batch, y_batch, max_chars=300):
    """Turn a batch of raw review strings into a dense, padded tensor of word tokens.

    Args:
        X_batch: batch of review texts (string tensor; assumed to have a batch
            dimension, since the split result is converted with ``to_tensor``).
        y_batch: labels belonging to the reviews; returned unchanged.
        max_chars: length of the prefix kept from every review, measured in
            bytes (the default unit of ``tf.strings.substr``). Defaults to 300.

    Returns:
        A ``(tokens, y_batch)`` tuple, where ``tokens`` is a dense string
        tensor in which shorter reviews are filled up with ``b"<pad>"``.
    """
    # Keep only the beginning of each review to bound the sequence length.
    X_batch = tf.strings.substr(X_batch, 0, max_chars)
    # Replace HTML line breaks (<br>, <br/>, <br />) with a space.
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    # Replace everything that is neither a letter nor an apostrophe with a space.
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    # Split on whitespace; reviews have different word counts, so the result is ragged.
    X_batch = tf.strings.split(X_batch)
    # Convert the ragged result to a regular tensor: every review is padded
    # with "<pad>" up to the length of the longest review in the batch.
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

## Constructing the Vocabulary

In [16]:
Counter(["HELLO", "MY", "GUY"])

Counter({'HELLO': 1, 'MY': 1, 'GUY': 1})

In [17]:
vocabulary = Counter()

In [18]:
# Start from an empty counter so that re-running this cell does not count every word twice.
vocabulary = Counter()
for x_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in x_batch:
        # Counter cannot work with tensors, so each review is first converted
        # into a plain list of byte strings, e.g. [b"word 1", b"word 2", ...].
        vocabulary.update(list(review.numpy()))

In [19]:
vocabulary.most_common(4)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564), (b'of', 33983)]

In [20]:
len(vocabulary) # That seems like a lot; let's only keep the 10,000 most common words

53893

## Filtering Vocabulary

In [21]:
vocab_size = 10000
# The padding token must get id 0, because the Embedding layer is created with
# mask_zero=True. Put it first explicitly instead of relying on "<pad>" being
# the most frequent token, then fill the remaining slots with the most common words.
truncated_vocab = [b"<pad>"] + [
    word for word, _ in vocabulary.most_common() if word != b"<pad>"
][: vocab_size - 1]

In [22]:
words = tf.constant(truncated_vocab)

In [23]:
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)

In [24]:
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids) # Basically creating a lookupTable in TF or dict

In [25]:
num_oov_buckets = 10000

In [26]:
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets) # Known words map to their id; unknown words are hashed into one of the num_oov_buckets extra ids above the vocabulary (see chapter 13)

In [27]:
table.lookup(tf.constant("Hello, my opinion is dqwnfdoidwwqdwqeq".split()))

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([13250,    41,   590,     7, 15613])>

In [28]:
def encode_words(x_batch, y_batch):
    """Replace every word token in the batch by its integer id from `table`.

    The labels are passed through untouched.
    """
    encoded_batch = table.lookup(x_batch)
    return encoded_batch, y_batch

## Building the model

In [29]:

# Input pipeline: batch the raw reviews, tokenize and pad them, map the tokens
# to ids, and prefetch one batch so that data preparation overlaps with training.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [30]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


### Defining the callbacks

In [31]:
root_logdir = "logs"

In [32]:
def get_run_logdir(root_dir=None) -> str:
    """Build a log directory path that is unique per run.

    Args:
        root_dir: directory under which the run folder is placed. When omitted
            (the default), the notebook-level ``root_logdir`` is used.

    Returns:
        ``<root_dir>/run_<YYYY>_<MM>_<DD>-<HH>-<MM>-<SS>``, built from the
        current local time.
    """
    if root_dir is None:
        root_dir = root_logdir
    run_id = time.strftime("run_%Y_%m_%d-%H-%M-%S")
    return os.path.join(root_dir, run_id)

In [33]:
get_run_logdir()

'logs/run_2021_09_19-13-35-30'

In [34]:
# Write TensorBoard logs into a fresh run directory; update_freq=1 logs after every batch.
tensorboard_cb = keras.callbacks.TensorBoard(
    log_dir=get_run_logdir(),
    update_freq=1,
)

2021-09-19 13:35:30.815441: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-09-19 13:35:30.815468: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-09-19 13:35:30.815503: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2021-09-19 13:35:30.816059: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64
2021-09-19 13:35:30.917006: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2021-09-19 13:35:30.917178: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


In [35]:
# Checkpoint callback that saves the model to an HDF5 file during training.
# NOTE(review): the file name looks copied from a Shakespeare text-generation
# notebook; this model classifies sentiment, so the checkpoint may overwrite
# another model's file -- confirm and rename if nothing else loads this path.
checkpoint_cb = keras.callbacks.ModelCheckpoint("shakespeare_text.h5")

In [36]:
embed_size = 128

model = keras.models.Sequential()
# One embedding row per possible id: the truncated vocabulary plus the OOV buckets.
# mask_zero=True (not shown in the book) makes the following layers ignore id 0.
model.add(
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        mask_zero=True,
        input_shape=[None],
    )
)
# The first GRU returns the full sequence so that the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
# Single sigmoid unit for the binary label.
model.add(keras.layers.Dense(1, activation="sigmoid"))


In [37]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [38]:
history = model.fit(train_set, epochs=10, callbacks=[tensorboard_cb, checkpoint_cb])

Epoch 1/10


2021-09-19 13:35:37.377262: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8100


 12/782 [..............................] - ETA: 30s - loss: 0.6939 - accuracy: 0.4792

2021-09-19 13:35:38.765234: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-09-19 13:35:38.765271: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-09-19 13:35:38.765781: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2021-09-19 13:35:38.782499: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-09-19 13:35:38.786780: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 0 callback api events and 0 activity events. 
2021-09-19 13:35:38.788236: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2021-09-19 13:35:38.792324: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/run_2021_09_19-13-35-30/train/plugins

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
