# Sentiment Analysis

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow_datasets as tfds
from collections import Counter
import time
import os

## Loading the dataset

In [2]:
# Load the IMDB reviews bundled with Keras; each review is already encoded as a list of integer word ids.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data()

In [3]:
# Shapes of the training reviews and of the training labels, one per line
print(x_train.shape, y_train.shape, sep="\n")

(25000,)
(25000,)


In [4]:
x_train[2][:10] # first index selects the review, the slice keeps its first 10 word ids

[1, 14, 47, 8, 30, 31, 7, 4, 249, 108]

In [5]:
# Each integer represents a word according to a word index (a mapping word -> integer id)
word_index = keras.datasets.imdb.get_word_index()

In [6]:
# Ids 0-2 are reserved for the special tokens <pad>, <sos> and <unk>,
# so every id coming from the word index is shifted up by 3.
tags = ["<pad>", "<sos>", "<unk>"]
word_index_dict = dict((id_ + 3, word) for word, id_ in word_index.items())
for i, tag in enumerate(tags):
    word_index_dict[i] = tag

In [7]:
# Example - decode the review at index 2 back into readable text
" ".join(word_index_dict[token_id] for token_id in x_train[2])

"<sos> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had earnt working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how embarrasing this is to watch save yourself an hour a bit of your life"

## Loading the dataset from TensorFlow Datasets
Loaded this way, the data arrives directly as `tf.data.Dataset` objects.

In [8]:
import tensorflow_datasets as tfds

In [9]:
# Load the raw-text IMDB reviews from TensorFlow Datasets; with as_supervised=True each example is a (text, label) pair
datasets, info = tfds.load("imdb_reviews", as_supervised = True, with_info = True)

2021-10-18 00:20:33.335555: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-18 00:20:33.341101: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-18 00:20:33.341664: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-18 00:20:33.342416: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [10]:
# Show the dataset metadata (description, features, splits)
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/home/daniel/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitI

In [11]:
# info.splits["train"] is a SplitInfo object, not a number; the number of
# training examples is stored in its num_examples attribute.
train_size = info.splits["train"].num_examples

In [12]:
def preprocess(X_batch, y_batch):
    """Clean a batch of raw reviews and split it into padded word tokens.

    Steps, in order: keep only the first 300 characters of each review
    (characters, not words), replace ``<br />`` tags with a space, replace
    everything that is not a letter or an apostrophe with a space, split on
    whitespace, and pad the resulting token lists with ``b"<pad>"`` so that
    all reviews of the batch have the same length.

    Args:
        X_batch: String tensor with the review texts of one batch.
        y_batch: Labels of the batch; returned unchanged.

    Returns:
        Tuple ``(tokens, y_batch)`` where ``tokens`` is a dense string tensor.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Splitting gives a ragged tensor because reviews contain different numbers of words.
    ragged_tokens = tf.strings.split(letters_only)
    # Convert the ragged tensor to a regular one by filling short reviews with <pad>.
    return ragged_tokens.to_tensor(default_value = b"<pad>"), y_batch

In [13]:
# Peek at a single (review, label) example from the training split
for x, y in datasets["train"].take(1):
    # x is the movie review; y is the class (zero means negative, one means positive)
    print(x, "\n", y, sep="\n")

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)


tf.Tensor(0, shape=(), dtype=int64)


2021-10-18 00:20:33.933573: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-18 00:20:33.977664: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [14]:
# Group the training examples into batches of 16 reviews
train_dataset = datasets["train"].batch(16)

In [15]:
# cache() returns a new dataset instead of modifying the existing one, so the
# result must be assigned back; otherwise the cache is never used.
train_dataset = train_dataset.cache()
train_dataset

<CacheDataset shapes: ((None,), (None,)), types: (tf.string, tf.int64)>

In [16]:
# Apply the text clean-up and tokenisation to every batch
train_dataset = train_dataset.map(preprocess)

## Finding the most frequent words

In [17]:
from collections import Counter

In [18]:
# Counts how often each token occurs in the training set
vocabulary = Counter()

In [19]:
# Walk over all preprocessed batches and count every token of every review
for token_batch, label_batch in train_dataset:
    for review_tokens in token_batch.numpy():
        vocabulary.update(review_tokens)

In [20]:
# The ten most frequent tokens together with their counts
vocabulary.most_common(10)

[(b'<pad>', 185093),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431),
 (b'to', 27707),
 (b'I', 27019),
 (b'is', 25719),
 (b'in', 18966),
 (b'this', 18490)]

## Truncating our Vocabulary

In [21]:
# Number of most frequent tokens that are kept in the vocabulary
vocab_size = 10000

In [22]:
# Keep only the vocab_size most frequent tokens, most frequent first
truncated_vocab = [token for token, _count in vocabulary.most_common(vocab_size)]

### Creating a lookup table for our words

A lookup table is a normal key/value mapping (word -> id). If a word is missing from the table, it is assigned to one of `num_oov_buckets` additional out-of-vocabulary ids instead.

In [23]:
# Keys (the kept tokens) and values (their position in the truncated vocabulary) for the lookup table
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype = tf.int64)

In [24]:
# Initializer that pairs every token in `words` with the id at the same position in `word_ids`
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

In [25]:
num_oov_buckets = 1000 # Number of extra ids reserved for out-of-vocabulary words; the total number of ids is therefore vocab_size + num_oov_buckets

In [26]:
# Word -> id table built from the vocabulary, with num_oov_buckets ids set aside for unknown words
lookup_table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [27]:
# Plain-Python illustration of the id lookup: known words map to their position
# in the vocabulary, unknown words are shown with the placeholder id vocab_size.
word_to_id = {word: index for index, word in enumerate(truncated_vocab)}
for word in b"This movie was faaaaaantastic".split():
    # Use the default argument of get(): `get(word) or vocab_size` would wrongly
    # report vocab_size for the word stored at index 0, because 0 is falsy.
    print(word_to_id.get(word, vocab_size))

22
12
11
10000


### Function for encoding words
#### Transforming the words in the dataset to integers

In [28]:
def encode_words(x_batch, y_batch):
    """Replace every word token of the batch by its integer id from ``lookup_table``.

    Args:
        x_batch: Batch of word tokens to look up.
        y_batch: Labels of the batch; returned unchanged.

    Returns:
        Tuple ``(token_ids, y_batch)``.
    """
    token_ids = lookup_table.lookup(x_batch)
    return token_ids, y_batch

In [29]:
# Convert the word tokens of every batch into integer ids
train_dataset = train_dataset.map(encode_words)

In [30]:
# prefetch() returns a new dataset; assign it back so that training actually
# iterates over the prefetching dataset.
train_dataset = train_dataset.prefetch(1)
train_dataset

<PrefetchDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int64)>

### Building the model 

Questions. 

1. TODO: The input shape `[None]` looks strange - why is it needed?
2. TODO: Can we transform the output `y` into something more probabilistic?

In [31]:
def get_run_logdir(root_logdir: str = "./logs_sentiment") -> str:
    """Return a time-stamped log directory path for one training run.

    Args:
        root_logdir: Directory under which the run directory is placed.
            Defaults to "./logs_sentiment".

    Returns:
        ``root_logdir`` joined with a run id of the form
        ``run_<year>_<month>_<day>-<hour>-<minute>-<second>`` built from the
        current time.
    """
    run_id = time.strftime("run_%Y_%m_%d-%H-%M-%S")
    return os.path.join(root_logdir, run_id)

In [32]:
# Dimension of the embedding vector learned for every word id
embed_size = 128

In [33]:
# Embedding -> two stacked GRU layers -> single sigmoid unit for the binary sentiment
model = keras.models.Sequential()
# One embedding row per possible id: the vocabulary plus the out-of-vocabulary buckets
model.add(keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape = [None]))
# The first GRU returns the whole sequence so that the second GRU can consume it
model.add(keras.layers.GRU(128, return_sequences = True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation = keras.activations.sigmoid))

In [34]:
# Log every training run into its own time-stamped sub-directory of
# ./logs_sentiment (via get_run_logdir) instead of writing all runs into the
# shared root directory, so that separate runs stay distinguishable.
tb_callback = keras.callbacks.TensorBoard(get_run_logdir(), update_freq= 1)

2021-10-18 00:20:39.753884: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-10-18 00:20:39.753910: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-10-18 00:20:39.753941: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2021-10-18 00:20:39.754375: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64
2021-10-18 00:20:39.855374: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2021-10-18 00:20:39.855518: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


In [35]:
# Checkpoint callback that writes the model to ./checkpoint_sentiment/checkpoint.h5
checkpoint_cb = keras.callbacks.ModelCheckpoint("./checkpoint_sentiment/checkpoint.h5")

In [36]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric
model.compile(loss=keras.losses.binary_crossentropy, optimizer = "adam", metrics = ["accuracy"])

In [37]:
# Print the layer overview with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1408000   
_________________________________________________________________
gru (GRU)                    (None, None, 128)         99072     
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,606,273
Trainable params: 1,606,273
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Show an example of a generated run directory name
get_run_logdir()

'./logs_sentiment/run_2021_10_18-00-20-39'

In [39]:
# Start TensorBoard inside the notebook, pointed at the log directory
%load_ext tensorboard
%tensorboard --logdir=./logs_sentiment --port=6006

# Train for 5 epochs with the TensorBoard and checkpoint callbacks attached
model.fit(train_dataset, epochs = 5, callbacks = [tb_callback, checkpoint_cb])

## With TensorFlow Hub Modules

In [63]:
# Load the raw-text dataset again so that this section starts from the unprocessed reviews
datasets, info = tfds.load("imdb_reviews", as_supervised = True, with_info = True)

In [64]:
# info.splits["train"] is a SplitInfo object, not a number; the number of
# training examples is stored in its num_examples attribute.
train_size = info.splits["train"].num_examples

In [65]:
# Batches of 16 raw review strings; no manual tokenisation is applied in this section
train_dataset = datasets["train"].batch(16)

In [66]:
# cache() returns a new dataset instead of modifying the existing one, so the
# result must be assigned back; otherwise the cache is never used.
train_dataset = train_dataset.cache()
train_dataset

<CacheDataset shapes: ((None,), (None,)), types: (tf.string, tf.int64)>

In [67]:
# Prefetch one batch ahead; the returned dataset is assigned back so it is the one used for training
train_dataset = train_dataset.prefetch(1)

In [69]:
import tensorflow_hub as hub

In [84]:
# TF Hub text module (takes raw strings as input) followed by a small dense classifier
model = keras.models.Sequential()
model.add(hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2",
                         input_shape=[], dtype=tf.string))
# Hidden stack: every Dense layer is followed by batch normalisation
for hidden_units in (100, 150, 100):
    model.add(keras.layers.Dense(hidden_units, activation = "relu"))
    model.add(keras.layers.BatchNormalization())
# Single sigmoid unit for the binary sentiment
model.add(keras.layers.Dense(1, activation = "sigmoid"))

In [85]:
# Same loss, optimizer and metric as for the GRU model above
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

In [86]:
# Train for 5 epochs on the batched raw-text dataset
model.fit(train_dataset, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa3a6965c10>