# Sentiment Analysis

In [33]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow_datasets as tfds
from collections import Counter

## Loading the dataset

In [2]:
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data()

In [3]:
x_train.shape # 25000 movie reviews. The list contains integer that reference to a word

(25000,)

In [4]:
x_train[0][:5] # First 5 words of the first review

[1, 14, 22, 16, 43]

In [5]:
word_index = keras.datasets.imdb.get_word_index() # Basically the look up table for the words - key are the words and the values the ids

In [6]:
id_to_word = {key+3: word for word, key in word_index.items()} # Key are now the ids and values the words

In [7]:
list(id_to_word.items())[:3] # First three key value pairs

[(34704, 'fawn'), (52009, 'tsukino'), (52010, 'nunnery')]

In [8]:
# The dataset holds the numbers 1,2 and 3 for nlp tags

In [9]:
for key, value in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[key] = value

In [10]:
id_to_word[0]

'<pad>'

In [12]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

2021-09-19 12:33:22.117934: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "Not found: Could not locate the credentials file.". Retrieving token from GCE failed with "Failed precondition: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/daniel/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-train.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-test.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-unsupervised.tfrecord...:   0%|          | 0/50000 [00:00<?, ? examples/s]

[1mDataset imdb_reviews downloaded and prepared to /home/daniel/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


2021-09-19 12:34:15.996012: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 12:34:16.024379: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 12:34:16.024998: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 12:34:16.026170: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [14]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/home/daniel/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitI

In [16]:
train_size = info.splits["train"].num_examples

## Reading the Datasets

In [30]:
for x_batch, y_batch in datasets["train"].take(4):
    print("Text: {}".format(x_batch))
    print("Label: {}".format(y_batch))
    print("________")

Text: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Label: 0
________
Text: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rub

2021-09-19 12:43:19.684018: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Some Preprocessing (copied and pasted)

In [18]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch # Default value is padding all tensor will be made to the same length and filled with padding. NOw we don't have a RaggedTensor anymore

## Constructing Vocabular

In [42]:
Counter("HELLO MY GUY")

Counter({'H': 1,
         'E': 1,
         'L': 2,
         'O': 1,
         ' ': 2,
         'M': 1,
         'Y': 2,
         'G': 1,
         'U': 1})

In [34]:
vocabulary = Counter()

In [43]:
for x_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    print(x_batch)
    break
    for review in x_batch:
        vocabulary.update(list(review.numpy()))

tf.Tensor(
[[b'This' b'was' b'an' ... b'<pad>' b'<pad>' b'<pad>']
 [b'I' b'have' b'been' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Mann' b'photographs' b'the' ... b'<pad>' b'<pad>' b'<pad>']
 ...
 [b'This' b'movie' b'never' ... b'went' b'st' b'<pad>']
 [b'Mike' b'Brady' b'Michael' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Honestly' b'Barbra' b'I' ... b'<pad>' b'<pad>' b'<pad>']], shape=(32, 60), dtype=string)


In [41]:
for x_batch, y_batch in datasets["train"].batch(32).take(4):
    for review in x_batch:
        print(review)
        print("____-----____")
    

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
____-----____
tf.Tensor(b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell as