<a href="https://colab.research.google.com/github/NickSlm/ml1/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

In [2]:
# Load the Keras copy of the IMDB reviews; reviews arrive already encoded as
# sequences of integer word ids (decoded back to text a few cells below).
dataset = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Unpack the two (inputs, labels) pairs returned by load_data().
(X_train, y_train), (X_test, y_test) = dataset

In [4]:
# First ten word ids of the first training review; used as a small decoding example.
test_seq = X_train[0][:10]

In [5]:
# Mapping from each word to its integer id, needed to turn ids back into text.
word_index = keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [6]:
# Invert the word index. Ids in the encoded reviews are shifted by 3 relative
# to word_index, and ids 1 and 2 are the start-of-sequence and
# out-of-vocabulary markers.
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
id_to_word.update({1: "[START]", 2: "[OOV]"})
# Decode the sample sequence back into readable text.
" ".join(id_to_word[token_id] for token_id in test_seq)

'[START] this film was just brilliant casting location scenery story'

In [7]:
# Load the raw-text IMDB reviews from TensorFlow Datasets.
# as_supervised=True makes each split yield (text, label) pairs and
# with_info=True also returns the dataset metadata.
# NOTE: this rebinds `dataset`, replacing the Keras tuple loaded earlier.
dataset, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWVHVNT/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWVHVNT/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWVHVNT/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [8]:
# Inspect the available splits and their element specs.
dataset

{Split('train'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('test'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('unsupervised'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [9]:
# Number of examples in the train and test splits, read from the TFDS metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)

In [20]:
def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into a padded tensor of word tokens.

  Args:
    X_batch: batch of review texts (string tensor).
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    A `(tokens, y_batch)` pair where `tokens` is a dense string tensor in
    which shorter reviews are right-padded with b"<pad>".
  """
  # Keep only the start of each review (300 units; `substr` counts bytes by default).
  truncated = tf.strings.substr(X_batch, 0, 300)
  # Replace HTML line breaks (<br>, <br/>, <br />) with a space.
  without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
  # Anything that is not a letter or an apostrophe becomes a space.
  letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
  # Whitespace split gives a ragged tensor (one row of tokens per review).
  tokens = tf.strings.split(letters_only)
  padded_tokens = tokens.to_tensor(default_value=b"<pad>")
  return padded_tokens, y_batch

In [21]:
from collections import Counter

In [22]:
# Count how often every token occurs across the preprocessed training set.
vocab = Counter()
for X_batch, y_batch in dataset["train"].batch(32).map(preprocess):
  for review in X_batch:
    # Each review is a 1-D tensor of byte-string tokens; the b"<pad>" filler
    # added by preprocess() is counted like any other token.
    vocab.update(review.numpy().tolist())

In [28]:
# Size of the known vocabulary (including the padding token).
n_words = 10000
# The Embedding layer is built with mask_zero=True, so the padding token
# emitted by preprocess() must map to ID 0. Previously this held only because
# b"<pad>" happened to be the most frequent token; pin it to index 0
# explicitly so masking stays correct whatever the token counts are.
pad_token = b"<pad>"
truncated_vocab = [pad_token] + [
    word for word, _ in vocab.most_common() if word != pad_token
][:n_words - 1]

In [39]:
# Build a static lookup table: each vocabulary word maps to its position in
# truncated_vocab, and unknown words are hashed into one of the OOV buckets.
oov_buckets = 1000
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets=oov_buckets)

In [41]:
# Sanity check: words in the vocabulary get their own ids, while the made-up
# word is not in the vocabulary and lands in an OOV bucket (see the last id in the output).
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [44]:
def encode_words(X_batch, y_batch):
  """Replace every token in the batch with its integer id from `table`.

  Args:
    X_batch: batch of string tokens, as produced by `preprocess`.
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    A `(token_ids, y_batch)` pair.
  """
  token_ids = table.lookup(X_batch)
  return token_ids, y_batch

# Training input pipeline: batch -> tokenise and pad -> encode as ids -> prefetch.
train_set = (
    dataset["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [47]:
# Binary sentiment classifier: embedding -> two stacked GRUs -> sigmoid output.
model = keras.models.Sequential()
# One embedding row per vocabulary word plus one per OOV bucket. mask_zero=True
# makes the following layers ignore time steps whose id is 0 (expected to be
# the padding token -- verify against the vocabulary ordering).
model.add(keras.layers.Embedding(
    input_dim=n_words + oov_buckets,
    output_dim=128,
    mask_zero=True,
    input_shape=[None],
))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))
