<a href="https://colab.research.google.com/github/Nathan-Mekuria-Solomon/ML-practice/blob/main/Nathan-Mekuria-Solomon/ML-Practice/natural_language_processing/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment Analysis 2 <br>

In [2]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the IMDB reviews splits as (text, label) pairs, together with the dataset metadata
datasets, info = tfds.load("imdb_reviews", as_supervised= True, with_info= True)
train_size = info.splits["train"].num_examples
trian_size = train_size  # original (misspelled) name, kept so existing references keep working

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.6JOSB5_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.6JOSB5_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.6JOSB5_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
def preprocess(X_batch, y_batch):
  """Clean a batch of raw review strings and split it into padded word tokens.

  Each review is cut to its first 300 bytes, `<br>` tags and every character
  that is not a letter or an apostrophe are replaced by spaces, the text is
  split on whitespace, and all rows are padded to a common length with
  b"<pad>".

  Args:
    X_batch: batch of review strings (string tensor).
    y_batch: labels of the batch; passed through unchanged.

  Returns:
    A `(tokens, y_batch)` tuple, where `tokens` is the dense tensor of
    word tokens produced from `X_batch`.
  """
  X_batch = tf.strings.substr(X_batch, 0, 300)
  # The cut above can land in the middle of a line-break tag (e.g. "<br /").
  # The full-tag pattern below would miss that fragment and its letters would
  # survive as a bogus "br" token, so blank out a trailing partial tag first.
  X_batch = tf.strings.regex_replace(X_batch, b"<b?r?\\s*/?$", b" ")
  X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
  X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
  X_batch = tf.strings.split(X_batch)
  return X_batch.to_tensor(default_value= b"<pad>"), y_batch

In [None]:
# construct the vocabulary: count every token of the preprocessed training batches
from collections import Counter

vocabulary = Counter()
preprocessed_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in preprocessed_batches:
  for review in X_batch:
    # one review = one row of tokens (padding tokens included)
    review_tokens = list(review.numpy())
    vocabulary.update(review_tokens)

In [None]:
# peek at the three most frequent tokens (no need to sort the whole vocabulary)
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
# truncate the vocabulary: keep only the `voc_size` most frequent tokens,
# ordered from most to least common
voc_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(voc_size)]

In [None]:
# create lookup table: every kept word maps to its position in
# `truncated_vocabulary`; the table also reserves `num_oov_buckets`
# extra ids for words that are not in the vocabulary
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
# sanity check: look up the ids of a whitespace-tokenized sample sentence
sample_tokens = b"This movie is aaaamazing".split()
table.lookup(tf.constant([sample_tokens]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,     7, 10898]])>

In [None]:
# encode words using the vocab table above
def encode_words(X_batch, y_batch):
  """Replace each token of `X_batch` by its id from `table`; labels pass through unchanged."""
  token_ids = table.lookup(X_batch)
  return token_ids, y_batch

In [None]:
# preprocess the training set: batch, tokenize, map tokens to ids, then prefetch
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# model: token ids -> embeddings -> two stacked GRUs -> sigmoid output for the binary label
embed_size = 128
model = tf.keras.models.Sequential([
    # Declare the variable-length sequence input with an explicit Input layer;
    # passing `input_shape` to a layer is deprecated in Keras 3.
    tf.keras.layers.Input(shape= [None]),
    # One embedding row per vocabulary word plus one per OOV bucket.
    tf.keras.layers.Embedding(voc_size + num_oov_buckets, embed_size),
    tf.keras.layers.GRU(128, return_sequences= True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation= "sigmoid")
])
# NOTE(review): padding tokens are not masked here. If `<pad>` is guaranteed to
# map to id 0, `mask_zero= True` on the Embedding layer may be worth trying.

# compile model
model.compile(loss= "binary_crossentropy",
              optimizer= "adam",
              metrics= ["accuracy"])

# fit model
history = model.fit(train_set, epochs= 5)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.5144 - loss: 0.6892
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.7614 - loss: 0.4937
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.8646 - loss: 0.3188
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9344 - loss: 0.1810
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9548 - loss: 0.1270


<keras.src.callbacks.history.History at 0x7ea622f11e10>