In this notebook we will be experimenting multiple models: \
Experiment 0: baseline model (multinomialNB) \
Experiment 1: simple model with an embedding layer \
Experiment 2: same as before but using transfer learning for the embedding layer \
Experiment 3: BERT \
Experiment 4: Openai model \
(note to run experiment 4 you will need a .env file with your own Openai API key)

In [1]:
# Surpress warnings
import os

from datasets.utils import hub

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {‘0’, ‘1’, ‘2’}

import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from tensorflow.keras import layers
import random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# Download dataset
dataset_name = "imdb_reviews/plain_text"
(train_data, test_data), ds_info = tfds.load(dataset_name,
                                              split=("train", "test"),
                                              shuffle_files=True,
                                              as_supervised=True,
                                              with_info=True)

In [3]:
# Check if this is a binary or multi-class classification problem
class_names = ds_info.features["label"].names
class_names

['neg', 'pos']

In [4]:
# Lets get our datasets in a more practical format
train_text = []
train_labels = []
test_text = []
test_labels = []

for train_sample, test_sample in zip(train_data, test_data): # Both datasets have size 25000
    train_text.append(train_sample[0].numpy().decode()) 
    train_labels.append(train_sample[1].numpy())
    test_text.append(test_sample[0].numpy().decode()) 
    test_labels.append(test_sample[1].numpy())

In [5]:
# Explore the dataset
random_index = random.randint(0, len(train_text)-5)
for text, label in zip(train_text[random_index:random_index+5], train_labels[random_index:random_index+5]):
    print(f"Sentiment: {class_names[label]}")
    print(f"Text: {text}\n\n")

Sentiment: pos
Text: I first watched the Walking Tall movies when I was about 8 years old and I thought both Joe Don Baker and Bo Svenson did a great job, they must have anyway because since watching the movies, I have tried to learn as much about the real Sheriff Buford Pusser as I can. All 3 parts of the movie gave me chills and Buford Pusser was a true hero, I only wish he were alive today and that there were more people like him. I would love to thank him for getting rid of all the crime and being so brave. I am very sorry that his family had to go through such horror and pain. My heart goes out to them. So from a 30 year old fan of Sheriff Pusser and of the 3-part Walking Tall movies and the actors that portrayed him, please do not be negative about these movies and actors, they were only trying to let us know what a wonderful man the real Buford Pusser was and what a great family he had. And to all the young people who may have not heard much about Buford, I suggest you watch the

Experiment 0

In [6]:
# Check average length of text data
average_length = round(sum([len(text.split()) for text in train_text]) / len(train_text))
average_length

234

In [7]:
# Prepare data for the baseline model
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_text)
test_vectors = vectorizer.transform(test_text)

In [8]:
# Create a baseline model
baseline_model = MultinomialNB()
baseline_model.fit(train_vectors, train_labels)

In [9]:
# Test model
y_preds = baseline_model.predict(test_vectors)
y_preds

array([1, 1, 0, ..., 0, 1, 1])

In [10]:
# Calculate accuracy
models_results = {} # dict to save all models accuracy metrics
baseline_accuracy = np.mean(y_preds == test_labels)
models_results["baseline"] = baseline_accuracy
baseline_accuracy

0.82956

Experiment 1

In [11]:
# Prepare data for the following experiments
max_vocab_length = 10000
max_seq_length = average_length

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_vocab_length)
tokenizer.fit_on_texts(train_text)

train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_seq_length)

test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_seq_length)

In [12]:
# Print tokenized texts
train_padded

array([[   0,    0,    0, ...,  866,  140,    9],
       [   0,    0,    0, ...,   19,   30,   29],
       [   0,    0,    0, ...,  416,    8, 6109],
       ...,
       [  97,    7,    7, ...,   15,   11,   28],
       [   0,    0,    0, ...,   77, 1289,   22],
       [   5, 1740,    1, ...,   31, 1662,  708]], dtype=int32)

In [13]:
# Create optimized tokenized datasets
train_dataset_tokenized = tf.data.Dataset.from_tensor_slices((train_padded, train_labels))
train_dataset_tokenized = train_dataset_tokenized.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset_tokenized = tf.data.Dataset.from_tensor_slices((test_padded, test_labels))
test_dataset_tokenized = test_dataset_tokenized.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [7]:
# Create optimized non tokenized datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_text, train_labels))
train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_text, test_labels))
test_dataset = test_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [15]:
# Setup mixed_precision
tf.keras.mixed_precision.set_global_policy('mixed_float16')
tf.keras.mixed_precision.global_policy()

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6


<Policy "mixed_float16">

In [15]:
# Create model for experiment 1
model_1 = tf.keras.Sequential([
    layers.Embedding(input_dim=max_vocab_length, output_dim=128, input_length=max_seq_length),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1),
    layers.Activation('sigmoid', dtype=tf.float32)
])

model_1.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy']
)

In [16]:
# Fit model 1
history_1 = model_1.fit(train_dataset,
                    validation_data=test_dataset,
                    validation_steps=int(0.1*len(test_dataset)),
                    epochs=5)

Epoch 1/5
  2/782 [..............................] - ETA: 1:18 - loss: 0.6959 - accuracy: 0.3594 

I0000 00:00:1718046961.600279  172376 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# Evaluate model
accuracy = model_1.evaluate(test_dataset)[1]
models_results["model_1"] = accuracy
accuracy



0.8839600086212158

Experiment 2

In [18]:
# Import USE from tensorflow hub
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                 input_shape=[],
                                 dtype=tf.string,
                                 trainable=False)

In [21]:
# Create model 2
model_2 = tf.keras.Sequential([
    embedding_layer,
    layers.Dense(1),
    layers.Activation('sigmoid', dtype=tf.float32)
])

model_2.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [22]:
# Fit model 2
history_2 = model_2.fit(np.array(train_text), np.array(train_labels),
                        validation_data=(np.array(test_text), np.array(test_labels)),
                        validation_steps=int(0.1*(len(test_text)/32)),
                        epochs=5,
                        batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Evaluate model 2
accuracy_2 = model_2.evaluate(np.array(test_text), np.array(test_labels))
models_results["model_2"] = accuracy_2
accuracy_2



[0.36524835228919983, 0.8482400178909302]

Experiment 3

In [8]:
# Create model with BERT
text_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/4",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

x = tf.keras.layers.Dropout(0.2)(pooled_output)
x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dense(1)(x)
output = tf.keras.layers.Activation("sigmoid", dtype=tf.float32)(x)

model_3 = tf.keras.Model(text_input, output)

In [9]:
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history_3 = model_3.fit(train_dataset,
                        validation_data=test_dataset,
                        validation_steps=int(0.1*len(test_dataset)),
                        epochs=5)

Epoch 1/5


I0000 00:00:1718200384.250002   18648 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# Evaulate model 3
accuracy_3 = model_3.evaluate(test_dataset)[1]
models_results["model_3"] = accuracy_3
accuracy_3



NameError: name 'models_results' is not defined