In [None]:
# Importing libraries

import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Fetch the IMDB movie-review archive and unpack it.
# cache_dir='.' with an empty cache_subdir places the download in the current working directory.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

print(dataset)


./aclImdb_v1


In [None]:
# The extracted "aclImdb" folder sits in the same directory as the downloaded file
dataset_dir = os.path.join(
    os.path.dirname(dataset),
    'aclImdb',
)
print(dataset_dir)

./aclImdb


In [None]:
# Show the top-level entries of the extracted "aclImdb" directory
os.listdir(path=dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'train', 'test', 'README']

In [None]:
# Path to the training portion of the dataset, followed by a listing of its entries
train_dir = os.path.join(
    dataset_dir,
    'train',
)
os.listdir(path=train_dir)

['unsupBow.feat',
 'urls_pos.txt',
 'neg',
 'labeledBow.feat',
 'urls_unsup.txt',
 'urls_neg.txt',
 'pos',
 'unsup']

In [None]:
# Print one review from the "pos" directory to see what the raw text looks like.
# (The file name is fixed, so the same review is shown on every run.)

# Pass each path component separately so os.path.join picks the separator
sample_file = os.path.join(train_dir, 'pos', '1181_9.txt')
# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open(sample_file, encoding='utf-8') as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [None]:
# "text_dataset_from_directory" expects a directory tree with one sub-directory per class:
#
#   main_directory/
#   ...class_a/
#   ......a_text_1.txt
#   ......a_text_2.txt
#   ...class_b/
#   ......b_text_1.txt
#   ......b_text_2.txt
#
# Only "pos" and "neg" are wanted as classes, so remove the "unsup" directory tree
# (the directory itself plus all of its sub-directories and files).
remove_dir = os.path.join(train_dir, 'unsup')  # path of the directory to delete
# Guard the removal so re-running this cell after "unsup" is already gone does not raise
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)


In [None]:
batch_size = 32
seed = 42

# Training split: 80% of the examples under aclImdb/train
# (validation_split=0.2 reserves the other 20% for the 'validation' subset)
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    validation_split=0.2,
    subset='training',
    seed=seed,
    batch_size=batch_size,
)
# Number of batches in the training split
print(len(raw_train_ds))

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
625


In [None]:
# Take the first batch of the training split and print every review in it with its label.

for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor to NumPy once and walk the actual batch contents,
  # rather than assuming the batch holds exactly 32 examples.
  for review, label in zip(text_batch.numpy(), label_batch.numpy()):
    print("Review", review)
    print("Label", label)

Review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label 0
Review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [None]:
# Show which class name each integer label stands for
class_names = raw_train_ds.class_names
print("Label 0 corresponds to", class_names[0])
print("Label 1 corresponds to", class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


In [None]:
# Copy the remaining 20% of training examples to "raw_val_ds" (validation split).
# The same seed and validation_split as the training split are used so the two subsets do not overlap.
# NOTE: the result must not be stored in raw_train_ds. Doing so replaces the training
# split with the validation subset for every later cell (adapt, map, fit).
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size = batch_size,
    validation_split = 0.2,
    subset = 'validation', seed = seed)
print(len(raw_val_ds))

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
157


In [None]:
# Validation split: the 20% of aclImdb/train that is not part of the training subset.
# Passing the same seed as the training split keeps the two subsets from overlapping.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Test split: load all examples under aclImdb/test into "raw_test_ds"
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='aclImdb/test',
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [None]:
# Text clean-up used as the `standardize` step of the vectorization layer
def custom_standardization(input_data):
  """Lowercase the text, replace '<br />' with a space, and strip punctuation."""
  # Character class matching any single character from string.punctuation
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  # regex_replace(input, pattern, rewrite)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  # Rewriting matches to the empty string removes the punctuation
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
# Configure the layer that converts review text into integer token sequences
max_features = 10000  # cap on the vocabulary size (passed as max_tokens)
sequence_length = 250  # length of the integer sequence produced for each review

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [None]:
# Build a text-only dataset (labels dropped), then call adapt() to fit the layer's vocabulary on it
train_text = raw_train_ds.map(lambda text, label: text)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  """Return (vectorize_layer applied to text, label); the label passes through unchanged."""
  # Add a trailing axis to the text tensor before handing it to the layer
  expanded = tf.expand_dims(text, -1)
  return vectorize_layer(expanded), label

In [None]:
# Pull one batch of reviews and labels from the dataset:
# iter() builds an iterator over the batches, next() returns the first one it yields
text_batch, label_batch = next(iter(raw_train_ds))
# Look only at the first review of that batch
first_review = text_batch[0]
first_label = label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))


Review tf.Tensor(b"A very well made film set in early '60s communist Yugoslavia. The five young actors who are the teenagers at the center of the story give strong, sincere and emotionally deep performances. A clear depiction of how the natural trust and naivete inherent in teens can be easily manipulated and how that impacted the rest of their lives. Highly recommended.", shape=(), dtype=string)
Label pos
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   3,   52,   70,   90,   19,  258,    8,  397, 1668, 4948,    1,
           2,  692,  184,  153,   34,   23,    2, 2626,   30,    2, 3034,
           5,    2,   61,  198,  580, 5859,    4, 1665, 1007,  364,    3,
         669, 3025,    5,   87,    2, 1149, 1650,    4,    1, 5985,    8,
        2580,   68,   27,  768,    1,    4,   87,   12,    1,    2,  322,
           5,   66,  476,  519, 1148,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [None]:
# Print the tokens stored at two vocabulary indices, plus the vocabulary size.
# The vocabulary is fetched once and reused for all three lookups.
vocabulary = vectorize_layer.get_vocabulary()
print("1287 ---> ", vocabulary[1287])
print(" 313 ---> ", vocabulary[313])
print('Vocabulary size: {}'.format(len(vocabulary)))

1287 --->  likely
 313 --->  poor
Vocabulary size: 10000


In [None]:
# Apply vectorize_text to the train, validation and test splits (in that order)
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
# Let tf.data choose the prefetch buffer size
AUTOTUNE = tf.data.AUTOTUNE

# Cache every split and prefetch from it
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)
test_ds = test_ds.cache().prefetch(AUTOTUNE)

In [None]:
embedding_dim = 16  # passed as the second argument to layers.Embedding when the model is built

In [None]:
# Layer stack: embedding -> dropout -> average pooling over the sequence -> dropout -> single-unit dense output
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [None]:
# from_logits=True tells the loss that the model outputs raw logits, NOT probabilities
# (the final Dense(1) layer is created without an activation). A logit of 0 corresponds to a
# probability of 0.5, so BinaryAccuracy uses threshold=0.0: any output > 0 is predicted as class 1.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              # `metrics` is documented as a list of metrics, so pass one explicitly
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

In [None]:
# Train on train_ds for `epochs` epochs, passing val_ds as the validation data
epochs = 10
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the test split and report both returned values
loss, accuracy = model.evaluate(x=test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.4611881971359253
Accuracy:  0.8242800235748291
