In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import os
import re
import shutil
import string

In [3]:
# Download the IMDB review archive into the current directory and unpack it.
# get_file returns a path inside cache_dir/cache_subdir, which is printed below.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
print(dataset)

.\aclImdb_v1.tar.gz


In [5]:
# The extracted 'aclImdb' folder sits in the same directory as the path
# returned by get_file.
download_root = os.path.dirname(dataset)
print(download_root)
dataset_dir = os.path.join(download_root, 'aclImdb')
print(dataset_dir)

.
.\aclImdb


In [6]:
# Show the top-level entries of the extracted dataset folder.
os.listdir(dataset_dir)


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [7]:
# Path of the 'train' sub-folder inside the extracted dataset.
train_dir = os.path.join(dataset_dir, 'train')
print(train_dir)
# Bare last expression so the notebook displays the folder listing.
os.listdir(train_dir)

.\aclImdb\train


['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [8]:
                                                    # Load the dataset
# Remove the additional 'unsup' folder so that only the 'neg' and 'pos'
# sub-folders remain under train_dir before the labelled dataset is loaded.
remove_dir = os.path.join(train_dir, "unsup")
# Guard the removal: without it, re-running this cell (or Restart & Run All
# once the folder is already gone) raises FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)
print(tf.__version__)

2.3.0


In [9]:
# Training split: 80% of the labelled reviews under train_dir.
# Uses train_dir (computed above) instead of a hard-coded relative path so the
# loader always reads the folder that was actually downloaded and cleaned.
# validation_split and seed must match the validation subset so the two
# splits do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset='training',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [10]:
# Creating a validation dataset using remaining 5000 reviews from train dataset.
# Reads from train_dir (not a hard-coded path) with the same validation_split
# and seed as the training subset, so the two splits are complementary.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset='validation',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [11]:
# Creating a test dataset from the 'test' sub-folder of the extracted archive.
# The path is derived from dataset_dir rather than hard-coded, so it follows
# the download location.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'))

Found 25000 files belonging to 2 classes.


In [12]:
# Preparing dataset for training
def custom_standardization(input_data):
  """Normalise raw review text before it is tokenised.

  Lowercases the input, replaces every literal '<br />' tag with a space and
  deletes each character listed in string.punctuation.

  Args:
    input_data: string tensor holding the raw review text.

  Returns:
    String tensor with the cleaned text.
  """
  # Character class matching any single punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  # Replace the HTML line-break tag '<br />' with a space.
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text



In [13]:
# Creating a TextVectorization layer
MAX_TOKENS = 10000       # upper bound on the vocabulary size
SEQUENCE_LENGTH = 250    # number of token ids produced per review

vectorize_layer = TextVectorization(
    # Passing my custom function
    standardize=custom_standardization,
    # Size of vocabulary
    max_tokens=MAX_TOKENS,
    # INT means to create unique integer indices for each token.
    output_mode='int',
    # Pad or truncate every review to exactly SEQUENCE_LENGTH token ids so
    # that all examples have the same shape.
    output_sequence_length=SEQUENCE_LENGTH)

In [14]:
                        # Using the adapt method to turn strings into tokens and then into integers
# Drop the labels: adapt() is given the review text only.
train_text = raw_train_ds.map(lambda text, _label: text)
# Adapt on the training split only, so that nothing from the validation or
# test reviews influences the vocabulary.
vectorize_layer.adapt(train_text)

In [15]:
                        # To see the results of using TextVectorization
def vectorize_text(text, label):
  """Run review text through vectorize_layer and pass the label through.

  Args:
    text: string tensor with the review text.
    label: the label belonging to `text`; returned unchanged.

  Returns:
    Tuple of (output of vectorize_layer, label).
  """
  # Add a trailing axis before handing the text to the layer.
  expanded = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(expanded)
  return token_ids, label

# Take one batch of reviews and labels from the training set and look at its
# first example: raw text, class name, and the vectorized form.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"Having seen most of Ringo Lam's films, I can say that this is his best film to date, and the most unusual. It's a ancient china period piece cranked full of kick-ass martial arts, where the location of an underground lair full of traps and dungeons plays as big a part as any of the characters. The action is fantastic, the story is tense and entertaining, and the set design is truely memorable. Sadly, Burning Paradise has not been made available on DVD and vhs is next-to-impossible to get your mitts on, even if you near the second biggest china-town in North America (like I do). If you can find it, don't pass it up.", shape=(), dtype=string)
Label pos
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[ 253,  105,   88,    5,    1,    1,   94,   10,   68,  131,   12,
          11,    7,   24,  113,   19,    6, 1290,    3,    2,   88, 1603,
          29,    4, 2216, 2674,  840,  411,    1,  374,    5,    1, 1691,
        1741,  114,    2, 1652,  

In [16]:
                        # Using TextVectorization on datasets
# Vectorize every split: each (text, label) pair goes through vectorize_text.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [17]:
                       # Configure the dataset for performance

AUTOTUNE = tf.data.experimental.AUTOTUNE


def _cache_and_prefetch(ds):
  """Return `ds` with cache() applied, then prefetch() with an autotuned buffer."""
  return ds.cache().prefetch(buffer_size=AUTOTUNE)


train_ds = _cache_and_prefetch(train_ds)
val_ds = _cache_and_prefetch(val_ds)
test_ds = _cache_and_prefetch(test_ds)


In [18]:
                        # Create the model
embedding_dim = 16
# 10000 matches max_tokens given to the TextVectorization layer; the extra +1
# is kept from the original code. NOTE(review): confirm it is required.
vocab_size = 10000 + 1

# Embedding -> dropout -> average over the sequence axis -> dropout -> one
# output unit with no activation.
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [19]:
                # Compiling the model
model.compile(
    # The final Dense(1) layer has no activation, so the loss is told that it
    # receives logits.
    loss=losses.BinaryCrossentropy(from_logits=True),
    # Adam optimizer, selected by name.
    optimizer='adam',
    # Fraction of predictions that match the binary labels; the threshold is
    # applied to the model output, so it is 0.0 rather than 0.5.
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

In [21]:
                # Training the model
# Number of passes over the training set.
epochs = 3
# verbose=2 prints one summary line per epoch; the per-epoch metrics are kept
# in `history`.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=2,
)

Epoch 1/3
625/625 - 2s - loss: 0.5482 - binary_accuracy: 0.8008 - val_loss: 0.4982 - val_binary_accuracy: 0.8224
Epoch 2/3
625/625 - 2s - loss: 0.4450 - binary_accuracy: 0.8442 - val_loss: 0.4201 - val_binary_accuracy: 0.8470
Epoch 3/3
625/625 - 2s - loss: 0.3784 - binary_accuracy: 0.8658 - val_loss: 0.3737 - val_binary_accuracy: 0.8610
