In [6]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import os
import re
import shutil
import string

In [7]:
# Fetch the IMDB review archive and unpack it next to the notebook
# (cache_dir='.' with an empty cache_subdir keeps everything in the
# current working directory).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
print(dataset)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
.\aclImdb_v1.tar.gz


In [10]:
# The extracted 'aclImdb' folder sits in the same directory as the path
# returned by get_file.
parent_dir = os.path.dirname(dataset)
print(parent_dir)
dataset_dir = os.path.join(parent_dir, 'aclImdb')
print(dataset_dir)

.
.\aclImdb


In [11]:
# Show the top-level entries of the extracted archive.
os.listdir(dataset_dir)


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [12]:
# Path of the training split inside the extracted archive.
train_dir = os.path.join(dataset_dir, 'train')
print(train_dir)
# Last expression of the cell, so the listing is rendered as the cell output.
os.listdir(train_dir)

.\aclImdb\train


['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [14]:
                                                    # Load the dataset
# Remove the extra 'unsup' folder so that only the two labelled class
# folders ('neg' and 'pos') remain under the training directory.
remove_dir = os.path.join(train_dir, "unsup")
# Guarded so the cell can be re-run: after the first run the folder is gone
# and an unconditional rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)


In [17]:
# Training subset: 80% of the labelled training reviews.
# Uses train_dir (derived from the download location) instead of a
# hard-coded relative path, so the cell keeps working if the cache
# directory of the download changes.
# The validation subset must be created with the same seed and
# validation_split so the two subsets do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset='training',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [19]:
# Creating a validation dataset using remaining 5000 reviews from train dataset.
# Uses train_dir (derived from the download location) instead of a
# hard-coded relative path. seed and validation_split are identical to the
# training subset so the two subsets are disjoint.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset='validation',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [21]:
# Creating a test dataset from the 'test' split of the extracted archive.
# The path is derived from dataset_dir rather than hard-coded, so it follows
# the download location.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'))

Found 25000 files belonging to 2 classes.


In [24]:
# Preparing dataset for training
def custom_standardization(input_data):
  """Normalise raw review text before it is tokenised.

  Steps, in order:
    1. lower-case the text,
    2. replace every literal ``<br />`` tag with a single space,
    3. delete every character listed in ``string.punctuation``.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    String tensor with the cleaned text.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')



In [25]:
# Creating a TextVectorization layer
max_features = 10000    # size of the vocabulary kept by the layer
sequence_length = 250   # every review is padded or truncated to this many tokens

vectorize_layer = TextVectorization(
    # Custom clean-up applied to each string before tokenisation.
    standardize=custom_standardization,
    # Size of vocabulary
    max_tokens=max_features,
    # 'int' maps each token to a unique integer index.
    output_mode='int',
    # Fixes the output length: shorter reviews are padded, longer ones are
    # truncated, so every example has exactly sequence_length entries.
    output_sequence_length=sequence_length)

In [28]:
                        # Using the adapt method to build the vocabulary that maps tokens to integers
# adapt() only needs the review text, so drop the label from every
# (text, label) pair of the training dataset.
train_text = raw_train_ds.map(lambda text, _label: text)
# Only the training data is used here; validation and test data are not
# passed to adapt().
vectorize_layer.adapt(train_text)

In [30]:
                        # To see the results of using TextVectorization
def vectorize_text(text, label):
  """Apply ``vectorize_layer`` to ``text`` and pass ``label`` through.

  Args:
    text: string tensor with the review text.
    label: label belonging to ``text``; returned unchanged.

  Returns:
    Tuple ``(vectorized_text, label)``.
  """
  # Append a trailing axis before handing the text to the layer.
  expanded = tf.expand_dims(text, axis=-1)
  vectorized = vectorize_layer(expanded)
  return vectorized, label

# Take one batch of reviews and labels from the training dataset and
# inspect its first example, raw and vectorized.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
# class_names translates the integer label back to its folder name.
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'Silent Night, Deadly Night 5 is the very last of the series, and like part 4, it\'s unrelated to the first three except by title and the fact that it\'s a Christmas-themed horror flick.<br /><br />Except to the oblivious, there\'s some obvious things going on here...Mickey Rooney plays a toymaker named Joe Petto and his creepy son\'s name is Pino. Ring a bell, anyone? Now, a little boy named Derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him. Even though it said "don\'t open till Christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself. Inside is a little red ball that sprouts Santa arms and a head, and proceeds to kill dad. Oops, maybe he should have left well-enough alone. Of course Derek is then traumatized by the incident since he watched it from the stairs, but he doesn\'t grow up to be some killer Santa, he just stops talking.<br /><br />T

In [31]:
                        # Using TextVectorization on datasets
# Vectorize the text of all three splits with the same function.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [32]:
                       # Configure the dataset for performance
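    # cache() keeps already-loaded data around for later epochs; prefetch() prepares the next batches while the model is training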
# Let tf.data choose the prefetch buffer size itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# cache() stores the dataset elements once they have been produced;
# prefetch() prepares upcoming elements while the current one is consumed.
# NOTE(review): each name is rebound to its own transformed version, so
# re-running this cell stacks another cache/prefetch on top.
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)
test_ds = test_ds.cache().prefetch(AUTOTUNE)


In [34]:
                        # Create the model
embedding_dim = 16  # length of the vector learned for each token

# Build the network layer by layer.
model = tf.keras.Sequential()
# NOTE(review): 10000 presumably mirrors max_tokens of the vectorization
# layer; confirm the reason for the extra +1 slot.
model.add(layers.Embedding(10000 + 1, embedding_dim))
model.add(layers.Dropout(0.2))
# Average over the sequence axis to get one vector per review.
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
# Single output unit without activation, i.e. the model emits a raw logit.
model.add(layers.Dense(1))

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [38]:
                # Compiling the model
model.compile(
    # Adam: an adaptive-learning-rate gradient-descent optimizer.
    optimizer='adam',
    # Two classes (pos / neg). from_logits=True tells the loss that it
    # receives raw scores rather than probabilities.
    loss=losses.BinaryCrossentropy(from_logits=True),
    # Fraction of predictions matching the binary labels; a logit of 0.0
    # corresponds to a probability of 0.5, hence threshold=0.0.
    metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [39]:
                # Training the model
# Number of passes over the training data. Without an explicit value
# model.fit() falls back to a single epoch, which leaves the model
# under-trained and gives `history` only one data point per metric.
epochs = 10

# Train on train_ds and evaluate on val_ds at the end of every epoch;
# the per-epoch loss/accuracy values are kept in `history`.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

