In [6]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import os
import re
import shutil
import string

In [7]:
# Fetch the IMDB review archive into the current directory ('.') and untar it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
print(dataset)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
.\aclImdb_v1.tar.gz


In [10]:
# Folder that holds the downloaded archive; the 'aclImdb' data folder sits inside it.
download_root = os.path.dirname(dataset)
print(download_root)
dataset_dir = os.path.join(download_root, 'aclImdb')
print(dataset_dir)

.
.\aclImdb


In [11]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and platform.
sorted(os.listdir(dataset_dir))


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [12]:
# Path of the training split inside the extracted dataset folder.
train_dir = os.path.join(dataset_dir, 'train')
print(train_dir)
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and platform.
sorted(os.listdir(train_dir))

.\aclImdb\train


['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [14]:
                                                    # Load the dataset
# Removing the additional 'unsup' folder so that only the 'neg' and 'pos' class folders remain in train/
remove_dir = os.path.join(train_dir, "unsup")
# Only delete when the folder is still there: re-running this cell after the
# folder has already been removed would otherwise raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)


In [17]:
# Read from train_dir (computed earlier from the download location) instead of
# a second hard-coded copy of the path, so the two cannot drift apart.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    # Keep 80% of the files for training; the validation loader has to use the
    # same validation_split and seed so that the two subsets do not overlap.
    validation_split=0.2,
    subset='training',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [19]:
# Creating a validation dataset using remaining 5000 reviews from train dataset
# Read from train_dir (computed earlier from the download location) instead of
# a second hard-coded copy of the path, so the two cannot drift apart.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    # Same validation_split and seed as the training loader, so this subset is
    # exactly the 20% of files that the training subset left out.
    validation_split=0.2,
    subset='validation',
    seed=42)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [21]:
# Creating a test dataset
# Build the test path from dataset_dir (computed earlier from the download
# location) instead of hard-coding it, so it follows the extracted dataset.
test_dir = os.path.join(dataset_dir, 'test')
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(test_dir)

Found 25000 files belonging to 2 classes.


In [24]:
# Preparing dataset for training
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Applies three steps in order: lowercase the text, replace every literal
  '<br />' tag with a single space, and delete every character listed in
  string.punctuation.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    The result of the final tf.strings.regex_replace call (the cleaned text).
  """
  # Character class that matches any single punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  # The tag is matched after lowercasing, so only the lowercase form is needed.
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text



In [25]:
# Creating a TextVectorization layer
vectorize_layer = TextVectorization(
    # Upper bound on the vocabulary size.
    max_tokens=10000,
    # Clean each string with the custom function before it is split into tokens.
    standardize=custom_standardization,
    # 'int' output: every token is mapped to its own integer index.
    output_mode='int',
    # Per the Keras docs, each output sequence is padded or truncated to
    # exactly this many values, so every review becomes a length-250 vector.
    output_sequence_length=250)

In [28]:
                        # Using the adapt method to turn strings into tokens and then into integers
# Make a text-only dataset (without labels)
# Keep only the text element of every (text, label) pair.
train_text = raw_train_ds.map(lambda text, label: text)
# adapt is called with training text only (never validation or test text).
vectorize_layer.adapt(train_text)

In [None]:
                        # To see the results of using TextVectorization
def vectorize_text(text, label):
  """Run `text` through vectorize_layer and pass `label` through unchanged.

  Args:
    text: string tensor; a trailing axis is appended before vectorizing.
    label: value returned as-is alongside the vectorized text.

  Returns:
    A (vectorized_text, label) tuple.
  """
  expanded = tf.expand_dims(text, axis=-1)
  return vectorize_layer(expanded), label

# Pull a single batch of reviews and labels from the training dataset,
# then look at its first review before and after vectorization.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
# class_names maps the integer label back to its class folder name.
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))