In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
from collections import Counter
import shutil

In [2]:
# Directory the dataset lives under, relative to the current working directory.
DIR_PATH = os.path.join("datasets", "acllmdb")
# Path of the compressed dataset archive.
FILE_PATH = os.path.join("datasets", "aclImdb_v1.tar.gz")

In [3]:
# def extract_dataset(file_path):
#     os.makedirs(DIR_PATH, exist_ok=True)
#     shutil.unpack_archive(file_path, DIR_PATH)
# extract_dataset(FILE_PATH)

In [4]:
# Root folder of the dataset: the "aclImdb" directory inside DIR_PATH.
path = Path(DIR_PATH, "aclImdb")
path

WindowsPath('datasets/acllmdb/aclImdb')

In [5]:
def read_files(dir_path):
    """Return the paths of all ``*.txt`` files directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan, as a ``pathlib.Path`` or a plain string.

    Returns:
        A list of file paths as strings, sorted lexicographically. ``glob``
        yields entries in an arbitrary, filesystem-dependent order, so the
        result is sorted to keep any positional slicing of it (for example a
        validation/test split) reproducible across runs and machines.
    """
    return sorted(str(file) for file in Path(dir_path).glob("*.txt"))

# Number of files per class carved off the front of the test folders for validation.
VAL_FILES_PER_CLASS = 5000

train_neg_data_files = read_files(path / "train" / "neg")
train_pos_data_files = read_files(path / "train" / "pos")
test_neg_data_files = read_files(path / "test" / "neg")
test_pos_data_files = read_files(path / "test" / "pos")

# Take the validation files from the full test lists BEFORE truncating those
# lists. Slicing the already-truncated test lists would make the validation set
# a subset of the test set and silently drop the first files of each class.
val_neg_data_files = test_neg_data_files[:VAL_FILES_PER_CLASS]
val_pos_data_files = test_pos_data_files[:VAL_FILES_PER_CLASS]
test_neg_data_files = test_neg_data_files[VAL_FILES_PER_CLASS:]
test_pos_data_files = test_pos_data_files[VAL_FILES_PER_CLASS:]

In [16]:
# One line-oriented text dataset per (split, class) pair; every line of every
# listed file becomes one dataset element.
train_dataset_n = tf.data.TextLineDataset(train_neg_data_files)
train_dataset_p = tf.data.TextLineDataset(train_pos_data_files)
# The test datasets must read the held-out test files, not the training files,
# otherwise any evaluation on them measures performance on data seen in training.
test_dataset_n = tf.data.TextLineDataset(test_neg_data_files)
test_dataset_p = tf.data.TextLineDataset(test_pos_data_files)
val_dataset_n = tf.data.TextLineDataset(val_neg_data_files)
val_dataset_p = tf.data.TextLineDataset(val_pos_data_files)

In [7]:
def merge_datasets(neg_ds, pos_ds, buffer_size=None, batch_size=32):
    """Combine a negative and a positive text dataset into labelled batches.

    Every element of ``neg_ds`` is paired with label 0 and every element of
    ``pos_ds`` with label 1. The negative elements come first, followed by the
    positive ones, unless shuffling is requested.

    Args:
        neg_ds: Dataset whose elements receive label 0.
        pos_ds: Dataset whose elements receive label 1.
        buffer_size: Shuffle buffer size; ``None`` disables shuffling.
        batch_size: Number of ``(text, label)`` pairs per batch.

    Returns:
        The batched dataset with a prefetch depth of one batch.
    """
    def with_label(label):
        # Build the per-element mapper that attaches a fixed label.
        return lambda text: (text, label)

    labelled = neg_ds.map(with_label(0)).concatenate(pos_ds.map(with_label(1)))
    if buffer_size is not None:
        # Shuffle individual examples, i.e. before they are grouped into batches.
        labelled = labelled.shuffle(buffer_size)
    return labelled.batch(batch_size).prefetch(1)

# Only the training stream is shuffled (buffer of 25000 elements); the test and
# validation streams keep their negatives-then-positives order.
train_dataset = merge_datasets(train_dataset_n, train_dataset_p, buffer_size=25000)
test_dataset = merge_datasets(test_dataset_n, test_dataset_p)
val_dataset = merge_datasets(val_dataset_n, val_dataset_p)

In [8]:
class BagOfWords(keras.layers.Layer):
    """Layer that converts sequences of token ids into token-count vectors.

    Each id is one-hot encoded with depth ``n_tokens``, the encodings are summed
    along axis 1, and column 0 of the result is dropped, so the output has
    ``n_tokens - 1`` columns.

    NOTE(review): presumably inputs are (batch, sequence) integer ids and id 0
    is the padding token, which is why its count is discarded. Confirm against
    the vectorizer feeding this layer.

    Args:
        n_tokens: Depth of the one-hot encoding (number of distinct token ids).
        dtype: Layer dtype forwarded to ``keras.layers.Layer``.
        **kwargs: Any other ``keras.layers.Layer`` keyword arguments.
    """

    def __init__(self, n_tokens, dtype=tf.int32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        self.n_tokens = n_tokens

    def call(self, inputs):
        """Return per-example token counts with the column for id 0 removed."""
        one_hot = tf.one_hot(inputs, self.n_tokens)
        # Sum over axis 1 to count occurrences, then drop the column for id 0.
        return tf.reduce_sum(one_hot, axis=1)[:, 1:]

    def get_config(self):
        """Return the layer config, including ``n_tokens``.

        Without this override the extra constructor argument is missing from the
        config, so the layer cannot be rebuilt when a model containing it is
        saved, loaded or cloned.
        """
        config = super().get_config()
        config["n_tokens"] = self.n_tokens
        return config

In [9]:
def compute_mean_embedding(inputs):
    """Sum the vectors along axis 1 and divide by the square root of their count.

    A position counts as a word when its vector along the last axis has at
    least one non-zero component. With ``S`` the sum of the vectors and ``N``
    the number of such positions, the result is ``S / sqrt(N)``, i.e. the mean
    vector scaled by ``sqrt(N)``.

    NOTE(review): assumes ``inputs`` is (batch, sequence, embedding_dim) and
    that padded positions are all-zero vectors. An ``Embedding`` layer with
    ``mask_zero=True`` emits a mask but may still return a non-zero vector for
    id 0. Confirm that padding is really excluded from ``N``.

    Args:
        inputs: Float tensor of embedded sequences.

    Returns:
        Tensor with axis 1 reduced away. Rows with no non-zero vector (``N == 0``)
        yield zeros instead of NaN.
    """
    not_pad = tf.math.count_nonzero(inputs, axis=-1)
    n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    # divide_no_nan returns 0 where the denominator is 0, so an all-padding
    # (or empty) sequence does not inject NaN into the rest of the network.
    return tf.math.divide_no_nan(tf.reduce_sum(inputs, axis=1), sqrt_n_words)

In [10]:
# Size shared by the bag-of-words layer and the text vectorizer's token cap.
VOCAB_SIZE = 1000

# Review texts only (labels dropped) from the first 100 training batches,
# flattened into a single NumPy array used to fit the vectorizer's vocabulary.
sample_batches = train_dataset.take(100).map(lambda text, label: text)
sample = np.concatenate([batch for batch in sample_batches.as_numpy_iterator()])

bag_of_words = BagOfWords(VOCAB_SIZE)
text_vectorization = keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
)

# Learn the vocabulary from the sampled reviews.
text_vectorization.adapt(sample)

In [11]:
# Pipeline: raw strings -> token ids -> 20-d embeddings -> pooled sentence
# vector -> hidden dense layer -> single sigmoid output.
model = keras.models.Sequential()
model.add(text_vectorization)
model.add(keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=20, mask_zero=True))
model.add(keras.layers.Lambda(compute_mean_embedding))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [12]:
# Single sigmoid output with 0/1 labels, hence binary cross-entropy; accuracy is tracked.
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Five epochs over the training batches, evaluated on the validation batches.
model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x227c2712e20>

In [13]:
# A single review wrapped in a one-element string tensor (a batch of one), used
# as a manual spot-check input for the model.
test = tf.constant(["Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."])

In [14]:
# Sigmoid output for the review above; values near 0 correspond to label 0
# (negative) and values near 1 to label 1 (positive).
model.predict(test)



array([[0.27851468]], dtype=float32)

In [15]:
# A second single-review batch; presumably a positive example, judging by the
# variable name.
pos = tf.constant(["If you like adult comedy cartoons, like South Park, then this is nearly a similar format about the small adventures of three teenage girls at Bromwell High. Keisha, Natella and Latrina have given exploding sweets and behaved like bitches, I think Keisha is a good leader. There are also small stories going on with the teachers of the school. There's the idiotic principal, Mr. Bip, the nervous Maths teacher and many others. The cast is also fantastic, Lenny Henry's Gina Yashere, EastEnders Chrissie Watts, Tracy-Ann Oberman, Smack The Pony's Doon Mackichan, Dead Ringers' Mark Perry and Blunder's Nina Conti. I didn't know this came from Canada, but it is very good. Very good!"])
# Predicted probability of the positive class for this review.
model.predict(pos)



array([[0.96673506]], dtype=float32)