In [16]:
import tensorflow as tf
import numpy as np
import string
import re
from tensorflow.keras.layers import TextVectorization

#### Download Dataset

In [2]:
# Fetch the aclImdb_v1 sentiment dataset archive into the working directory,
# then unpack it in place (this creates the `aclImdb/` directory used below).
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  16.4M      0  0:00:04  0:00:04 --:--:-- 17.7M


#### Dataset overview

The `aclImdb` directory contains a `train/` and a `test/` subdirectory, and each of those contains a `pos` and a `neg` directory. Every `pos` or `neg` directory holds many text files, each of which represents one sample (i.e., text only).

In [5]:
# Show the top-level contents of the extracted dataset directory.
!ls ac*/

imdbEr.txt  imdb.vocab	README	test  train


In [6]:
# Show what the training split contains (class folders plus auxiliary files).
!ls ac*/train/

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [9]:
# Print one file from the positive training folder to see a raw sample.
!cat ac*/train/pos/11694_7.txt

Chris Rock stars in this remake of Warren Beatty's Heaven Can Wait (itself a remake of the 1941 film Here Comes Mr. Jordan), a comedy about a man who dies before his time, before he can realize his dreams, and his adventures in his new (albeit temporary) body. In the Beatty version, the protagonist was a backup quarterback for the then-Los Angeles Rams. In Rock's hipper version, our lead character is a struggling young - and decidedly low-talent - standup comedian.<br /><br />It's very funny to see the razor-sharp Rock playing a bad comedian. It's kind of like seeing Tom Hanks play a bad actor. Lance Barton's dream is to play the legendary Apollo Theater on a non-amateur night. But every time he tries out his material, he's booed off the stage lustily - so much so that his nickname becomes "Booie." His jokes are lame, his delivery painful. In short, Lance is everything that the real Chris Rock isn't.<br /><br />Lance is also a bike messenger, and he's riding the streets on his way to t

In [11]:
# Remove the `unsup` folder so that `train/` is left with only the `pos` and
# `neg` sample folders before the dataset is loaded from that directory.
!rm -r ac*/train/unsup

#### Load dataset

In [12]:
# The training and validation subsets are both carved out of "aclImdb/train".
# They must share the same split fraction and seed; the common seed is what
# keeps the two subsets from overlapping.
_split_options = {"validation_split": 0.2, "seed": 42}

train_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train", subset="training", **_split_options
)

val_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train", subset="validation", **_split_options
)

# The test split lives in its own directory and is loaded in full.
test_ds = tf.keras.utils.text_dataset_from_directory("aclImdb/test")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [13]:
# Report how many batches each split yields.
for split_name, split_ds in (
    ("train_ds", train_ds),
    ("val_ds", val_ds),
    ("test_ds", test_ds),
):
    batch_count = tf.data.experimental.cardinality(split_ds)
    print(f"Number of batches in {split_name}: {batch_count}")

Number of batches in train_ds: 625
Number of batches in val_ds: 157
Number of batches in test_ds: 782


#### Preview batches

In [15]:
# Pull a single batch and show its size plus the first three (text, label) pairs.
for text, label in train_ds.take(1):
    print(f"Each batch contains {len(text)} samples")
    for i in range(3):
        sample_text = text.numpy()[i]
        print(f"Text: {sample_text}")
        sample_label = label.numpy()[i]
        print(f"Label: {sample_label}")

Each batch contains 32 samples
Text: b"Having seen most of Ringo Lam's films, I can say that this is his best film to date, and the most unusual. It's a ancient china period piece cranked full of kick-ass martial arts, where the location of an underground lair full of traps and dungeons plays as big a part as any of the characters. The action is fantastic, the story is tense and entertaining, and the set design is truely memorable. Sadly, Burning Paradise has not been made available on DVD and vhs is next-to-impossible to get your mitts on, even if you near the second biggest china-town in North America (like I do). If you can find it, don't pass it up."
Label: 1
Text: b'Caution: May contain spoilers...<br /><br />I\'ve seen this movie 3 times & I\'ve liked it every time. Upon seeing it again, I\'m always reminded of how good it is. An HBO TV movie- very well done like most of their movies are- this would\'ve gotten Oscars for it\'s performances had it been released for general distrib

#### Basic text cleaning

In [17]:
# Number of token ids per sample produced by the vectorization layer
# (passed to it as `output_sequence_length`).
MAX_SEQ_LEN = 256
# Size of each token's embedding vector (the Embedding layer's `output_dim`).
EMBEDDING_SIZE = 300
# Vocabulary cap: used as `max_tokens` for the vectorization layer and as the
# Embedding layer's `input_dim`.
MAX_VOCAB_SIZE = 10000

In [18]:
def clean_text(text):
    """Standardize raw text before it is tokenized.

    Lowercases the input, replaces HTML line-break tags (`<br />`) with a
    single space, and removes every ASCII punctuation character.

    Args:
        text: A string tensor holding the raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    lowered = tf.strings.lower(text)
    without_breaks = tf.strings.regex_replace(lowered, "<br />", " ")
    # `string.punctuation` contains regex metacharacters (`\`, `]`, `^`, `-`).
    # Placed unescaped inside `[...]`, the `\]` pair is read as an escaped `]`,
    # so literal backslashes were never removed. `re.escape` turns every
    # character into a literal so the class matches exactly the punctuation set.
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    return tf.strings.regex_replace(without_breaks, punctuation_class, "")

# Text -> integer-id layer: standardizes with `clean_text`, keeps at most
# MAX_VOCAB_SIZE tokens, and emits sequences of exactly MAX_SEQ_LEN ids.
vectorized_layer = TextVectorization(
    standardize=clean_text,
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=MAX_SEQ_LEN,
    output_mode="int",
)

# Drop the labels so the vocabulary is learned from the training text only.
train_text = train_ds.map(lambda review, _label: review)
vectorized_layer.adapt(train_text)

In [20]:
# Check the size of the vocabulary learned by `adapt`.
vectorized_layer.vocabulary_size()

10000

In [21]:
# Vectorize one sentence to inspect the resulting integer ids and the padding.
print(vectorized_layer("This is a demo sentence."))

tf.Tensor(
[  11    7    4    1 4320    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 

In [24]:
# Cache every split and prefetch upcoming batches with an auto-tuned buffer.
train_ds, val_ds, test_ds = [
    split.cache().prefetch(tf.data.AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

#### Build Model

In [28]:
def build_model():
    """Assemble and compile the 1D-CNN sentiment classifier.

    The model takes raw strings, converts them to integer token ids with the
    notebook-level `vectorized_layer`, embeds them, applies two strided
    `Conv1D` layers followed by global max pooling, and finishes with a dense
    hidden layer, dropout, and a single sigmoid output unit.

    Returns:
        A compiled `tf.keras.Model` using binary cross-entropy loss, the Adam
        optimizer (learning rate 1e-3), and a binary-accuracy metric.
    """
    layers = tf.keras.layers

    raw_text = tf.keras.Input(shape=(1,), dtype="string", name="input_str_layer")
    token_ids = vectorized_layer(raw_text)
    embedded = layers.Embedding(
        input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_SIZE
    )(token_ids)

    # Two strided convolutions shorten the sequence while widening the channels.
    conv_a = layers.Conv1D(
        filters=128, kernel_size=3, strides=2, padding="valid", activation="relu"
    )(embedded)
    conv_b = layers.Conv1D(
        filters=256, kernel_size=6, strides=2, padding="valid", activation="relu"
    )(conv_a)

    # Collapse the time axis, then classify.
    pooled = layers.GlobalMaxPooling1D()(conv_b)
    hidden = layers.Dense(256, activation="relu")(pooled)
    regularized = layers.Dropout(rate=0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(regularized)

    classifier = tf.keras.Model(inputs=raw_text, outputs=probability)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )
    return classifier

In [29]:
# Instantiate the compiled classifier.
model = build_model()

In [31]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_str_layer (InputLayer) [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 300)          3000000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 127, 128)          115328    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 61, 256)           196864    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               65792 

#### Training

In [33]:
# `train_ds` already yields batches (32 samples each, as previewed above), so
# `batch_size` is not passed: Keras documents that it must not be specified
# when the input is a dataset, because the dataset defines the batching.
# The returned History object is kept so the per-epoch metrics can be inspected.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3ecf561dd0>

#### Testing

In [34]:
# Evaluate on the held-out test split; the returned values follow the order of
# `model.metrics_names`.
model.evaluate(test_ds)



[0.46564045548439026, 0.8467599749565125]

In [35]:
# Names for the values returned by `model.evaluate`.
model.metrics_names

['loss', 'binary_accuracy']

#### Inference

In [52]:
# Score a clearly negative review; the model accepts raw strings directly.
test_sentence = tf.constant(["Terrible movie. I hate this so much. No no. ugly so bad"])
predictions = model(test_sentence)

print(f"Text: {test_sentence}")
# The sigmoid output is thresholded at 0.5 to pick the sentiment label.
if predictions >= 0.5:
    sentiment = "Positive"
else:
    sentiment = "Negative"
print(f"Predictions: {predictions} ({sentiment})")

Text: [b'Terrible movie. I hate this so much. No no. ugly so bad']
Predictions: [[0.00513014]] (Negative)


In [53]:
# Score a clearly positive review; the model accepts raw strings directly.
test_sentence = tf.constant(["BEST movie. I love this so much."])
predictions = model(test_sentence)

print(f"Text: {test_sentence}")
# A sigmoid output of at least 0.5 is reported as positive sentiment.
verdict = "Positive" if predictions >= 0.5 else "Negative"
print(f"Predictions: {predictions} ({verdict})")

Text: [b'BEST movie. I love this so much.']
Predictions: [[0.94804955]] (Positive)
