In [1]:
import tensorflow as tf
import os
import re
import shutil
import string
import numpy as np

# Download and unpack the IMDB review archive. cache_dir='.' with an empty
# cache_subdir asks Keras to place the files in the current working directory.
# NOTE: the archive name is "aclImdb_v1.tar.gz"; the previous "aclMndb" spelling
# pointed at a file that does not exist on the server.
data = tf.keras.utils.get_file("aclImdb_v1",
                             "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                             untar = True,cache_dir='.',cache_subdir='')

# The archive unpacks into an 'aclImdb' folder that sits next to the returned path.
dataset_dir = os.path.join(os.path.dirname(data),'aclImdb')
train_dir = os.path.join(dataset_dir,'train')

# Drop the 'unsup' subfolder so it is not picked up as an extra class directory
# when labels are inferred from the subdirectories of train_dir.
# The existence check keeps this cell re-runnable once the folder is already gone.
unsup_dir = os.path.join(train_dir,'unsup')
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

batch_size = 128
seed = 42

# Read from the directories derived from the download location instead of a
# hard-coded "aclImdb/..." path, so loading still works if the archive was not
# unpacked into the current working directory.
test_dir = os.path.join(dataset_dir,'test')

# The training and validation subsets must share validation_split and seed so
# that the two calls partition the same file list without overlap.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(train_dir,
                                                     validation_split=0.2,
                                                     batch_size=batch_size,
                                                     subset="training",
                                                     seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(train_dir,
                                                       validation_split=0.2,
                                                       batch_size=batch_size,
                                                       subset="validation",
                                                       seed=seed)

# The held-out test set is loaded whole (no split).
raw_test_ds = tf.keras.utils.text_dataset_from_directory(test_dir,
                                                        batch_size=batch_size)


AUTOTUNE = tf.data.AUTOTUNE

# Apply the same cache-then-prefetch pipeline to each raw split.
train_ds, val_ds, test_ds = (
    raw_split.cache().prefetch(buffer_size=AUTOTUNE)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Steps, in order: lower-case the input, replace every '<br />' tag with a
    single space, then delete every character found in string.punctuation.

    Args:
        input_data: string tensor accepted by the tf.strings ops.

    Returns:
        The result of the final tf.strings.regex_replace call.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text,'<br />',' ')
    return tf.strings.regex_replace(text,punctuation_pattern,'')

# Text -> integer-id layer: vocabulary capped at 10000 tokens, every example
# emitted as a sequence of exactly 250 ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=250,
)

# Build the vocabulary from the training texts only (labels are dropped).
train_text = train_ds.map(lambda text,label : text)
vectorize_layer.adapt(train_text)

In [12]:
# Stack: raw strings -> token ids -> embeddings -> two bidirectional LSTMs -> dense head.
model = tf.keras.Sequential()
model.add(vectorize_layer)
# One embedding row per vocabulary entry; mask_zero=True masks id 0 for downstream layers.
model.add(tf.keras.layers.Embedding(
    input_dim=len(vectorize_layer.get_vocabulary()),
    output_dim=64,
    mask_zero=True,
))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(64,activation="relu"))
# Single unit with no activation: the model emits a raw logit.
model.add(tf.keras.layers.Dense(1))

# from_logits=True matches the activation-free output layer above.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# validation_steps=30 limits each validation pass to 30 batches of val_ds.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1533432e170>

In [17]:
# Two hand-written sentences to sanity-check the trained model.
samples = np.array(['The movie is good','The movie is not good'])

pred = model.predict(samples)
print(pred)
# The model emits raw logits, so the decision threshold is 0:
# scores above 0 become 1, everything else 0.
print((pred > 0).astype(int))

[[ 1.1898861]
 [-1.249563 ]]
[[1]
 [0]]
