In [1]:
import tensorflow as tf
import numpy as np
import os
import re
import string
import shutil
import matplotlib.pyplot as plt

# Download and unpack the IMDB review archive into the working directory
# (cache_dir='.' with an empty cache_subdir keeps it next to the notebook).
data = tf.keras.utils.get_file(
    "aclImdb_v1",
    "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    untar=True, cache_dir='.', cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(data), 'aclImdb')
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

# Remove the 'unsup' folder from train/ so it is not picked up as a class
# directory. Guarded so that re-running the cell after the folder has already
# been deleted does not raise FileNotFoundError.
unsup_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

batch_size = 32
seed = 42

# 80/20 train/validation split of train/. Both calls must share the same seed
# and validation_split so the two subsets do not overlap.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    seed=seed,
    subset="training")

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    seed=seed,
    subset='validation')

# Held-out test set. The seed pins the shuffle order so that the sample batch
# predicted at the end of the notebook is the same on every run.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size,
    seed=seed)

def custom_standardization(input_data):
    """Normalise raw review text before tokenisation.

    Applies three steps in order: lowercase the text, replace each literal
    '<br />' tag with a single space, and delete every character listed in
    ``string.punctuation``.

    Args:
        input_data: string tensor of raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

# Vocabulary size cap and the fixed token length of each vectorized review.
max_features = 10000
sequence_length = 250

# Layer that standardizes, tokenizes and maps text to integer token ids,
# producing sequences of exactly `sequence_length` ids.
vectorization_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
train_text = raw_train_ds.map(lambda text, _label: text)
vectorization_layer.adapt(train_text)

def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    A trailing axis is appended to `text` before it is passed through
    `vectorization_layer`; `label` is returned unchanged.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = vectorization_layer(expanded)
    return token_ids, label

AUTOTUNE = tf.data.AUTOTUNE

# Vectorize each split once, cache the result and prefetch batches.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)

# Seed TensorFlow's global RNG so weight initialisation and dropout masks are
# repeatable across runs (the data split above already uses the same `seed`).
tf.random.set_seed(seed)

#model
# Embedding -> 1D convolution -> global average pooling -> small dense head.
# The final Dense(1) has no activation, so the model outputs raw logits.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features+1,16),
    tf.keras.layers.Conv1D(8,7,activation="relu"),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(8,activation="relu"),
    tf.keras.layers.Dense(1)
])

# from_logits=True matches the activation-free output layer above.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(train_ds,validation_data=val_ds,epochs=10)

## Testing
print("\n")
loss,accuracy = model.evaluate(test_ds)
print(f"Loss : {loss}  ; Accuracy : {accuracy}")


# Predict one test batch. Outputs are logits, so thresholding at 0 is
# equivalent to thresholding the sigmoid probability at 0.5.
ans = np.where(model.predict(test_ds.take(1)) > 0,1,0)
print("\n",ans)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Loss : 0.4515586793422699  ; Accuracy : 0.8481600284576416

 [[1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]]
