In [18]:
import os, shutil, pathlib, random

base_dir = pathlib.Path('/Users/harivinayak/Interactive Data 25/aclImdb')
val_dir  = base_dir / "validation"
train_dir = base_dir / "train"

# Move 5,000 samples per class into validation
for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok=True)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    val_files = files[:5000]
    for fname in val_files:
        shutil.move(
            str(train_dir / category / fname),
            str(val_dir / category / fname)
        )

# Create small training set of 100 total (50 neg / 50 pos)
train_dir_1 = base_dir / "train1"
for category in ("neg", "pos"):
    os.makedirs(train_dir_1 / category, exist_ok=True)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    train_files = files[:50]
    for fname in train_files:
        shutil.move(
            str(train_dir / category / fname),
            str(train_dir_1 / category / fname)
        )


In [20]:
from tensorflow import keras

batch_size = 32
training   = keras.utils.text_dataset_from_directory(train_dir_1, batch_size=batch_size)
validation = keras.utils.text_dataset_from_directory(val_dir,    batch_size=batch_size)
testing    = keras.utils.text_dataset_from_directory(base_dir/"test", batch_size=batch_size)



Found 200 files belonging to 2 classes.
Found 24800 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [22]:
from tensorflow.keras import layers

max_length = 150
max_tokens = 10000

text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

text_only_train_ds = training.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = training.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)
int_val_ds = validation.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)
int_test_ds = testing.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)


In [24]:
from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True
)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.summary()


In [26]:
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "embeddings_bidir_lstm.keras",
        save_best_only=True
    )
]
history = model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks
)

loss, acc = model.evaluate(int_test_ds)
print("Model's accuracy:", round(acc * 100, 2), "%")



Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7552 - loss: 0.5576 - val_accuracy: 0.5934 - val_loss: 0.6695
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.8177 - loss: 0.4973 - val_accuracy: 0.5873 - val_loss: 0.6768
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.7684 - loss: 0.5215 - val_accuracy: 0.5922 - val_loss: 0.6726
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.7794 - loss: 0.5170 - val_accuracy: 0.5115 - val_loss: 0.8419
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7192 - loss: 0.5450 - val_accuracy: 0.5434 - val_loss: 0.7496
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.8287 - loss: 0.4421 - val_accuracy: 0.5708 - val_loss: 0.7157
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [28]:
import numpy as np
from tensorflow.keras.initializers import Constant

path_to_glove_file = '/Users/harivinayak/Interactive Data 25/glove.6B.100d.txt'
embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, dtype="float32", sep=" ")

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_dim = 100
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

from tensorflow.keras import layers
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
    mask_zero=True
)


In [30]:
inputs = keras.Input(shape=(None,), dtype="int64")
x = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint(
        "glove_embeddings_sequence_model.keras",
        save_best_only=True
    )
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)


Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.5160 - loss: 0.7222 - val_accuracy: 0.5298 - val_loss: 0.6900
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.4862 - loss: 0.7017 - val_accuracy: 0.5144 - val_loss: 0.6950
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.5717 - loss: 0.6717 - val_accuracy: 0.5258 - val_loss: 0.6914
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.5771 - loss: 0.6704 - val_accuracy: 0.5516 - val_loss: 0.6853
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.6456 - loss: 0.6399 - val_accuracy: 0.5411 - val_loss: 0.6878
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.5542 - loss: 0.6706 - val_accuracy: 0.5624 - val_loss: 0.6820
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x16bbf81d0>

In [31]:
loss, acc = model.evaluate(int_test_ds)
print("Test acc:", round(acc, 3))



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.5707 - loss: 0.6796
Test acc: 0.572


In [32]:
train_ds = keras.utils.text_dataset_from_directory(train_dir_1, batch_size=batch_size)
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

history = model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
loss, acc = model.evaluate(int_test_ds)
print("Model's accuracy:", round(acc * 100, 2), "%")


Found 200 files belonging to 2 classes.
Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.6879 - loss: 0.6110 - val_accuracy: 0.5795 - val_loss: 0.6741
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7822 - loss: 0.5467 - val_accuracy: 0.5856 - val_loss: 0.6717
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7561 - loss: 0.5420 - val_accuracy: 0.5903 - val_loss: 0.6693
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.7111 - loss: 0.5709 - val_accuracy: 0.5907 - val_loss: 0.6703
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.7632 - loss: 0.5240 - val_accuracy: 0.5278 - val_loss: 0.7995
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.7782 - loss: 0.5113 - val_accuracy: 0.6098 - val_loss: 0.6602
Epoch 7/10
[1

In [40]:
# for the 7000 
train_ds = keras.utils.text_dataset_from_directory(
    train_dir_1, batch_size=batch_size
)
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)

history = model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks
)

loss, acc = model.evaluate(int_test_ds)
print("Model's accuracy:", round(acc * 100, 2), "%")



Found 14112 files belonging to 2 classes.
Epoch 1/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 68ms/step - accuracy: 0.9133 - loss: 0.2158 - val_accuracy: 0.8848 - val_loss: 0.3038
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 67ms/step - accuracy: 0.9182 - loss: 0.2064 - val_accuracy: 0.8896 - val_loss: 0.3009
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 64ms/step - accuracy: 0.9278 - loss: 0.1897 - val_accuracy: 0.8868 - val_loss: 0.3136
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 66ms/step - accuracy: 0.9264 - loss: 0.1852 - val_accuracy: 0.8978 - val_loss: 0.2907
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 65ms/step - accuracy: 0.9349 - loss: 0.1726 - val_accuracy: 0.8975 - val_loss: 0.2994
Epoch 6/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 65ms/step - accuracy: 0.9379 - loss: 0.1589 - val_accuracy: 

In [39]:
#14000
train_ds = keras.utils.text_dataset_from_directory(
    train_dir_1, batch_size=batch_size
)
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)

history = model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks
)

loss, acc = model.evaluate(int_val_ds)
print(f"Validation Accuracy (7,100): {acc * 100:.2f}%")


Found 14112 files belonging to 2 classes.
Epoch 1/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 67ms/step - accuracy: 0.8553 - loss: 0.3328 - val_accuracy: 0.8538 - val_loss: 0.3351
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 67ms/step - accuracy: 0.8583 - loss: 0.3249 - val_accuracy: 0.8601 - val_loss: 0.3242
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 65ms/step - accuracy: 0.8684 - loss: 0.3119 - val_accuracy: 0.8553 - val_loss: 0.3385
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 65ms/step - accuracy: 0.8747 - loss: 0.2953 - val_accuracy: 0.8634 - val_loss: 0.3260
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 69ms/step - accuracy: 0.8800 - loss: 0.2872 - val_accuracy: 0.8665 - val_loss: 0.3180
Epoch 6/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 68ms/step - accuracy: 0.8859 - loss: 0.2734 - val_accuracy: 