In [14]:
import os
import shutil
import random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import keras

### Binary classification for positive negative.

### Input: IMDB

In [11]:
#IMDB DATASET

# Function to load reviews from a directory
def load_reviews(directory, label):
    reviews = []
    labels = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
                reviews.append(f.read())
                labels.append(label)
    return reviews, labels

#set directories wihtin GitHub!
train_dir = "data/aclImdb/train"
test_dir = "data/aclImdb/test"
train_pos_reviews, train_pos_labels = load_reviews(os.path.join(train_dir, 'pos'), 1)
train_neg_reviews, train_neg_labels = load_reviews(os.path.join(train_dir, 'neg'), 0)

# Load test data
test_pos_reviews, test_pos_labels = load_reviews(os.path.join(test_dir, 'pos'), 1)
test_neg_reviews, test_neg_labels = load_reviews(os.path.join(test_dir, 'neg'), 0)

# Combine positive and negative reviews
train_reviews = train_pos_reviews + train_neg_reviews
train_labels = train_pos_labels + train_neg_labels
test_reviews = test_pos_reviews + test_neg_reviews
test_labels = test_pos_labels + test_neg_labels

# Shuffle the data
train_data = list(zip(train_reviews, train_labels))
test_data = list(zip(test_reviews, test_labels))
random.shuffle(train_data)
random.shuffle(test_data)


train_reviews, train_labels = zip(*train_data)
test_reviews, test_labels = zip(*test_data)

# Convert to numpy arrays
train_reviews = np.array(train_reviews)
train_labels = np.array(train_labels)
test_reviews = np.array(test_reviews)
test_labels = np.array(test_labels)

## word embedding layer

In [17]:
#Miguels approach for reading the data

# Base path for the dataset
dataset_path = 'data/aclImdb/'

train_dataset = keras.utils.text_dataset_from_directory(os.path.expanduser(dataset_path), batch_size=32)
valid_dataset = keras.utils.text_dataset_from_directory(os.path.expanduser(dataset_path), batch_size=32)

Found 100005 files belonging to 2 classes.
Found 100005 files belonging to 2 classes.


In [12]:
#Load the pretrained embeddings
path_to_glove_file = "data/glove.6B/glove.6B.100d.txt"
embeddings_index = {}
with open(os.path.expanduser(path_to_glove_file)) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [18]:
max_length = 600
max_tokens = 20000

tokenizer = keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=max_length, output_mode="int")

train_dataset_text_only = train_dataset.map(lambda x, y: x)

tokenizer.adapt(train_dataset_text_only)

train_dataset_int = train_dataset.map(lambda x, y: (tokenizer(x), y), num_parallel_calls=4)

valid_dataset_int = valid_dataset.map(lambda x, y: (tokenizer(x), y), num_parallel_calls=4)

embedding_dim = 100

vocabulary = tokenizer.get_vocabulary()

word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


embedding_layer = keras.layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

2025-05-03 13:05:06.204865: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [19]:
inputs = keras.Input(shape=(None,), dtype="int64")

embedded = embedding_layer(inputs)
x = keras.layers.Bidirectional(keras.layers.LSTM(32))(embedded)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary() 

callbacks = [keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)]

model.fit(train_dataset_int, validation_data=valid_dataset_int, epochs=20, callbacks=callbacks)


Epoch 1/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 153ms/step - accuracy: 0.7489 - loss: 0.5709 - val_accuracy: 0.7500 - val_loss: 0.5622
Epoch 2/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1089s[0m 348ms/step - accuracy: 0.7512 - loss: 0.5633 - val_accuracy: 0.7500 - val_loss: 0.5610
Epoch 3/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2351s[0m 752ms/step - accuracy: 0.7512 - loss: 0.5626 - val_accuracy: 0.7500 - val_loss: 0.5603
Epoch 4/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 156ms/step - accuracy: 0.7514 - loss: 0.5615 - val_accuracy: 0.7500 - val_loss: 0.5600
Epoch 5/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 155ms/step - accuracy: 0.7513 - loss: 0.5610 - val_accuracy: 0.7500 - val_loss: 0.5591
Epoch 6/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m486s[0m 155ms/step - accuracy: 0.7512 - loss: 0.5613 - val_accuracy: 0.7500 - val_los

<keras.src.callbacks.history.History at 0x1758519a0>

In [20]:
model.save("LSTM_IMDB_model.keras")

In [21]:
# CNN‑based sentiment classifier (assumes `embedding_layer`,
# `train_dataset_int`, and `valid_dataset_int` already exist)

from tensorflow import keras
from tensorflow.keras import layers

# ── Model definition ──────────────────────────────────────────────
inputs = keras.Input(shape=(None,), dtype="int64")
x = embedding_layer(inputs)

# parallel n‑gram feature extractors
c3 = layers.Conv1D(128, 3, activation="relu")(x)
c4 = layers.Conv1D(128, 4, activation="relu")(x)
c5 = layers.Conv1D(128, 5, activation="relu")(x)

# global max‑pool each feature map
p3 = layers.GlobalMaxPooling1D()(c3)
p4 = layers.GlobalMaxPooling1D()(c4)
p5 = layers.GlobalMaxPooling1D()(c5)

x = layers.concatenate([p3, p4, p5])
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

# ── Training ──────────────────────────────────────────────────────
callbacks = [
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
]

model.fit(
    train_dataset_int,
    validation_data=valid_dataset_int,
    epochs=20,
    callbacks=callbacks
)

Epoch 1/20




[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 125ms/step - accuracy: 0.7326 - loss: 0.6071 - val_accuracy: 0.7500 - val_loss: 0.5557
Epoch 2/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 69ms/step - accuracy: 0.7513 - loss: 0.5624 - val_accuracy: 0.7504 - val_loss: 0.5504
Epoch 3/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 73ms/step - accuracy: 0.7511 - loss: 0.5572 - val_accuracy: 0.7519 - val_loss: 0.5318
Epoch 4/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 70ms/step - accuracy: 0.7522 - loss: 0.5482 - val_accuracy: 0.7608 - val_loss: 0.5144
Epoch 5/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 72ms/step - accuracy: 0.7555 - loss: 0.5353 - val_accuracy: 0.7671 - val_loss: 0.4912
Epoch 6/20
[1m3126/3126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 77ms/step - accuracy: 0.7603 - loss: 0.5233 - val_accuracy: 0.7814 - val_loss: 0.4695
Epoch 7/

<keras.src.callbacks.history.History at 0x175850710>

In [22]:
model.save("CNN_IMDB_model.keras")