In [6]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# Hides the GPU from TensorFlow
tf.config.set_visible_devices([], 'GPU') 

# Base path for the dataset
dataset_path = 'data/aclImdb'

train_dataset = keras.utils.text_dataset_from_directory(os.path.expanduser(dataset_path), batch_size=50)    #batch size needs to be changed here
valid_dataset = keras.utils.text_dataset_from_directory(os.path.expanduser(dataset_path), batch_size=50)    #batch size needs to be changed here


# 1. Prepare text data from dataset
texts = []
labels = []

for text_batch, label_batch in train_dataset:
    for text, label in zip(text_batch.numpy(), label_batch.numpy()):
        texts.append(text.decode('utf-8'))
        labels.append([label]) # Convert to list for consistency

print(f"Number of training examples: {len(texts)}")
print(f"Example text: {texts[0][:100]}...")
print(f"Example label: {labels[0]}")

# 2. Tokenize and pad

max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = np.array(labels)

# 3. Load GloVe embeddings
embedding_dim = 50
embeddings_index = {}

glove_path = 'glove.6B.50d.txt'
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
 
# 4. Prepare embedding matrix
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 100005 files belonging to 2 classes.
Found 100005 files belonging to 2 classes.
Number of training examples: 100005
Example text: this is a below average martial arts films which is worth watching for the comedy value due to the p...
Example label: [0]


In [9]:
## CHANGE FROM HERE ONWARDS

# 5. Build a simple model
# this was the previous model initially in the file
"""model = models.Sequential([
    layers.InputLayer(input_shape=(max_len,)),
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    ),
    layers.Conv1D(64, 3, activation='relu'),  # Add local feature extraction
    layers.GlobalMaxPooling1D(),  # Max pooling captures the most important features
    layers.Dropout(0.2),  # Add regularization to prevent overfitting
    layers.Dense(32, activation='relu'),  # Increase from 16 to 32
    layers.Dense(16, activation='relu'),  # Add another layer
    layers.Dropout(0.2),  # Additional dropout
    layers.Dense(y.shape[1], activation='sigmoid')
])"""

# other model using CNN - the number of filters and kernel size should be tuned (maybe using a for loop)
num_filters = 128  
kernel_size = 3
cnn_model = models.Sequential([
    layers.InputLayer(input_shape=(max_len,)),
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    ),
    layers.Conv1D(num_filters, kernel_size, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(y.shape[1], activation='softmax' if y.shape[1] > 2 else 'sigmoid')
])
cnn_model.compile(optimizer='adam',
              loss='categorical_crossentropy' if y.shape[1] > 2 else 'binary_crossentropy',
              metrics=['accuracy'])
model = cnn_model  # Assign the chosen model to 'model'

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(X, y, epochs=10, verbose=1, validation_split=0.2, callbacks=[early_stop])


"""model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC']  # Added AUC metric for better evaluation
)"""

model.summary()



# 6. Train
#early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
#model.fit(X, y, epochs=10, verbose=1, validation_split=0.2, callbacks=[early_stop])


# 6.B STORE MODEL
model.save('model.keras')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 50)           500000    
                                                                 
 conv1d_3 (Conv1D)           (None, 98, 128)           19328     
                                                                 
 global_max_pooling1d_3 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 dense_13 (Dense)            (None, 32)                4128      
                                                                 
 dense_14 (Dense)            (None, 16)               

This model is another alternative from some models used in the following paper: https://arxiv.org/pdf/1702.01923
Here instead of 50 dimensions for the word embedding they use GloVe for 300 dimensions - their results of accuracy are good but here we are not achieving that performance - I only changed the batch size for the optimal size given in the paper for this model

In [8]:
# other model using GRU alternative - according with the paper: Comparative Study of CNN and RNN for Natural Language Processing the ideal batch size is 50
gru_units = 256
gru_model = models.Sequential([
    layers.InputLayer(input_shape=(max_len,)),
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    ),
    layers.SpatialDropout1D(0.2),  # Dropout for embeddings
    layers.Bidirectional(layers.GRU(gru_units, return_sequences=False)),
    layers.Dropout(0.5),
    layers.Dense(100, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(y.shape[1], activation='sigmoid') 
])
gru_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model = gru_model

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(X, y, epochs=10, verbose=1, validation_split=0.2, callbacks=[early_stop])

model.summary()


Epoch 1/10

KeyboardInterrupt: 