In [1]:
import os
import tensorflow as tf
from tensorflow import keras
import pandas as pd

# Base path for the dataset
dataset_path = 'data/aclImdb'

valid_dataset = keras.utils.text_dataset_from_directory(os.path.expanduser(dataset_path), batch_size=32)    #batch size needs to be changed here

# 1. Prepare text data from dataset
texts = []
labels = []

for text_batch, label_batch in valid_dataset:
    for text, label in zip(text_batch.numpy(), label_batch.numpy()):
        texts.append(text.decode('utf-8'))
        labels.append(label)

# Create DataFrame
validation_imdb = pd.DataFrame({
    'text': texts,
    'label': labels
})

print(validation_imdb.head())
print(validation_imdb.shape)

Found 100005 files belonging to 2 classes.


2025-05-18 17:06:12.303972: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


                                                text  label
0  I really think that Ms. Crawford ("Eva Phillip...      1
1  I missed the first 10 or so minutes of the mov...      1
2  Thank goodness not all Dutch people are that r...      0
3  I would like to start by saying I can only hop...      0
4  I watched this a few days ago, so details are ...      1
(100005, 2)


In [None]:
validation_imdb.shape

In [19]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# Initialize parameters
max_words = 10000
max_len = 100

# Initialize tokenizer
tokenizer = Tokenizer(num_words=max_words)

# Load the saved model
model = tf.keras.models.load_model('imdb_gru.keras')

#7. Prediction Pipeline
def prediction_pipeline(text, model, tokenizer, max_len):
    """
    Pipeline function that handles all preprocessing steps and returns the sentiment.
    
    Args:
        text (str): Input text to predict
        model: Trained model
        tokenizer: Tokenizer instance
        max_len: Maximum sequence length
    Returns:
        str: Either "positive" or "negative" sentiment
    """
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded, verbose=0)[0][0]
    return "positive" if prediction > 0.5 else "negative"


In [20]:
# Apply prediction pipeline to text column in batches for better performance
batch_size = 32
predictions = []

for i in range(0, len(validation_imdb), batch_size):
    batch_texts = validation_imdb['text'].iloc[i:i+batch_size].tolist()
    # Tokenize and pad the entire batch at once
    sequences = tokenizer.texts_to_sequences(batch_texts)
    padded = pad_sequences(sequences, maxlen=max_len)
    # Get predictions for the batch
    batch_predictions = model.predict(padded, verbose=0)
    # Convert probabilities to labels
    batch_labels = ['positive' if pred > 0.5 else 'negative' for pred in batch_predictions]
    predictions.extend(batch_labels)

# Assign predictions to new column
validation_imdb['pred_model_1'] = predictions

In [None]:
validation_imdb

In [42]:
import pickle

# Make sure the file exists
with open("tokenizer1.pkl", "rb") as f:
    tokenizer = pickle.load(f)

print("Top 10 tokens:", list(tokenizer.word_index.items())[:10])

Top 10 tokens: [('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5), ('is', 6), ('br', 7), ('in', 8), ('it', 9), ('i', 10)]


In [38]:
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x163b2ea20>

In [43]:
from tensorflow.keras.preprocessing.text import Tokenizer




#7. Prediction Pipeline
def prediction_pipeline2(text, model, tokenizer, max_len):
    """
    Pipeline function that handles all preprocessing steps and returns the sentiment.
    
    Args:
        text (str): Input text to predict
        model: Trained model
        tokenizer: Tokenizer instance
        max_len: Maximum sequence length
    Returns:
        str: Either "positive" or "negative" sentiment
    """
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    print(sequence)
    prediction = model.predict(padded, verbose=0)
    return prediction

In [44]:
prediction_pipeline2('FUCK THIS MOVIE', model, tokenizer, max_len)

[[11, 17]]


array([[0.7108332]], dtype=float32)

In [45]:
prediction_pipeline2('I LOVE THIS', model, tokenizer, max_len)

[[10, 118, 11]]


array([[0.7222979]], dtype=float32)

In [46]:
prediction_pipeline2('WORST MOVIE EVER', model, tokenizer, max_len)

[[255, 17, 129]]


array([[0.7158122]], dtype=float32)