In [2]:
# Sentiment Analysis with RNN - Full Pipeline using IMDB Dataset

import pandas as pd
import numpy as np
import re
import pickle

from tensorflow.keras.models import Sequential  # type: ignore
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional  # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore
from tensorflow.keras.datasets import imdb  # type: ignore
from tensorflow.keras.models import load_model  # type: ignore

In [None]:
# Load IMDB dataset
vocab_size = 10000
maxlen = 200
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

# Get word index and reverse it for decoding
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

# Decode integer to text (for preprocessing reviews)
def decode_review(encoded):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded if i >= 3])

# Preprocess: pad sequences
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

# Save word index
with open("word_index.pkl", "wb") as f:
    pickle.dump(word_index, f)

# Model definition with Bidirectional LSTM for improved context understanding
model = Sequential([
    Embedding(vocab_size, 64),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model with increased epochs and silent output
model.fit(
    X_train,
    y_train,
    epochs=1,
    batch_size=16384,
    validation_data=(X_test, y_test),
    verbose=0
)

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 90ms/step - accuracy: 0.7216 - loss: 0.5123 - val_accuracy: 0.8483 - val_loss: 0.3554
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 107ms/step - accuracy: 0.9111 - loss: 0.2350 - val_accuracy: 0.8688 - val_loss: 0.3189
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 98ms/step - accuracy: 0.9349 - loss: 0.1734 - val_accuracy: 0.8677 - val_loss: 0.3189
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 117ms/step - accuracy: 0.9533 - loss: 0.1325 - val_accuracy: 0.8542 - val_loss: 0.4351
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 104ms/step - accuracy: 0.9638 - loss: 0.1036 - val_accuracy: 0.8636 - val_loss: 0.4224
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 103ms/step - accuracy: 0.9750 - loss: 0.0754 - val_accuracy: 0.8534 - val_loss: 0.5416
Epoch 7/10


In [12]:
# Load word index for inference
def load_word_index():
    with open("word_index.pkl", "rb") as f:
        return pickle.load(f)

# Encode custom review using IMDB word index
def encode_review_custom(text, word_index, maxlen=200, vocab_limit=10000):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = text.split()
    encoded = [word_index.get(word, 2) for word in tokens]  # 2 = OOV token

    # Limit to vocab range (as per embedding input_dim)
    encoded = [i if i < vocab_limit else 2 for i in encoded]
    return pad_sequences([encoded], maxlen=maxlen)

# Predict on new reviews
custom_reviews = [
    "The movie was barely satisfactory",
    "I loved it just a little bit.",
    "Amazing movie! Definitely worth watching.",
    "It was fine, but nothing special or thrilling.",
    "I didn’t like it at all and wouldn’t tell others to watch it.",
    "I hated it only a little.",
    "The movie had amazing characters"
]

word_index = load_word_index()
knowledge_graphs = load_model("sentiment_rnn_model_imdb.keras")
results = []

for idx, review in enumerate(custom_reviews, start=1):
    encoded = encode_review_custom(review, word_index)
    prediction = knowledge_graphs.predict(encoded, verbose=0)
    score = float(prediction[0][0])
    if score > 0.65:
        sentiment = "Positive 😄"
    elif score < 0.35:
        sentiment = "Negative 😞"
    else:
        sentiment = "Neutral 😐"
    results.append({
        "S.No": idx,
        "Review": review,
        "Sentiment Score": f"{score:.2%}",
        "Sentiment": sentiment
    })

result_df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(result_df)

Unnamed: 0,S.No,Review,Sentiment Score,Sentiment
0,1,The movie was barely satisfactory,3.36%,Negative 😞
1,2,I loved it just a little bit.,51.06%,Neutral 😐
2,3,Amazing movie! Definitely worth watching.,99.51%,Positive 😄
3,4,"It was fine, but nothing special or thrilling.",0.77%,Negative 😞
4,5,I didn’t like it at all and wouldn’t tell others to watch it.,1.30%,Negative 😞
5,6,I hated it only a little.,57.96%,Neutral 😐
6,7,The movie had amazing characters,97.80%,Positive 😄
