In [3]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [4]:
df = pd.read_csv(r"C:\Users\kalai\Downloads\IMDB_cleaned.csv")

In [5]:
print("Dataset Shape:", df.shape)
print(df.head())


Dataset Shape: (5000, 3)
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review sentiment  
0  one of the other reviewers has mentioned that ...  positive  
1  a wonderful little production the filming tech...  positive  
2  i thought this was a wonderful way to spend ti...  positive  
3  basically there is a family where a little boy...  negative  
4  petter mattei s love in the time of money is a...  positive  


In [6]:
def clean_text(text):
    text = str(text).lower()                              # Lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)               # Remove punctuation & numbers
    text = re.sub(r"\s+", " ", text).strip()              # Remove extra spaces
    return text

df["cleaned_review"] = df["cleaned_review"].apply(clean_text)

In [7]:
label_encoder = LabelEncoder()
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_review"], df["sentiment"],
    test_size=0.2, random_state=42
)

In [9]:
max_words = 10000     # Vocabulary size
max_len = 200         # Max sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')


In [10]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),         # Embedding layer
    Bidirectional(LSTM(128, return_sequences=False)),        # Bidirectional LSTM
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')                          # Output layer (binary classification)
])

In [11]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [12]:
model.summary()

In [13]:
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_pad, y_test),
    verbose=1
)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 731ms/step - accuracy: 0.5208 - loss: 0.6918 - val_accuracy: 0.6670 - val_loss: 0.6566
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 756ms/step - accuracy: 0.6348 - loss: 0.6647 - val_accuracy: 0.6720 - val_loss: 0.6292
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 768ms/step - accuracy: 0.7378 - loss: 0.5448 - val_accuracy: 0.7550 - val_loss: 0.5031
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 755ms/step - accuracy: 0.8526 - loss: 0.4057 - val_accuracy: 0.7990 - val_loss: 0.5191
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 762ms/step - accuracy: 0.9418 - loss: 0.1749 - val_accuracy: 0.8120 - val_loss: 0.5705
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 782ms/step - accuracy: 0.9744 - loss: 0.0916 - val_accuracy: 0.8010 - val_loss: 0.6577
Epoch 7/10
[1m63/63[

In [14]:
 loss, acc = model.evaluate(X_test_pad, y_test)
print(f"\n✅ Test Accuracy: {acc:.4f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.7980 - loss: 0.7603

✅ Test Accuracy: 0.7870


In [15]:
def predict_review(review):
    review_clean = clean_text(review)
    seq = tokenizer.texts_to_sequences([review_clean])
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(pad)[0][0]

    # Adjust threshold to 0.6 for better separation
    sentiment = "Positive 😀" if pred > 0.6 else "Negative 😞"
    print(f"\nReview: {review}")
    print(f"Prediction: {sentiment} (Confidence: {pred:.2f})")

In [16]:
predict_review("The film was boring and too long.")
predict_review("Absolutely fantastic movie with great acting!")
predict_review("The plot was weak, but the visuals were stunning.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 660ms/step

Review: The film was boring and too long.
Prediction: Negative 😞 (Confidence: 0.13)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step

Review: Absolutely fantastic movie with great acting!
Prediction: Positive 😀 (Confidence: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step

Review: The plot was weak, but the visuals were stunning.
Prediction: Positive 😀 (Confidence: 0.96)


In [18]:
predict_review("The film is very boring poor cgi")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step

Review: The film is very boring poor cgi
Prediction: Negative 😞 (Confidence: 0.06)


In [19]:
predict_review("The length of the film is long and no story very poo writing")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step

Review: The length of the film is long and no story very poo writing
Prediction: Negative 😞 (Confidence: 0.14)


In [20]:
predict_review("very good writing and acting")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step

Review: very good writing and acting
Prediction: Positive 😀 (Confidence: 0.73)
