In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  


In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
stop_words = set(stopwords.words('english')) - {"not", "no", "nor"}

# Load dataset
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned'] = df['review'].apply(clean_text)

# Tokenization and Padding
MAX_LEN = 200
MAX_VOCAB = 20000

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned'])

X = tokenizer.texts_to_sequences(df['cleaned'])
X = pad_sequences(X, maxlen=MAX_LEN, padding='post', truncating='post')
y = df['sentiment'].values

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

model = Sequential([
    Embedding(MAX_VOCAB, 128, input_length=MAX_LEN),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    LSTM(32),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint = ModelCheckpoint("best_lstm_model_finetuned.h5", save_best_only=True, monitor="val_accuracy", mode="max")
early_stop = EarlyStopping(patience=6, monitor="val_loss", restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[checkpoint, early_stop]
)


Epoch 1/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 27ms/step - accuracy: 0.5110 - loss: 0.6931 - val_accuracy: 0.5262 - val_loss: 0.6881
Epoch 2/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.6219 - loss: 0.6352 - val_accuracy: 0.8294 - val_loss: 0.3875
Epoch 3/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.8729 - loss: 0.3279 - val_accuracy: 0.8864 - val_loss: 0.2895
Epoch 4/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9303 - loss: 0.1974 - val_accuracy: 0.8832 - val_loss: 0.3119
Epoch 5/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9598 - loss: 0.1311 - val_accuracy: 0.8851 - val_loss: 0.3596
Epoch 6/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9780 - loss: 0.0843 - val_accuracy: 0.8766 - val_loss: 0.3856
Epoch 7/20
[1m6

In [8]:
import pickle

with open('tokenizer_finetuned.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
import pickle
import numpy as np
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english')) - {"not", "no", "nor"}

MAX_LEN = 200

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)

# Load model & tokenizer
model = load_model("best_lstm_model_finetuned.h5")
with open("tokenizer_finetuned.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Predict function
def predict_sentiment(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=MAX_LEN, padding='post')
    pred = model.predict(padded)[0][0]
    label = "Positive" if pred >= 0.5 else "Negative"
    confidence = pred if pred >= 0.5 else 1 - pred
    return label, round(confidence, 2)

sample_text = "I absolutely loved the movie! Brilliant acting and direction."
label, confidence = predict_sentiment(sample_text)
print(f"Sentiment: {label} (Confidence: {confidence})")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417ms/step
Sentiment: Positive (Confidence: 0.9200000166893005)
