In [None]:
#importing libraries
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional

In [4]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
#function to clean text data
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()   # remove HTML tags
    text = re.sub("[^a-zA-Z]", " ", text)                  # keep only letters
    text = text.lower()                                    # lowercase
    return text

data["review"] = data["review"].apply(clean_text)

In [None]:
#encoding target variable
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [None]:
# 4. Train-Test Split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# 5. Tokenization because neural networks work with numbers

MAX_WORDS = 10000   # vocab size
MAX_LEN = 300       # review length

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_data["review"])

In [None]:
# Padding since lstm requires fixed length input
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=MAX_LEN)

Y_train = train_data["sentiment"].values
Y_test = test_data["sentiment"].values

In [None]:
# 6. Build Model

model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN)) #embedding for word vectors
model.add(SpatialDropout1D(0.2)) #spatial dropout to prevent overfitting
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))) #bidirectional lstm layer for context from both directions => better understanding

model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

history = model.fit(
    X_train, Y_train,
    batch_size=64,
    epochs=6,
    validation_split=0.2,
    verbose=1
)

Epoch 1/6




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 1s/step - accuracy: 0.7043 - loss: 0.5481 - val_accuracy: 0.8079 - val_loss: 0.4258
Epoch 2/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 1s/step - accuracy: 0.8157 - loss: 0.4369 - val_accuracy: 0.7809 - val_loss: 0.4624
Epoch 3/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 1s/step - accuracy: 0.8651 - loss: 0.3398 - val_accuracy: 0.8680 - val_loss: 0.3304
Epoch 4/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 1s/step - accuracy: 0.8996 - loss: 0.2663 - val_accuracy: 0.8620 - val_loss: 0.3448
Epoch 5/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 1s/step - accuracy: 0.9001 - loss: 0.2628 - val_accuracy: 0.8528 - val_loss: 0.3674
Epoch 6/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 1s/step - accuracy: 0.9194 - loss: 0.2198 - val_accuracy: 0.8763 - val_loss: 0.3185


In [None]:
# 9. Evaluate Model

loss, accuracy = model.evaluate(X_test, Y_test, verbose=1)
print(f"\n✅ Test Accuracy: {accuracy*100:.2f}%")


# 10. building a prediction function

def predict_sentiment(review):
    review = clean_text(review)#cleaning the review
    seq = tokenizer.texts_to_sequences([review])#tokenizing the review
    padded = pad_sequences(seq, maxlen=MAX_LEN)#padding the review to make it of same length as training data
    pred = model.predict(padded)[0][0]#predicting the sentiment
    sentiment = "Positive 😀" if pred >= 0.5 else "Negative 😞"#since output is given by sigmoid func
    return sentiment, pred

print("\nExample Predictions:")
print(predict_sentiment("This movie was fantastic, I loved it!"))
print(predict_sentiment("Worst movie ever. Waste of time."))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 323ms/step - accuracy: 0.8767 - loss: 0.3102

✅ Test Accuracy: 88.00%

Example Predictions:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 913ms/step
('Positive 😀', np.float32(0.97700953))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414ms/step
('Negative 😞', np.float32(0.0050804415))


In [12]:
# Save the full model to a file
model.save("imdb_sentiment_model.h5")


