In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Dropout,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
data = pd.read_csv("/content/IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [32]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")
sw = stopwords.words("english")

wn = WordNetLemmatizer()
def main(text):
  text = re.sub(r"https?://\S+"," ",text)  #remove punctuation
  text = re.sub(r'<.*?>', ' ', text)  #remove html tags
  text = re.sub(r'[^a-zA-Z\s]', " ", text)  #remove special characterstic
  text = text.lower().split()
  text = " ".join(wn.lemmatize(word)for word in text if word not in sw)
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
data["review"] = data["review"].apply(main)

In [33]:
data["review"].apply(main)

Unnamed: 0,review
0,one reviewer mentioned watching oz episode hoo...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically family little boy jake think zombie ...
4,petter mattei love time money visually stunnin...
...,...
49995,thought movie right good job creative original...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary school nu...
49998,going disagree previous comment side maltin on...


In [35]:
data.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [36]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [37]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["sentiment"] = le.fit_transform(data["sentiment"])

In [38]:
data.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1


In [39]:
x = data["review"]
y = data["sentiment"]

In [40]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,y_train.shape

((40000,), (40000,))

## Preprocessing

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer  #the Tokenizer with num_words=5000, it limits the vocabulary size to the top 5000 most frequent words in your dataset.
tk = Tokenizer(num_words=5000,oov_token="Word_not_found")
tk.fit_on_texts(x_train)

x_train_seq = tk.texts_to_sequences(x_train)
x_test_seq = tk.texts_to_sequences(x_test)
print(len(tk.word_index))

81798


In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences #maxlen=200 sets the maximum length for each sequence. It ensures that all input sequences have exactly 200 words (or tokens) by either:Truncating longer sequences ,Padding shorter sequences
x_train_padded = pad_sequences(x_train_seq,maxlen=200,padding="post")
x_test_padded = pad_sequences(x_test_seq,maxlen=200,padding="post")

# Model

In [45]:
model =Sequential()
model.add(Embedding(5000,128))
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2,return_sequences=True))
model.add(LSTM(64))
model.add(Dense(1,activation="sigmoid"))
model.build(input_shape=(None,128))
model.summary()

In [46]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
callback= EarlyStopping(
    monitor="val_loss",
    min_delta=0.01,
    patience=2,
    mode="auto"
    )

checkpoint = ModelCheckpoint(
    "model.h5",
    monitor="val_accuracy",
    save_best_only=True,
    mode="max"
)

In [47]:
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
history = model.fit(x_train_padded,y_train,validation_split=0.2,epochs=10,callbacks=[callback,checkpoint])

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349ms/step - accuracy: 0.5046 - loss: 0.6954



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 378ms/step - accuracy: 0.5046 - loss: 0.6954 - val_accuracy: 0.5225 - val_loss: 0.6924
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349ms/step - accuracy: 0.5414 - loss: 0.6841



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 390ms/step - accuracy: 0.5414 - loss: 0.6841 - val_accuracy: 0.5882 - val_loss: 0.6747
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344ms/step - accuracy: 0.6288 - loss: 0.6318



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 385ms/step - accuracy: 0.6289 - loss: 0.6317 - val_accuracy: 0.8636 - val_loss: 0.3505
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step - accuracy: 0.8736 - loss: 0.3253



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 387ms/step - accuracy: 0.8736 - loss: 0.3253 - val_accuracy: 0.8736 - val_loss: 0.3105
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 345ms/step - accuracy: 0.9072 - loss: 0.2520



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 386ms/step - accuracy: 0.9072 - loss: 0.2520 - val_accuracy: 0.8794 - val_loss: 0.2927
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step - accuracy: 0.9215 - loss: 0.2174



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 388ms/step - accuracy: 0.9215 - loss: 0.2174 - val_accuracy: 0.8809 - val_loss: 0.2991
Epoch 7/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 389ms/step - accuracy: 0.9387 - loss: 0.1836 - val_accuracy: 0.8799 - val_loss: 0.3143


In [48]:
model.save("haseeb_final.h5")



In [49]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 114ms/step - accuracy: 0.8713 - loss: 0.3334
Test Accuracy: 0.8746


In [56]:
sample_review = ["I really loved the movie! The storyline was amazing."]
sample_seq = tk.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_seq, maxlen=200, padding="post")

prediction = model.predict(sample_padded)
sentiment = "Positive" if prediction[0][0] > 0.5 else "Negative"
print(f"Predicted Sentiment: {sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
Predicted Sentiment: Positive
