In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow .keras.models import Sequential
from tensorflow .keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow .keras.preprocessing.sequence import pad_sequences
import pickle

In [32]:
#load dataset
data=pd.read_csv("IMDB Dataset.csv")

In [33]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [34]:
#replace sentiment labels with binary values
data.replace({"sentiment":{"positive":1,"negative":0}},inplace=True)

  data.replace({"sentiment":{"positive":1,"negative":0}},inplace=True)


In [35]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [36]:
#split the dataset
train_data,test_data=train_test_split(data,test_size=0.2,random_state=43)

In [37]:
#tokenize text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

In [38]:
# save the tokenizer
with open("tokenizer.pkl","wb") as f:
    pickle.dump(tokenizer,f)

In [40]:
#prepare training and testing data
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]),maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]),maxlen=200)
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]


In [44]:
#build the model

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))



In [45]:
#compile the model
model.compile(optimizer="adam",loss="binary_crossentropy", metrics=["accuracy"])

In [46]:
#Train the model
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 252ms/step - accuracy: 0.7322 - loss: 0.5268 - val_accuracy: 0.8501 - val_loss: 0.3621
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m402s[0m 804ms/step - accuracy: 0.8541 - loss: 0.3514 - val_accuracy: 0.8410 - val_loss: 0.4028
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 241ms/step - accuracy: 0.8628 - loss: 0.3320 - val_accuracy: 0.8711 - val_loss: 0.3212
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 320ms/step - accuracy: 0.8828 - loss: 0.2912 - val_accuracy: 0.8761 - val_loss: 0.3154
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 249ms/step - accuracy: 0.8967 - loss: 0.2655 - val_accuracy: 0.8545 - val_loss: 0.3798


<keras.src.callbacks.history.History at 0x1e1e995c150>

In [48]:
#Evaluate the model
loss,accuracy = model.evaluate(X_test,Y_test)
print(f"Test Loss: {loss}")
print(f"test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 48ms/step - accuracy: 0.8487 - loss: 0.3927
Test Loss: 0.39794373512268066
test Accuracy: 0.8460999727249146


In [49]:
#save the model
model.save("model.h5")

