In [3]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / Keras further down — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Read the IMDB reviews CSV (path is relative to the notebook's working
# directory) into a DataFrame.
file_path = "IMDB Dataset.csv"
data = pd.read_csv(file_path)

# Load Data Set

In [5]:
# Preview the first rows to check that the review / sentiment columns loaded.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Count the reviews per sentiment label to check the class balance.
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [34]:
# Encode the text labels as integers (positive -> 1, negative -> 0) so they can
# be used as binary targets.
# A per-value lookup is used instead of Series.replace, whose implicit
# object -> int downcast is deprecated in recent pandas releases. Values that
# are already encoded pass through unchanged, so re-running this cell is safe,
# and any unexpected or missing label makes the integer cast fail loudly
# instead of silently staying in the column as a string.
sentiment_codes = {"positive": 1, "negative": 0}
data["sentiment"] = (
    data["sentiment"]
    .map(lambda label: sentiment_codes.get(label, label))
    .astype("int64")
)

In [35]:
# Confirm that the sentiment column now holds the numeric labels.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [36]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [37]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [38]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [39]:
# (rows, columns) of the training split.
train_data.shape

(40000, 2)

In [40]:
# (rows, columns) of the held-out test split.
test_data.shape

(10000, 2)

In [41]:
# Build the word index from the training reviews only, so no vocabulary
# information leaks in from the test split. num_words=5000 caps the vocabulary
# that texts_to_sequences will emit.
train_reviews = train_data["review"]
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_reviews)

In [42]:
# Step 1: turn each review into a list of word indices using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Step 2: force every sequence to exactly 200 entries so the batches are
# rectangular (short reviews are zero-padded).
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [43]:
# Inspect the padded training matrix (one row of word indices per review).
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [44]:
# Inspect the padded test matrix (one row of word indices per review).
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [45]:
# Targets are the encoded sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

Y_train

# Model Training

In [46]:
# Take the vocabulary size and sequence length from the preprocessing objects
# instead of repeating the literals, so the network cannot drift out of sync
# with the tokenizer / padding settings.
vocab_size = tokenizer.num_words      # word indices emitted by the tokenizer are < num_words
sequence_length = X_train.shape[1]    # width of the padded input matrix

# Embedding -> LSTM -> single sigmoid unit (binary sentiment classifier).
model = Sequential()
# `input_length` is deprecated (and ignored) by Keras 3, so it is no longer passed here.
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

# Build explicitly so the weights exist and model.summary() can report real
# output shapes and parameter counts before training starts.
model.build(input_shape=(None, sequence_length))

In [47]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [48]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [49]:
# Train for 5 epochs in mini-batches of 64; 20% of the training data is set
# aside by Keras for per-epoch validation.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 473ms/step - accuracy: 0.7803 - loss: 0.4666 - val_accuracy: 0.8191 - val_loss: 0.4018
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 452ms/step - accuracy: 0.8382 - loss: 0.3777 - val_accuracy: 0.8611 - val_loss: 0.3314
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 597ms/step - accuracy: 0.8668 - loss: 0.3260 - val_accuracy: 0.8611 - val_loss: 0.3348
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 936ms/step - accuracy: 0.8895 - loss: 0.2767 - val_accuracy: 0.8661 - val_loss: 0.3496
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 705ms/step - accuracy: 0.9033 - loss: 0.2466 - val_accuracy: 0.8550 - val_loss: 0.3671


<keras.src.callbacks.history.History at 0x1a0776df380>

In [50]:
# Score the trained model on the held-out test split. evaluate returns the
# loss followed by the compiled metrics (here: accuracy).
test_metrics = model.evaluate(X_test, Y_test)
loss, accuracy = test_metrics

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 54ms/step - accuracy: 0.8607 - loss: 0.3514


In [51]:
# Test-set loss returned by model.evaluate.
print(loss)

0.35138267278671265


In [52]:
# Test-set accuracy returned by model.evaluate.
print(accuracy)

0.8607000112533569


# Building Predictive System

In [53]:
def predictive_system(review):
    """Classify a single review as positive or negative.

    The text is preprocessed the same way as the training data: the fitted
    notebook-level ``tokenizer`` converts it to word indices, the sequence is
    padded to 200 entries, and the result is scored by the trained ``model``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        ``"positive"`` when the model's sigmoid output is above 0.5,
        otherwise ``"negative"``.
    """
    # The tokenizer expects a collection of texts, hence the one-element list.
    sequences = tokenizer.texts_to_sequences([review])
    # maxlen must match the value used when padding X_train / X_test.
    padded_sequence = pad_sequences(sequences, maxlen=200)
    # One row per input text, one sigmoid score per row.
    prediction = model.predict(padded_sequence)
    # Label 1 was assigned to positive reviews, so a high score means positive.
    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
    return sentiment

In [54]:
# Spot-check the classifier on a short hand-written review.
predictive_system("This movie was fantastic and amazing")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


'positve'

In [55]:
# Second spot-check with a different hand-written review.
predictive_system("A trilling adventure with stunning visual")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407ms/step


'positve'

In [56]:
# Spot-check with a very short (three-word) input.
predictive_system("A visual masterpiece")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 569ms/step


'positve'

In [57]:
# Spot-check with a critical-sounding review.
predictive_system("Overall long and slow")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 582ms/step


'negative'

# Saving Model

In [58]:
# Persist the trained network to disk.
# NOTE(review): the .h5 suffix selects Keras' legacy HDF5 format; recent Keras
# releases recommend the native ".keras" format — confirm what downstream
# loaders expect before changing the filename.
model.save("model.h5")



In [59]:
# Persist the fitted tokenizer so inference can reuse the exact vocabulary the
# model was trained with.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top. joblib files are pickle-based, so only load "tokenizer.pkl"
# from a trusted source.
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']