In [9]:
import pandas as pd
import numpy as np
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this cell runs is hidden from the notebook.
warnings.simplefilter("ignore")

## Load Dataset

In [11]:
# Read the review dataset from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="./IMDB_Dataset.csv")

In [12]:
# Peek at the first rows to confirm the file loaded with the expected columns.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

In [15]:
# Look at the last rows as well, to check the file was read through to the end.
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [16]:
# Count the reviews per sentiment label to check the class balance.
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## Label Encoding (positive → 1, negative → 0)

In [17]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
#
# The column is assigned back instead of calling replace(..., inplace=True) on
# the whole frame, so the cell stays safe to re-run (a second run finds no
# strings left to replace and the cast is a no-op).
# The explicit astype pins an integer dtype rather than relying on replace()
# to silently downcast the object column, which newer pandas releases
# deprecate. It also fails loudly if a label other than the two mapped ones
# is present, instead of leaving stray strings in the target column.
data["sentiment"] = (
    data["sentiment"]
    .replace({"positive": 1, "negative": 0})
    .astype("int64")
)

In [18]:
# Confirm the sentiment column now shows the integer codes instead of text labels.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [21]:
# Hold out 20% of the rows for testing; random_state is fixed so the same
# partition is produced on every run.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [22]:
# Sizes of the two partitions produced by the split.
train_data.shape, test_data.shape

((40000, 2), (10000, 2))

In [23]:
# Build the vocabulary from the training reviews only, so nothing from the
# test partition influences the word index. num_words=5000 limits the
# vocabulary used when texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=train_data["review"])

In [26]:
def _reviews_to_padded(reviews):
    """Turn raw review texts into fixed-width integer sequences.

    Each review is mapped to word indices with the fitted ``tokenizer`` and
    the result is passed through ``pad_sequences`` with ``maxlen=200``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=200)


# Encode both partitions with the same tokenizer and the same length.
X_train = _reviews_to_padded(train_data["review"])
X_test = _reviews_to_padded(test_data["review"])


In [27]:
# Display the padded training matrix (bare expression so the notebook renders it).
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]],
      shape=(40000, 200), dtype=int32)

In [28]:
# Display the padded test matrix (bare expression so the notebook renders it).
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]],
      shape=(10000, 200), dtype=int32)

In [29]:
# Targets for each partition: the encoded sentiment column.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [30]:
# Display the training labels; the Series keeps the row index of the split frame.
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [34]:
# Binary sentiment classifier: embedding -> LSTM -> single sigmoid unit.
# input_dim=5000 matches Tokenizer(num_words=5000) and input_length=200
# matches the maxlen used when the reviews are padded.
# NOTE(review): recent Keras releases may report `input_length` as deprecated;
# confirm against the installed version before removing it.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile as part of building the model. Training requires a compiled model,
# and without this call the notebook only worked when a compile cell had been
# executed out of order in the same kernel; on Restart & Run All the fit cell
# would fail. Binary cross-entropy pairs with the sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [37]:
# Train for 5 epochs in mini-batches of 64, holding back 20% of the training
# rows for validation. The call stays the last expression of the cell so the
# returned History object is shown as the cell output.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 319ms/step - accuracy: 0.8105 - loss: 0.4287 - val_accuracy: 0.7757 - val_loss: 0.4840
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 309ms/step - accuracy: 0.8620 - loss: 0.3380 - val_accuracy: 0.8469 - val_loss: 0.3572
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 319ms/step - accuracy: 0.8793 - loss: 0.2963 - val_accuracy: 0.8662 - val_loss: 0.3199
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 323ms/step - accuracy: 0.8830 - loss: 0.2850 - val_accuracy: 0.8520 - val_loss: 0.3357
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 318ms/step - accuracy: 0.9093 - loss: 0.2335 - val_accuracy: 0.8784 - val_loss: 0.3230


<keras.src.callbacks.history.History at 0x1ca4d861810>

In [38]:
# Configure optimizer, loss and reported metric for the model.
# NOTE(review): this cell sits after the fit cell in the notebook. Training
# needs a compiled model, so on a fresh kernel the compile step has to happen
# before fit. Compiling again at this point presumably keeps the trained
# weights but starts from a fresh optimizer state; confirm before further training.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

In [40]:
# Score the model on the held-out test partition; evaluate returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.8828 - loss: 0.3157


## Building Predictive System

In [41]:
def predictive_system(review):
    """Print the predicted sentiment of a single review text.

    The review is converted to word indices with the fitted ``tokenizer``,
    padded to 200 tokens (the same ``maxlen`` used for the training data) and
    passed to ``model.predict``. A score above 0.5 is reported as positive,
    anything else as negative.

    Args:
        review: Raw review text to classify.

    Returns:
        None. The verdict is written to stdout.
    """
    encoded = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=200)
    score = model.predict(encoded)
    verdict = "Positive Sentiment" if score > 0.5 else "Negative Sentiment"
    print(verdict)

In [42]:
# Smoke-test the predictor with one clearly positive and one clearly negative review.
for sample_review in (
    "The movie was fantastic! I really loved it.",
    "The movie was terrible. I hated every moment of it.",
):
    predictive_system(sample_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 437ms/step
Positive Sentiment
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Negative Sentiment


In [43]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath="model.h5")



In [44]:
import joblib
# Persist the fitted tokenizer alongside the model so that inference can
# reproduce the same word-to-index mapping. The call stays the last expression
# so the list of written files is shown as the cell output.
joblib.dump(value=tokenizer, filename="tokenizer.pkl")

['tokenizer.pkl']