In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
data=pd.read_csv("IMDB_Dataset.csv")

In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data.shape

(50000, 2)

In [6]:
type(data)

pandas.core.frame.DataFrame

In [7]:
data.tail

<bound method NDFrame.tail of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [9]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [35]:
data.replace ({"sentiment": {"positive": 1,"negative": 0}}, inplace=True)

In [36]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Embedding, LSTM 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [37]:
train_data, test_data=train_test_split(data, test_size=0.2, random_state=42)

In [38]:
train_data.size

80000

In [39]:
train_data.shape

(40000, 2)

In [40]:
test_data.shape

(10000, 2)

In [41]:
tokenizer=Tokenizer(num_words=4500)
tokenizer.fit_on_texts(train_data["review"])

In [42]:
x_train=pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
x_test=pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [43]:
print(x_train)

[[   6    1  727 ...  205  351 3856]
 [  26 2113   14 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [44]:
y_train=train_data["sentiment"]
y_test=test_data["sentiment"]

In [45]:
print(y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


In [46]:
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [50]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
model.build(input_shape=(None, 200))
model.summary()

In [51]:
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["accuracy"])

In [52]:
model.fit(x_train, y_train, epochs = 5, batch_size = 64, validation_split = 0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 274ms/step - accuracy: 0.7767 - loss: 0.4727 - val_accuracy: 0.8225 - val_loss: 0.4131
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 276ms/step - accuracy: 0.8345 - loss: 0.3845 - val_accuracy: 0.8389 - val_loss: 0.3751
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 275ms/step - accuracy: 0.8457 - loss: 0.3592 - val_accuracy: 0.8547 - val_loss: 0.3568
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 272ms/step - accuracy: 0.8863 - loss: 0.2806 - val_accuracy: 0.8781 - val_loss: 0.2964
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 312ms/step - accuracy: 0.9032 - loss: 0.2450 - val_accuracy: 0.8726 - val_loss: 0.3073


<keras.src.callbacks.history.History at 0x120e6e510>

In [54]:
model.save("IMDB_Model.keras")

In [60]:
import joblib 
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [61]:
loss, accuracy = model.evaluate(x_test, y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step - accuracy: 0.8794 - loss: 0.2991


In [62]:
print(loss)

0.29906630516052246


In [63]:
print(accuracy)

0.8794000148773193


In [64]:
#Building a predictive system

In [70]:
def predictive_system(review):
    sequences = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequences, maxlen=200)
    prediction = model.predict(padded_sequence)
    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
    return sentiment
predictive_system("This movie was shitty")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


'negative'

In [71]:
predictive_system("This movie was crazy")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'negative'