In [1]:
import pandas as pd
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

### Importing Dataset

In [2]:
imdb=pd.read_csv(r"C:\Users\ishan\Desktop\machine learning\movie Recomandation system\dataset\movie reviews\IMDB Dataset.csv")

In [3]:
imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Data Cleaning

In [4]:
imdb["sentiment"]=imdb["sentiment"].apply(lambda x: 1 if x=="positive" else 0)

In [5]:
imdb["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [6]:
imdb["review"].iloc[1].replace("<br />","").replace("\'","").replace("-","").replace(",","").replace(":","")

'A wonderful little production. The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece. The actors are extremely well chosen Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great masters of comedy and his life. The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears. It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwells murals decorating every surface) are terribly well done.'

In [7]:
imdb["review"]=imdb["review"].apply(lambda x: x.replace("<br />","").replace("\'","").replace("-","").replace(",","").replace(":","").replace(".","").replace("(","").replace(")",""))

In [8]:
imdb["review"]

0        One of the other reviewers has mentioned that ...
1        A wonderful little production The filming tech...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis "Love in the Time of Money" is ...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    Im going to have to disagree with the previous...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

# Tokenizing

In [9]:
tokenizer = Tokenizer(num_words=5000)

In [10]:
tokenizer.fit_on_texts(imdb["review"])

In [11]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'was': 12,
 'as': 13,
 'with': 14,
 'for': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 'on': 19,
 'not': 20,
 'you': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'be': 25,
 'he': 26,
 'one': 27,
 'its': 28,
 'at': 29,
 'all': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'from': 35,
 'like': 36,
 'so': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'about': 41,
 'has': 42,
 'out': 43,
 'if': 44,
 'some': 45,
 'what': 46,
 'there': 47,
 'good': 48,
 'more': 49,
 'very': 50,
 'when': 51,
 'even': 52,
 'up': 53,
 'no': 54,
 'my': 55,
 'would': 56,
 'she': 57,
 'time': 58,
 'only': 59,
 'which': 60,
 'really': 61,
 'their': 62,
 'see': 63,
 'were': 64,
 'story': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'than': 69,
 'much': 70,
 'we': 71,
 'well': 72,
 'been': 73,
 'get': 74,
 'will': 75,
 'into': 76,
 'other': 77,
 'great': 78,
 'do': 79,
 'bad': 80,
 'because': 81,
 'people': 8

In [12]:
sequence = tokenizer.texts_to_sequences(imdb["review"])

In [13]:
len(sequence[0])

257

In [18]:
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

# padding 

In [14]:
maxlen=0
for i in sequence:
    if len(i)>maxlen:
        maxlen=len(i)
maxlen

1970

In [15]:
len(sequence[0])

257

In [16]:
pd_sequences = pad_sequences(sequence,maxlen=200,padding="post")

In [17]:
len(pd_sequences[0])

200

# Train Test Spliting

In [18]:
y=imdb.iloc[:,-1]

In [19]:
len(pd_sequences)

50000

In [20]:
x_train, x_test, y_train, y_test = train_test_split(pd_sequences, y, test_size=0.2, random_state=42)

In [21]:
len(x_train), len(y_train), len(x_test), len(y_test)

(40000, 40000, 10000, 10000)

# Definging model LSTM

In [22]:
model = Sequential()
model.add(Embedding(input_dim=5000,output_dim=128,input_length=200))  # Length of input sequences
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [23]:
batch_size=128
model.build(input_shape=(batch_size, 200))
model.summary()

In [24]:
#compiling model
model.compile(optimizer='adam',loss="binary_crossentropy",metrics=["accuracy"])

In [25]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) 

In [26]:
model.fit(x_train,y_train,epochs=10,batch_size=128,validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 391ms/step - accuracy: 0.5525 - loss: 0.6790 - val_accuracy: 0.5767 - val_loss: 0.6536
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 375ms/step - accuracy: 0.6330 - loss: 0.6262 - val_accuracy: 0.8048 - val_loss: 0.4749
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 594ms/step - accuracy: 0.7665 - loss: 0.5176 - val_accuracy: 0.6075 - val_loss: 0.6278
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 1s/step - accuracy: 0.7024 - loss: 0.5598 - val_accuracy: 0.8055 - val_loss: 0.4768
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 1s/step - accuracy: 0.7977 - loss: 0.4662 - val_accuracy: 0.7868 - val_loss: 0.4730
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 1s/step - accuracy: 0.8217 - loss: 0.4256 - val_accuracy: 0.8161 - val_loss: 0.4506
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x27b09d8f390>

In [27]:
loss, accuracy = model.evaluate(x_test,y_test)
loss,accuracy

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 92ms/step - accuracy: 0.8726 - loss: 0.3087


(0.3061642348766327, 0.8759999871253967)

In [None]:
s="ishant"
s.replace("i","I").replace("s","S")

In [None]:
def predict_sentiment(review):
    review=review.replace("<br />","").replace("\'","").replace("-","").replace(",","").replace(":","").replace(".","").replace("(","").replace(")","")
    sequence=tokenizer.texts_to_sequences([review])
    pd_sequence=pad_sequences(sequence,maxlen=200,padding="post")
    prediction = model.predict(pd_sequence)
    return "Positive" if prediction[0][0]>=0.5 else "Negative"

In [None]:
review = "This movie was fantastic"
predict_sentiment(review)

In [None]:
review = "This movie was not very poor"
predict_sentiment(review)

# Saving Model

In [None]:
model.save("sentiment_predictor.h5")

In [None]:
tokenizer.save("tokenizer.h5")

# TO retrieve model

In [None]:
from tensorflow.keras.models import load_model

In [None]:
model_loaded = load_model("sentiment_predictor.h5")

In [None]:
model_loaded.summary()