In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 82% 21.0M/25.7M [00:00<00:00, 72.5MB/s]
100% 25.7M/25.7M [00:00<00:00, 71.9MB/s]


In [None]:
!unzip /content/imdb-dataset-of-50k-movie-reviews.zip

Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
import pandas as pd
reviews = pd.read_csv('/content/IMDB Dataset.csv')
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Preprocess the reviews

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
nltk.download('punkt')
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:

import re

def preprocess_text(text):

  # Remove HTML tags
  text = re.sub('<[^<]+?>', '', text)

  # Convert to lowercase
  text = text.lower()

  # Tokenize the text
  tokens = nltk.word_tokenize(text)

  # Remove stopwords and punctuation
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

  # Lemmatize the words
  tokens = [lemmatizer.lemmatize(token) for token in tokens]

  # Join the tokens back into a string
  processed_text = ' '.join(tokens)

  return processed_text


In [None]:
text = reviews['review']
processed_text = text.apply(preprocess_text)

In [None]:
reviews['processed_review'] = processed_text
reviews.head()

Unnamed: 0,review,sentiment,processed_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode h...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


#### Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Tokenizer=Tokenizer(num_words=5000)
Tokenizer.fit_on_texts(processed_text)

In [None]:
tokens = Tokenizer.texts_to_sequences(processed_text)

In [None]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(Tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
#calculate the average length of a review

total = 0
count = 0
for token in tokens:
  total += len(token)
  count += 1
avg = total/count
print(avg)

96.2949


In [None]:
x = pad_sequences(tokens , maxlen=100)
x.shape

(50000, 100)

In [None]:
labels = reviews['sentiment']
encoded_labels = [1 if label == "positive" else 0 for label in labels]
encoded_labels[3]

0

In [None]:
import numpy as np
y = np.array(encoded_labels)
y.shape

(50000,)

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x ,y ,test_size=0.2 ,random_state=42)

In [None]:
X_train.shape

(40000, 100)

#### Build the model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dropout

model=Sequential([
    Input(shape=(100,)),
    Embedding(5000,100,input_length=100),
    LSTM(128,return_sequences=True),
    Dropout(0.4),
    LSTM(64),
    Dropout(0.2),
    Dense(2,activation='softmax')
])



In [None]:
model.summary()

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile( loss=SparseCategoricalCrossentropy()
, optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train,y_train, epochs=20,
          batch_size=32,
          validation_data=(X_test,y_test)
)

Epoch 1/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.8000 - loss: 0.4142 - val_accuracy: 0.8659 - val_loss: 0.3573
Epoch 2/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 12ms/step - accuracy: 0.9008 - loss: 0.2496 - val_accuracy: 0.8753 - val_loss: 0.2900
Epoch 3/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9279 - loss: 0.1898 - val_accuracy: 0.8787 - val_loss: 0.2938
Epoch 4/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.9446 - loss: 0.1542 - val_accuracy: 0.8740 - val_loss: 0.3376
Epoch 5/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.9595 - loss: 0.1177 - val_accuracy: 0.8657 - val_loss: 0.4381
Epoch 6/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.9706 - loss: 0.0859 - val_accuracy: 0.8616 - val_loss: 0.4344
Epoc

<keras.src.callbacks.history.History at 0x7bff77e1a020>

In [None]:
model.save("my_sentiment_model.keras")

In [None]:
prediction = model.predict(X_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [None]:
print(y_test[0])

1


In [None]:
print(prediction[0])

[9.999751e-01 2.495733e-05]


In [None]:
print(np.argmax(prediction[0]))

0


In [None]:
def preprocess_given_text(text):
    sequence = Tokenizer.texts_to_sequences([text])

    padded_sequence = pad_sequences(sequence, maxlen=100)

    return padded_sequence

In [None]:
import matplotlib.pyplot as plt
import io
import base64

def predict_sentiment(text):
   padded_sequence = preprocess_given_text(text)

   prediction = model.predict(padded_sequence)

   if prediction[0][0] > prediction[0][1]:
       sentiment = "Negative"
       probability = prediction[0][0]
       return sentiment, (f"Negative {probability}, \n Positive: {1 - probability}")
   else:
       sentiment = "Positive"
       probability = prediction[0][1]
       return sentiment, (f"Positive {probability}, \n Negative: {1 - probability}")


In [None]:
predict_sentiment("This film had a creative plot, the plot is so well made.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


('Positive', 'Positive 0.9919330477714539, \n Negative: 0.008066952228546143')

In [None]:
predict_sentiment("Inventive, gorgeously animated, and powerfully moving")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


('Positive',
 'Positive 0.9997363686561584, \n Negative: 0.00026363134384155273')

In [None]:
predict_sentiment("It’s so earnest, bringing notes of freshness and innocence to a prequel that, by all rights, shouldn’t have had any. ")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


('Positive',
 'Positive 0.9998492002487183, \n Negative: 0.00015079975128173828')