<a href="https://colab.research.google.com/github/Harsh-C7/IMDB-Reviews-Sentimental-Analysis/blob/main/Setiment_analysis_on_IMDB_reviews_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install kaggle



In [25]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [26]:
# Read the Kaggle API credentials from the local kaggle.json and export them as
# environment variables. The file is opened in a `with` block so the handle is
# closed deterministically instead of being leaked.
with open('kaggle.json') as credentials_file:
  kaggle = json.load(credentials_file)
os.environ['KAGGLE_USERNAME'] = kaggle['username']
os.environ['KAGGLE_KEY'] = kaggle['key']

In [27]:
# Download the IMDB 50k movie reviews archive with the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [28]:
# List the working directory to check which files are present.
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [29]:
# Unpack every member of the downloaded archive into the current working directory.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", mode="r") as zip_ref:
    zip_ref.extractall()

In [30]:
# Load the reviews CSV from the working directory. A relative path is used
# because the archive is extracted into the current directory; the previous
# absolute "/content/..." path only resolved on Colab.
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [31]:
# Show the frame's dimensions as (rows, columns).
df.shape

(50000, 2)

In [32]:
# Summary statistics per column (count / unique / top / freq for text columns).
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [33]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [34]:
# Class balance: number of reviews per sentiment label.
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [35]:
# Replace the string sentiment labels with integer class ids.
# The fitted encoder is kept in `le` so ids can be mapped back to labels later.
le = LabelEncoder().fit(df['sentiment'])
df['sentiment'] = le.transform(df['sentiment'])

In [36]:
# Inspect the sentiment column after encoding.
df['sentiment']

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [37]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
)

In [38]:
# Fit the vocabulary on the training reviews only (capped via num_words=5000),
# then convert both splits to integer sequences padded/truncated to 200 tokens.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

x_train = pad_sequences(train_sequences, maxlen=200)
x_test = pad_sequences(test_sequences, maxlen=200)

In [39]:
# Peek at the padded training matrix (one row of token ids per review).
x_train

array([[3474,   13,  847, ...,   78,  547,  166],
       [   0,    0,    0, ...,  105, 3444,  176],
       [ 133,    6,  429, ...,  143,  155, 1198],
       ...,
       [ 195,  117,   32, ...,   27,    4,   91],
       [   0,    0,    0, ...,   19,   30,  125],
       [  38,   88, 2252, ...,   23,   30,    9]], dtype=int32)

In [40]:
# Target vectors: the encoded sentiment column of each split.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [41]:
# Embedding -> LSTM -> single sigmoid unit (binary classifier).
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 (it only triggers a warning) and the sequence length is taken from
# the padded input when the model is first called.
# NOTE(review): recurrent_dropout > 0 presumably prevents the fast cuDNN LSTM
# kernel from being used — confirm against the Keras LSTM docs if training
# speed matters.
model = Sequential([
  Embedding(input_dim=5000, output_dim=128),
  LSTM(128, dropout=0.2, recurrent_dropout=0.2),
  Dense(1, activation='sigmoid'),
])



In [42]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [43]:
# Train for 5 epochs, using the last 20% of the training data for validation.
# The returned History is bound to a name so the per-epoch metrics stay
# available (e.g. history.history) and its repr is not echoed as cell output.
history = model.fit(
  x_train,
  y_train,
  batch_size=128,
  epochs=5,
  validation_split=0.2,
)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 264ms/step - accuracy: 0.7200 - loss: 0.5387 - val_accuracy: 0.8489 - val_loss: 0.3530
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 260ms/step - accuracy: 0.8557 - loss: 0.3459 - val_accuracy: 0.8334 - val_loss: 0.3806
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 260ms/step - accuracy: 0.8762 - loss: 0.3075 - val_accuracy: 0.8471 - val_loss: 0.3641
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 257ms/step - accuracy: 0.8751 - loss: 0.3095 - val_accuracy: 0.8583 - val_loss: 0.3520
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 253ms/step - accuracy: 0.8756 - loss: 0.3089 - val_accuracy: 0.8669 - val_loss: 0.3373


<keras.src.callbacks.history.History at 0x7ac7f9b05f60>

In [44]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(x=x_test, y=y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step - accuracy: 0.8689 - loss: 0.3303
Test Loss: 0.3341352939605713
Test Accuracy: 0.8657000064849854


In [45]:
def predict_sentiment(review, threshold=0.5, maxlen=200):
  """Classify a single review text as "Positive" or "Negative".

  The text is tokenized with the module-level fitted `tokenizer`, padded to
  `maxlen` tokens with `pad_sequences`, and scored by the module-level `model`.

  Args:
    review: Raw review text (a single string).
    threshold: Score strictly above which the review is labelled "Positive";
      scores equal to or below it are labelled "Negative". Defaults to 0.5.
    maxlen: Sequence length passed to `pad_sequences`. Defaults to 200, the
      length used when the training data was prepared.

  Returns:
    The string "Positive" or "Negative".
  """
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=maxlen)
  # verbose=0 stops Keras from printing a progress bar for this one-row batch.
  prediction = model.predict(padded_sequence, verbose=0)
  return "Positive" if prediction[0][0] > threshold else "Negative"

In [46]:
# Smoke test the classifier on a short review with positive wording.
review_1 = "This is amazing move, I really liked this one."
sentiment = predict_sentiment(review=review_1)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step
The sentiment of the review is: Positive


In [47]:
# Smoke test the classifier on a short review with negative wording.
review_2 = "This is dumb movie, boring and was not good."
sentiment = predict_sentiment(review=review_2)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
The sentiment of the review is: Negative
