In [1]:
import numpy as np
import pandas as pd

# **Load IMDB dataset**

In [13]:
data = pd.read_csv('IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
data.shape

(50000, 2)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In this dataset their consists of 50000 movies reviews. Here, use only two sentiment positive and negative.

In [6]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


Data cleaning. Remove HTML tags, punctuation, numbers and convert into lowercase.


In [7]:
import re
import string

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text

In [8]:
data['review'] = data['review'].apply(clean_text)

In [10]:
data.head(20)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
5,probably my alltime favorite movie a story of ...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative ide...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


Define X as reviews and y as sentiment

In [14]:
X= data['review']
y= data['sentiment']

Using TfidfVectorizer and Machine learning algorithm

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (40000,) (40000,)
Testing data shape: (10000,) (10000,)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF transformed training data shape:", X_train_tfidf.shape)
print("TF-IDF transformed testing data shape:", X_test_tfidf.shape)

TF-IDF transformed training data shape: (40000, 5000)
TF-IDF transformed testing data shape: (10000, 5000)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=100)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8959
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



Using LSTM Model

In [18]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPooling1D, Dropout, Flatten, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.models import Sequential

In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)

y_test_encoded = label_encoder.transform(y_test)

In [21]:
token = Tokenizer(num_words=5000, oov_token='<OOV>')
token.fit_on_texts(X_train)

In [23]:
X_train_seq = token.texts_to_sequences(X_train)
X_test_seq = token.texts_to_sequences(X_test)

In [25]:
X_train_padded = pad_sequences(X_train_seq, padding='post', maxlen=200)
X_test_padded = pad_sequences(X_test_seq, padding='post', maxlen=200)

In [26]:
rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    LSTM(128, return_sequences=False),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

rnn_model.fit(X_train_padded, y_train_encoded, epochs=5, validation_data=(X_test_padded, y_test_encoded))

Epoch 1/5




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 364ms/step - accuracy: 0.5706 - loss: 0.6581 - val_accuracy: 0.7624 - val_loss: 0.5657
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m509s[0m 369ms/step - accuracy: 0.7013 - loss: 0.5809 - val_accuracy: 0.7218 - val_loss: 0.5546
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 371ms/step - accuracy: 0.8013 - loss: 0.4555 - val_accuracy: 0.8583 - val_loss: 0.3422
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m508s[0m 376ms/step - accuracy: 0.8728 - loss: 0.3242 - val_accuracy: 0.8703 - val_loss: 0.3140
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m495s[0m 370ms/step - accuracy: 0.8874 - loss: 0.2841 - val_accuracy: 0.8750 - val_loss: 0.3054


<keras.src.callbacks.history.History at 0x7aab89d976d0>

In [27]:
rnn_model.summary()

# **Using RCNN Model**

In [28]:
rcnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    Conv1D(128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rcnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rcnn_model.fit(X_train_padded, y_train_encoded, epochs=2, validation_data=(X_test_padded, y_test_encoded))
rcnn_model.summary()

Epoch 1/2




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m539s[0m 426ms/step - accuracy: 0.7403 - loss: 0.4975 - val_accuracy: 0.8898 - val_loss: 0.2665
Epoch 2/2
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m582s[0m 442ms/step - accuracy: 0.9089 - loss: 0.2367 - val_accuracy: 0.9004 - val_loss: 0.2440


In [29]:
rnn_loss, rnn_acc = rnn_model.evaluate(X_test_padded, y_test_encoded)
print(f'RNN Accuracy: {rnn_acc:.4f}')

rcnn_loss, rcnn_acc = rcnn_model.evaluate(X_test_padded, y_test_encoded)
print(f'RCNN Accuracy: {rcnn_acc:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 119ms/step - accuracy: 0.8738 - loss: 0.3077
RNN Accuracy: 0.8750
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 114ms/step - accuracy: 0.8981 - loss: 0.2440
RCNN Accuracy: 0.9004


In [39]:
best_model = rcnn_model

sample_review = ["This movie was adoreable! A great experience."]
sample_seq = token.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_seq, padding='post', maxlen=200)

prediction = best_model.predict(sample_padded)
print("Sentiment:", "Positive" if prediction[0] > 0.5 else "Negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Sentiment: Positive


In [40]:
best_model = rcnn_model

sample_review = ["This movie was worst I ever seen!"]
sample_seq = token.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_seq, padding='post', maxlen=200)

prediction = best_model.predict(sample_padded)
print("Sentiment:", "Positive" if prediction[0] > 0.5 else "Negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
Sentiment: Negative
