In [3]:
import pandas as pd
import numpy as np

###**Load Dataset**

In [4]:
df = pd.read_csv("/content/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


###**Encoding Sentiment column**

In [7]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
df.shape

(50000, 2)

In [10]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [11]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


##**Cleaning & Preprocessing Review Column**

In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [13]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
def preprocess_text(text):
  stemmer = PorterStemmer()
  stop_words = set(stopwords.words("english"))

  # Step 1: Lowercase
  text = text.lower()

  # Step 2: Remove Punctuations and Symbols
  text = re.sub(r"[^\w\s]", "", text)

  # Step 3: Remove numbers
  text = re.sub(r"\d+", "", text)

  # Step 4: Tokenize
  tokens = word_tokenize(text)

  # Step 5: Remove Stopwords
  tokens = [word for word in tokens if word not in stop_words]

  # Step 6: Stem
  tokens = [stemmer.stem(word) for word in tokens]

  # Join back into single strings
  cleaned_text = " ".join(tokens)

  return cleaned_text

In [15]:
preprocess_text("This is the first text entry, with punctuation! ,Here's the second sentence: it's filled with numbers like 2024 and symbols #AI.")

'first text entri punctuat here second sentenc fill number like symbol ai'

In [16]:
df['review'] = df['review'].apply(preprocess_text)

###**Performing Train Test Split**

In [17]:
X = df.iloc[:, 0:1]
y = df['sentiment']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

###**Applying BOW**

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv = CountVectorizer()

In [21]:
X_train_bow = cv.fit_transform(X_train["review"])
X_test_bow = cv.transform(X_test['review'])

In [22]:
X_train.shape

(40000, 1)

####**Multinomial Naive Bayes**

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
mb = MultinomialNB()

In [25]:
mb.fit(X_train_bow, y_train)

In [26]:
y_pred = mb.predict(X_test_bow)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [28]:
accuracy_score(y_test, y_pred)

0.855

In [29]:
confusion_matrix(y_test, y_pred)

array([[4322,  639],
       [ 811, 4228]])

##**TF-IDF**

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
tfidf = TfidfVectorizer()

In [32]:
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

####**MultinomialNB**

In [33]:
mb = MultinomialNB()
mb.fit(X_train_tfidf, y_train)

In [34]:
y_pred = mb.predict(X_test_tfidf)

In [35]:
accuracy_score(y_test, y_pred)

0.862

In [36]:
confusion_matrix(y_test, y_pred)

array([[4348,  613],
       [ 767, 4272]])

##**Deep Learning Approach**

In [50]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [51]:
# Tokenize the text
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(X['review'])
X_seq = tokenizer.texts_to_sequences(X['review'])
X_pad = pad_sequences(X_seq, maxlen = 100) # Pad sequences to the same length

In [52]:
print(f'Shape of X_pad: {X_pad.shape}')
print(f'Shape of y: {y.shape}')

Shape of X_pad: (50000, 100)
Shape of y: (50000,)


In [53]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size = 0.2, random_state = 42)

In [54]:
# Check the shapes of the resulting datasets
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (40000, 100)
Shape of X_test: (10000, 100)
Shape of y_train: (40000,)
Shape of y_test: (10000,)


##**RNN Model**

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [56]:
def build_rnn_model():
  model = Sequential()
  model.add(Embedding(input_dim = 5000, output_dim = 12, input_length = 100))
  model.add(SimpleRNN(64))
  model.add(Dense(1, activation = 'sigmoid'))
  return model

##**LSTM Model**

In [57]:
from tensorflow.keras.layers import LSTM

def build_LSTM_model():
  model = Sequential()
  model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
  model.add(LSTM(64))
  model.add(Dense(1, activation = 'sigmoid'))
  return model

##**GRU Model**

In [58]:
from tensorflow.keras.layers import GRU

def build_gru_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
    model.add(GRU(64))
    model.add(Dense(1, activation='sigmoid'))
    return model

##**Train the model**

In [59]:
def train_model(model, X_train, y_train):
  model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
  model.fit(X_train, y_train, epochs = 10, batch_size = 32, validation_split = 0.2)

##**Evaluate Model**

In [60]:
def evaluate_model(model, X_test, y_test):
  loss, accuracy = model.evaluate(X_test, y_test)
  print(f"Test Accuracy: {accuracy:.4f}")

In [61]:
rnn_model = build_rnn_model()
train_model(rnn_model, X_train, y_train)
evaluate_model(rnn_model, X_test, y_test)

Epoch 1/10




[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.5283 - loss: 0.6922 - val_accuracy: 0.6687 - val_loss: 0.6195
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6997 - loss: 0.5851 - val_accuracy: 0.8146 - val_loss: 0.4196
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.8288 - loss: 0.3891 - val_accuracy: 0.8201 - val_loss: 0.4100
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.8603 - loss: 0.3403 - val_accuracy: 0.6826 - val_loss: 0.6115
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.6964 - loss: 0.5814 - val_accuracy: 0.8266 - val_loss: 0.4074
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.8743 - loss: 0.3093 - val_accuracy: 0.8468 - val_loss: 0.3722
Epoch 7/10
[1m

In [62]:
# LSTM
lstm_model = build_LSTM_model()
train_model(lstm_model, X_train, y_train)
evaluate_model(lstm_model, X_test, y_test)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.7907 - loss: 0.4390 - val_accuracy: 0.8660 - val_loss: 0.3133
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9031 - loss: 0.2425 - val_accuracy: 0.8629 - val_loss: 0.3338
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.9249 - loss: 0.1957 - val_accuracy: 0.8612 - val_loss: 0.3364
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.9429 - loss: 0.1579 - val_accuracy: 0.8509 - val_loss: 0.3932
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9570 - loss: 0.1198 - val_accuracy: 0.8569 - val_loss: 0.4348
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.9674 - loss: 0.0945 - val_accuracy: 0.8550 - val_loss: 0.4513
Epoch 7/10
[

In [63]:
# GRU
gru_model = build_gru_model()
train_model(gru_model, X_train, y_train)
evaluate_model(gru_model, X_test, y_test)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.7679 - loss: 0.4645 - val_accuracy: 0.8652 - val_loss: 0.3393
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9000 - loss: 0.2497 - val_accuracy: 0.8635 - val_loss: 0.3212
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9303 - loss: 0.1840 - val_accuracy: 0.8651 - val_loss: 0.3456
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9502 - loss: 0.1382 - val_accuracy: 0.8635 - val_loss: 0.3918
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9668 - loss: 0.0986 - val_accuracy: 0.8559 - val_loss: 0.4271
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.9744 - loss: 0.0776 - val_accuracy: 0.8519 - val_loss: 0.5137
Epoch 7/10
[1m1

In [74]:
def make_predictions(model, new_reviews):
    new_reviews_cleaned = [preprocess_text(review) for review in new_reviews]
    new_sequences = tokenizer.texts_to_sequences(new_reviews_cleaned)
    new_padded = pad_sequences(new_sequences, maxlen=100)
    predictions = model.predict(new_padded)
    if (predictions > 0.5).astype(int):
        return "The Review is Positive"
    else:
        return "The Review is Negative"
    # return (predictions > 0.5).astype(int)  # Returns 1 for positive, 0 for negative

In [77]:
new_reviews = ["I Hate this movie, Worst movie ever made"]
predictions = make_predictions(lstm_model, new_reviews)
print(predictions)  # Output predictions for new reviews

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
The Review is Negative


In [78]:
new_reviews = ["I love this movie, Everything about this movie is good"]
predictions = make_predictions(lstm_model, new_reviews)
print(predictions)  # Output predictions for new reviews

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
The Review is Positive
