In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


<div  style="border-radius: 10px; border: 2px solid #3498db; padding: 20px; background-color: #E8F6EF; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #17A05D; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;">One of the most common application of Natural Language Processing (NLP) is text classification especially sentiment analysis. Sentiment analysis of the IMDB dataset is the Hello World of NLP. So, let's deep dive into NLP.</h1>
</div>



<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Load IMDB dataset &#128221;</h1>
</u>

In [2]:
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
data.shape

(50000, 2)

In [4]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

<div  style="border-radius: 10px; border: 2px solid #3498db; padding: 20px; background-color: #E8F6EF; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #17A05D; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;">In this dataset their consists of 50000 moview reviews. Here, use only two sentiment positive and negative.</h1>
</div>


In [5]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Data cleaning. Remove HTML tags, punctuation, numbers and convert into lowercase.</h1>
</u>

In [6]:
import re
import string

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text



In [7]:
data['review'] = data['review'].apply(clean_text)


In [8]:
data.head()


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Define X as reviews and y as sentiment.</h1>
</u>

In [9]:
X = data['review']
y = data['sentiment']

<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Using TfidfVectorizer and Machine learning algorithm</h1>
</u>

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (40000,) (40000,)
Testing data shape: (10000,) (10000,)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF transformed training data shape:", X_train_tfidf.shape)
print("TF-IDF transformed testing data shape:", X_test_tfidf.shape)


TF-IDF transformed training data shape: (40000, 5000)
TF-IDF transformed testing data shape: (10000, 5000)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=100)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.8939
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Using LSTM Model</h1>
</u>

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPooling1D, Dropout, Flatten, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.models import Sequential


2024-08-22 21:07:47.001913: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-22 21:07:47.002126: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-22 21:07:47.203372: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)

y_test_encoded = label_encoder.transform(y_test)


In [15]:
token = Tokenizer(num_words=5000, oov_token='<OOV>')
token.fit_on_texts(X_train)


In [16]:
X_train_seq = token.texts_to_sequences(X_train)
X_test_seq = token.texts_to_sequences(X_test)

In [17]:
X_train_padded = pad_sequences(X_train_seq, padding='post', maxlen=200)
X_test_padded = pad_sequences(X_test_seq, padding='post', maxlen=200)

In [18]:
rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    LSTM(128, return_sequences=False),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

rnn_model.fit(X_train_padded, y_train_encoded, epochs=5, validation_data=(X_test_padded, y_test_encoded))



Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 289ms/step - accuracy: 0.5567 - loss: 0.6725 - val_accuracy: 0.6326 - val_loss: 0.5980
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 289ms/step - accuracy: 0.7961 - loss: 0.4257 - val_accuracy: 0.8811 - val_loss: 0.2846
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 290ms/step - accuracy: 0.9097 - loss: 0.2438 - val_accuracy: 0.8933 - val_loss: 0.2735
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 291ms/step - accuracy: 0.9325 - loss: 0.1942 - val_accuracy: 0.8865 - val_loss: 0.2695
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 288ms/step - accuracy: 0.9478 - loss: 0.1547 - val_accuracy: 0.8865 - val_loss: 0.2902


<keras.src.callbacks.history.History at 0x788b5b796c80>

In [19]:
rnn_model.summary()

<u>
    <h1 style="text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px;"><span style="color: red;"></span>Using RCNN Model</h1>
</u>

In [20]:
rcnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    Conv1D(128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rcnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rcnn_model.fit(X_train_padded, y_train_encoded, epochs=2, validation_data=(X_test_padded, y_test_encoded))
rcnn_model.summary()

Epoch 1/2
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 257ms/step - accuracy: 0.7192 - loss: 0.5029 - val_accuracy: 0.8751 - val_loss: 0.2944
Epoch 2/2
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 272ms/step - accuracy: 0.8948 - loss: 0.2575 - val_accuracy: 0.8923 - val_loss: 0.2642


In [21]:
rnn_loss, rnn_acc = rnn_model.evaluate(X_test_padded, y_test_encoded)
print(f'RNN Accuracy: {rnn_acc:.4f}')

rcnn_loss, rcnn_acc = rcnn_model.evaluate(X_test_padded, y_test_encoded)
print(f'RCNN Accuracy: {rcnn_acc:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 109ms/step - accuracy: 0.8850 - loss: 0.2995
RNN Accuracy: 0.8865
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.8899 - loss: 0.2701
RCNN Accuracy: 0.8923


In [22]:
best_model = rcnn_model  

sample_review = ["This movie was fantastic! A wonderful experience."]
sample_seq = token.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_seq, padding='post', maxlen=200)

prediction = best_model.predict(sample_padded)
print("Sentiment:", "Positive" if prediction[0] > 0.5 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432ms/step
Sentiment: Positive


In [23]:
best_model = rcnn_model  

sample_review = ["This movie was worst I ever seen!"]
sample_seq = token.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_seq, padding='post', maxlen=200)

prediction = best_model.predict(sample_padded)
print("Sentiment:", "Positive" if prediction[0] > 0.5 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Sentiment: Negative
