<a href="https://colab.research.google.com/github/Mahadevan0507/Sentiment-Analysis-of-Movie-Reviews/blob/main/Sentiment_Analysis_of_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'imdb-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4815310%2F8143731%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240803%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240803T105716Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5e90bb3810ba0481011d0d48d9ac9fa7f95bd99ba4d087e66873440a5306ef69f980404fcb9d3c230e02137500533ba6baf07809a47072133da214fbe862854f67778dc8926373af962ff90b3c7beea1284eb2062a13216689ebeaff98c8320dd8a56c976d01b1c2660fc89f6af4be9065c9dc61a04a7ee09727b6cfd08a64652ccead06a7885e5e4121fd9aab7341fd8907d182b6c21411cde4b6cd3ecc18a691b176cda6894b4cfb0285f46a042fe3df51cdd808d4f6236cc03a68ac106c2f0b59e0657150f96b55907f2a2e91ac4150c855f152a13c8cf6407c9532024615e5e1bd98572929e6a29a642e47680efb97ae68b6f1231028569abfbd14c672b5'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every archive listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that cannot be fetched is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an entry whose URL part happens to
    # contain an unencoded ':' still unpacks into exactly two values.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be missing (it is then None) or
            # zero; treat both as "size unknown" instead of failing on
            # int(None) or dividing by zero in the progress computation.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the size is unknown.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise miss data still held in the write buffer.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not rebound at notebook scope.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Dataset: IMDb Movie Reviews

Skills Demonstrated:

* Text Cleaning and Preprocessing
* Exploratory Data Analysis on Text Data
* Natural Language Processing (NLP) Techniques
* Sentiment Classification (Naive Bayes, SVM, LSTM)
* Model Evaluation (Accuracy, Precision, Recall, F1 Score)

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# 2. Data Loading and Overview

# Load the dataset
df = pd.read_csv('/kaggle/input/imdb-movie-reviews/IMDB Dataset.csv')

# Display the first few rows of the dataframe.
# Only the last expression of a cell is rendered automatically, so this
# mid-cell preview has to go through display() to actually show up.
display(df.head())

# Basic information about the dataset (info() prints its report itself)
df.info()

# Check for missing values (last expression, rendered as the cell output)
df.isnull().sum()

In [None]:
# 3. Text Cleaning and Preprocessing

# Lower-case English function words that preprocess_text() drops before stemming.
stop_words = {
    # personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # forms of "be", "have" and "do"
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles
    'a', 'an', 'the',
    # conjunctions
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions and particles
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs of time, place and manner
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how',
    # quantifiers
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    # negation and degree words
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # leftover fragments and auxiliaries
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
}
# Single stemmer instance shared at notebook scope; preprocess_text() calls
# ps.stem() on every token that survives stop-word removal.
ps = PorterStemmer()

def preprocess_text(text):
    """Turn a raw review into a space-separated string of stemmed tokens.

    Steps: strip HTML tags, replace every non-letter with a space,
    lower-case, split on whitespace, drop tokens found in ``stop_words``
    and stem the rest with ``ps``.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing values reach .apply() as non-strings; re.sub would raise on them.
    if not isinstance(text, str):
        return ''
    # Replace HTML tags with a space rather than nothing, otherwise the words
    # on either side of a tag fuse together ("great<br />movie" -> "greatmovie").
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize on whitespace (also collapses the runs of spaces created above)
    words = text.split()
    # Remove stopwords and stem the words
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Run the cleaning pipeline over every review and store the result next to the raw text.
df['cleaned_review'] = df['review'].map(preprocess_text)

# Side-by-side preview of raw and cleaned text for the first rows
df.loc[:, ['review', 'cleaned_review']].head()

In [None]:
# 4. Exploratory Data Analysis (EDA)

# Number of reviews per sentiment label
sentiment_ax = sns.countplot(x='sentiment', data=df)
sentiment_ax.set_title('Sentiment Distribution')
plt.show()

# Token count of each cleaned review, followed by its histogram
df['review_length'] = [len(review.split()) for review in df['cleaned_review']]
length_ax = sns.histplot(df['review_length'], bins=50, kde=True)
length_ax.set_title('Review Length Distribution')
plt.show()

In [None]:
# 5. Natural Language Processing (NLP) Techniques

# Both vectorizers read the same cleaned-text column.
cleaned_reviews = df['cleaned_review']

# Bag of Words: raw term counts, vocabulary capped at 5000 features
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(cleaned_reviews)

# TF-IDF: same vocabulary cap, counts re-weighted by inverse document frequency
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_reviews)

In [None]:
# 6. Sentiment Classification Models

# Tokenizer vocabulary size (also the Embedding input dimension) and the
# length every sequence is padded/truncated to. Each value is used in two
# places below, so they are named once here to keep the layers in agreement.
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 500

# Binary target: 1 for 'positive', 0 for anything else.
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data into training and testing sets
# X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
# X_train_tfidf, X_test_tfidf = train_test_split(X_tfidf, test_size=0.2, random_state=42)

## Naive Bayes
# nb_model = MultinomialNB()
# nb_model.fit(X_train_bow, y_train)
# y_pred_nb = nb_model.predict(X_test_bow)

## SVM
# svm_model = SVC(kernel='linear')
# svm_model.fit(X_train_tfidf, y_train)
# y_pred_svm = svm_model.predict(X_test_tfidf)

## LSTM
# Decide which rows belong to train / validation / test BEFORE fitting the
# tokenizer. Fitting it on every review would let word frequencies from the
# validation and test reviews shape the vocabulary (data leakage).
row_positions = list(range(len(df)))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)
train2_rows, val_rows = train_test_split(train_rows, test_size=0.4, random_state=42)

# The vocabulary is learned from the training rows only; all rows are then
# encoded with it (words outside the vocabulary are dropped by the tokenizer).
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['cleaned_review'].iloc[train2_rows])
X_seq = tokenizer.texts_to_sequences(df['cleaned_review'])
X_padded = pad_sequences(X_seq, maxlen=MAX_SEQUENCE_LENGTH)

# Materialise the splits from the row positions chosen above.
X_train_lstm, X_test_lstm = X_padded[train_rows], X_padded[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
X_train2_lstm, X_val_lstm = X_padded[train2_rows], X_padded[val_rows]
y_train2, y_val = y.iloc[train2_rows], y.iloc[val_rows]

# Embedding -> spatial dropout -> single LSTM layer -> sigmoid output.
lstm_model = Sequential()
lstm_model.add(Embedding(VOCAB_SIZE, 128, input_length=MAX_SEQUENCE_LENGTH))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = lstm_model.fit(X_train2_lstm, y_train2, epochs=1, batch_size=256, validation_data=(X_val_lstm, y_val))


In [None]:
# 7. Model Evaluation

def evaluate_model(y_test, y_pred):
    """Score predictions against the true labels.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` with the results of
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score``, in that order.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

## Naive Bayes Evaluation
# acc_nb, prec_nb, rec_nb, f1_nb = evaluate_model(y_test, y_pred_nb)
# print(f'Naive Bayes - Accuracy: {acc_nb}, Precision: {prec_nb}, Recall: {rec_nb}, F1 Score: {f1_nb}')

## SVM Evaluation
# acc_svm, prec_svm, rec_svm, f1_svm = evaluate_model(y_test, y_pred_svm)
# print(f'SVM - Accuracy: {acc_svm}, Precision: {prec_svm}, Recall: {rec_svm}, F1 Score: {f1_svm}')

## LSTM Evaluation
# The sigmoid output is a score per review; scores above 0.5 are labelled 1.
lstm_scores = lstm_model.predict(X_test_lstm)
y_pred_lstm = (lstm_scores > 0.5).astype("int32")
lstm_metrics = evaluate_model(y_test, y_pred_lstm)
acc_lstm, prec_lstm, rec_lstm, f1_lstm = lstm_metrics
print(f'LSTM - Accuracy: {acc_lstm}, Precision: {prec_lstm}, Recall: {rec_lstm}, F1 Score: {f1_lstm}')
