**Sentiment Analysis of IMDB Movie Reviews**

**Problem Statement:**

In this, we have to predict the number of positive and negative reviews based on sentiments by using different classification models.

**Import necessary libraries**

In [2]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import Levenshtein

import os
print(os.listdir("../SentimentAnalysis/input"))
import warnings
warnings.filterwarnings('ignore')

# Download necessary corpora
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')
nltk.download('stopwords')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../SentimentAnalysis/input'

**Import the training dataset**

In [None]:
#importing the training data
imdb_data=pd.read_csv('../SentimentAnalysis/input/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

**Exploratery data analysis**

In [None]:
#Summary of the dataset
imdb_data.describe()

**Sentiment count**

In [None]:
#sentiment count
imdb_data['sentiment'].value_counts()

We can see that the dataset is balanced.

**Spliting the training dataset**

In [None]:
#split the dataset  
#train dataset
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]
#test dataset
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

**Text normalization**

In [None]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

**Removing html strips and noise text**

In [None]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

**Removing special characters**

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

**Text stemming**

In [None]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

**Removing stopwords**

In [None]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

**Normalized train reviews**

In [None]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]
#convert dataframe to string
#norm_train_string=norm_train_reviews.to_string()
#Spelling correction using Textblob
#norm_train_spelling=TextBlob(norm_train_string)
#norm_train_spelling.correct()
#Tokenization using Textblob
#norm_train_words=norm_train_spelling.words
#norm_train_words

**Normalized test reviews**

In [None]:
#Normalized test reviews
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45005]
# #convert dataframe to string
# norm_test_string=norm_test_reviews.to_string()
# # spelling correction using Textblob
# norm_test_spelling=TextBlob(norm_test_string)
# print(norm_test_spelling.correct())
# # Tokenization using Textblob
# norm_test_words=norm_test_spelling.words
# norm_test_words

In [None]:
sid = SentimentIntensityAnalyzer()

def most_different_word_lexical(control_word, word_list):
    differences = {}
    for word in word_list:
        distance = Levenshtein.distance(control_word, word)
        differences[word] = distance

    # Return the word with the highest distance
    return max(differences, key=differences.get)
def get_synonym(word):
    """Get a synonym for a word while preserving its sentiment."""
    synonyms = wordnet.synsets(word)
    if not synonyms:
        return word 

    syns = []
    for synonym in synonyms:
        lemma = synonym.lemmas()[0].name()
        if lemma != word:  
            syns.append(lemma)
    if len(syns) == 0:
        return word
    return most_different_word_lexical(word, syns)

# Test with a sentence
sentence = "The movie is absolutely fantastic and thrilling."
tokens = tokenizer.tokenize(sentence)

augmented_tokens = [get_synonym(token) for token in tokens]
augmented_sentence = ' '.join(augmented_tokens)

print("Original Sentence:", sentence)
print("Augmented Sentence:", augmented_sentence)

In [None]:
augmented_train_reviews = []
for review in norm_train_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_train_reviews.append(' '.join([get_synonym(token) for token in tokens]))
    
augmented_test_reviews = []
for review in norm_test_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_test_reviews.append(' '.join([get_synonym(token) for token in tokens]))
    

In [None]:
output_path = "normal_train_reviews.csv"
aug_train_df = pd.DataFrame(norm_train_reviews)
aug_train_df.to_csv(output_path, index=False) 

output_path = "normal_test_reviews.csv"
aug_test_df = pd.DataFrame(norm_test_reviews)
aug_test_df.to_csv(output_path, index=False) 

output_path = "augmented_train_reviews.csv"
aug_train_df = pd.DataFrame(augmented_train_reviews)
aug_train_df.to_csv(output_path, index=False) 

output_path = "augmented_test_reviews.csv"
aug_test_df = pd.DataFrame(augmented_test_reviews)
aug_test_df.to_csv(output_path, index=False)

In [None]:
augmented_train_reviews[0]

In [None]:
norm_train_reviews[0]