**Sentiment Analysis of IMDB Movie Reviews**

**Problem Statement:**

In this notebook, we predict whether each IMDB movie review expresses a positive or a negative sentiment, using different classification models.

**Import necessary libraries**

In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import Levenshtein

# List the input directory so the cell output confirms which data files are present.
import os
print(os.listdir("../SentimentAnalysis/input"))
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Download necessary corpora
# wordnet    -> synonym lookup via nltk.corpus.wordnet
# vader_lexicon -> SentimentIntensityAnalyzer polarity scores
# omw-1.4    -> presumably required alongside wordnet by recent NLTK releases; TODO confirm
# stopwords  -> English stopword list
# The last call is left as a bare expression, so its return value is the cell output.
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')
nltk.download('stopwords')

['IMDB Dataset.csv']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Import the training dataset**

In [2]:
#importing the training data
#The path is relative to the notebook's working directory
imdb_data=pd.read_csv('../SentimentAnalysis/input/IMDB Dataset.csv')
#Print (rows, columns), then show the first 10 rows as the cell output
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


**Exploratory data analysis**

In [3]:
#Summary of the dataset
#For these text columns describe() reports count, unique, top and freq
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


**Sentiment count**

In [4]:
#sentiment count
#Number of reviews per sentiment label, used to check class balance
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

We can see that the dataset is balanced.

**Splitting the dataset into training and test sets**

In [5]:
#Positional split of the raw data: rows before position 40000 are used for
#training, rows from position 40000 onwards for testing
#train dataset
train_reviews=imdb_data['review'].iloc[:40000]
train_sentiments=imdb_data['sentiment'].iloc[:40000]
#test dataset
test_reviews=imdb_data['review'].iloc[40000:]
test_sentiments=imdb_data['sentiment'].iloc[40000:]
#Sanity check: reviews and labels must have matching lengths in each split
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


**Text normalization**

In [6]:
#Tokenization of text
#Shared tokenizer instance; remove_stopwords and the augmentation cells call tokenizer.tokenize
tokenizer=ToktokTokenizer()
#Setting English stopwords
#NLTK's English stopwords, stored as a list; remove_stopwords filters tokens against it
stopword_list=nltk.corpus.stopwords.words('english')

**Removing HTML tags and noisy text**

In [7]:
#Removing the html strips
def strip_html(text):
    """Return the text content of `text` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    text of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from `text`.

    A span starts at a ``[`` and ends at the next ``]``; an opening bracket
    with no closing bracket after it is left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        `text` without the bracketed spans.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions report with a warning.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Clean a raw review: strip HTML markup first, then drop ``[...]`` spans."""
    return remove_between_square_brackets(strip_html(text))
#Apply function on review column
#NOTE: this overwrites imdb_data['review'], so the raw text is no longer
#available from this frame and re-running the cell re-processes cleaned text
imdb_data['review']=imdb_data['review'].apply(denoise_text)

**Removing special characters**

In [8]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default True
        Accepted for interface compatibility but not applied: digits are
        always kept, whatever value is passed.
        NOTE(review): decide whether digits should really be removed by
        default before wiring this flag in, since that would change the
        normalized reviews.

    Returns
    -------
    str
        `text` with all other characters deleted (not replaced by spaces).
    """
    # 'A-Z', not 'A-z': the range 'A-z' also covers the ASCII characters that
    # sit between 'Z' and 'a' ([ \ ] ^ _ `), so they slipped through the filter.
    pattern=r'[^a-zA-Z0-9\s]'
    return re.sub(pattern,'',text)
#Apply function on review column
#Called with the default arguments; overwrites imdb_data['review'] with the filtered text
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

**Text stemming**

In [9]:
#Stemming the text
def simple_stemmer(text):
    """Porter-stem every whitespace-separated word of `text`.

    Words are obtained with ``str.split()`` and re-joined with single spaces,
    so the original spacing is not preserved.
    """
    stemmer=nltk.porter.PorterStemmer()
    stemmed_words=map(stemmer.stem,text.split())
    return ' '.join(stemmed_words)
#Apply function on review column
#Overwrites imdb_data['review'] with the stemmed text
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

**Removing stopwords**

In [10]:
#set stopwords to english
#NOTE(review): `stop` is only printed in this cell; remove_stopwords below
#looks tokens up in stopword_list, not in this set
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from `text` and return the remaining tokens joined by spaces.

    The text is tokenized with the module-level `tokenizer` and each token is
    checked against the module-level `stopword_list`.

    Parameters
    ----------
    text : str
        Text to filter.
    is_lower_case : bool, default False
        When True, tokens are compared to the stopwords as they are. When
        False, each token is lower-cased for the comparison only; kept tokens
        retain their original casing.

    Returns
    -------
    str
        The non-stopword tokens, in their original order, joined by single spaces.
    """
    # Build a set once per call: hash lookups instead of scanning the whole
    # stopword list for every token. The filtering result is unchanged.
    stopword_set = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)
#Apply function on review column
#NOTE(review): this runs after the stemming cell, so tokens are compared in
#their stemmed form; stems such as 'thi' and 'wa' remain in the normalized
#reviews shown below
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

{'or', 'needn', 'below', 'not', 'by', 'i', 'he', 'those', 'these', 'were', 'out', "she's", 'it', 'y', 'only', 'and', 'at', 'itself', 'herself', 'do', 'into', 'be', 'was', 'then', "hasn't", 'there', 'you', 'been', 'hadn', 'wasn', 'why', 'nor', 'that', 'have', 'before', 's', 'both', 'most', 'should', "you've", 'to', 'now', 'd', 'off', "should've", 'between', 'such', 'about', 'yourself', 'of', 'o', "doesn't", 'few', 'other', 'the', 'once', 'hers', 'its', 'ourselves', 're', 've', 'won', 'own', 'who', 'each', 'theirs', 'me', 'they', 'ours', 'all', 'while', 'can', "wasn't", 'don', "mightn't", 'our', 'themselves', 'had', 'himself', 'above', 'some', "weren't", 'she', 'through', "shouldn't", 'as', "it's", 'whom', 'ma', 'hasn', 'down', "haven't", "won't", 'shan', 'has', 'over', 'mustn', 'myself', "you'd", 'his', 'weren', 'during', 'how', 'on', 'him', 'so', 'doesn', 'too', 'their', 'your', 'having', "don't", 'mightn', 'my', 'for', 'couldn', 'because', 'aren', 'which', 'yourselves', 'her', 'will',

**Normalized train reviews**

In [11]:
#normalized train reviews
#Slice [:40000] of the cleaned review column, then display the review with label 0
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]
#Disabled experiment, kept for reference: TextBlob spelling correction and tokenization
#convert dataframe to string
#norm_train_string=norm_train_reviews.to_string()
#Spelling correction using Textblob
#norm_train_spelling=TextBlob(norm_train_string)
#norm_train_spelling.correct()
#Tokenization using Textblob
#norm_train_words=norm_train_spelling.words
#norm_train_words

'one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

**Normalized test reviews**

In [12]:
#Normalized test reviews
#Slice [40000:] of the cleaned review column; the slice keeps the original
#index labels, so the lookup below is by label 45005
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45005]
# Disabled experiment, kept for reference: TextBlob spelling correction and tokenization
# #convert dataframe to string
# norm_test_string=norm_test_reviews.to_string()
# # spelling correction using Textblob
# norm_test_spelling=TextBlob(norm_test_string)
# print(norm_test_spelling.correct())
# # Tokenization using Textblob
# norm_test_words=norm_test_spelling.words
# norm_test_words

'read review watch thi piec cinemat garbag took least 2 page find somebodi els didnt think thi appallingli unfunni montag wasnt acm humour 70 inde ani era thi isnt least funni set sketch comedi ive ever seen itll till come along half skit alreadi done infinit better act monti python woodi allen wa say nice piec anim last 90 second highlight thi film would still get close sum mindless drivelridden thi wast 75 minut semin comedi onli world semin realli doe mean semen scatolog humour onli world scat actual fece precursor joke onli mean thi handbook comedi tit bum odd beaver niceif pubesc boy least one hand free havent found playboy exist give break becaus wa earli 70 way sketch comedi go back least ten year prior onli way could even forgiv thi film even made wa gunpoint retro hardli sketch clown subtli pervert children may cut edg circl could actual funni come realli quit sad kept go throughout entir 75 minut sheer belief may save genuin funni skit end gave film 1 becaus wa lower scoreand

In [45]:
# Difference between the compound polarity scores of two near-synonyms.
# The analyzer is created in this cell so that it runs on a fresh kernel
# without relying on a cell further down having been executed first.
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('annoying')['compound'] - sid.polarity_scores('irritating')['compound']

0.056900000000000006

In [65]:
# Sentiment analyzer (from nltk.sentiment.vader) used by get_synonym_polar below
sid = SentimentIntensityAnalyzer()

def most_different_word_lexical(control_word, word_list):
    """Pick the entry of `word_list` that is furthest from `control_word`.

    Distance is ``Levenshtein.distance``. When several candidates share the
    largest distance, the one that occurs first in `word_list` is returned.

    Raises
    ------
    ValueError
        If `word_list` is empty (raised by ``max``).
    """
    distance_by_word = {
        candidate: Levenshtein.distance(control_word, candidate)
        for candidate in word_list
    }
    return max(distance_by_word, key=distance_by_word.get)
def get_synonym(word):
    """Replace `word` with the WordNet lemma that is spelled most differently.

    For every synset WordNet returns for `word`, the name of its first lemma
    is a candidate; candidates equal to `word` are discarded. The candidate
    with the largest edit distance to `word` is returned (see
    most_different_word_lexical). No sentiment check is performed here.

    Returns `word` unchanged when WordNet has no synsets for it or when no
    candidate differs from it.
    """
    first_lemmas = [synset.lemmas()[0].name() for synset in wordnet.synsets(word)]
    candidates = [name for name in first_lemmas if name != word]
    if not candidates:
        return word
    return most_different_word_lexical(word, candidates)

def get_synonym_polar(word):
    """Replace `word` with a WordNet lemma whose polarity score is close to its own.

    Candidates are the first lemma name of each synset of `word`, excluding
    `word` itself. Polarity is the 'compound' value of ``sid.polarity_scores``.

    The first candidate whose score differs from the original by less than
    0.1 is returned immediately. If none qualifies, the candidate with the
    smallest score difference is returned. `word` is returned unchanged when
    WordNet has no synsets for it or when there are no candidates.
    """
    synsets = wordnet.synsets(word)
    if not synsets:
        return word

    base_score = sid.polarity_scores(word)['compound']
    gap_by_lemma = {}
    for synset in synsets:
        candidate = synset.lemmas()[0].name()
        if candidate == word:
            continue
        gap = abs(base_score - sid.polarity_scores(candidate)['compound'])
        gap_by_lemma[candidate] = gap
        # Close enough to the original polarity: take it without looking further
        if gap < 0.1:
            return candidate

    if not gap_by_lemma:
        return word
    # Nothing within the tolerance: fall back to the closest candidate seen
    return min(gap_by_lemma, key=gap_by_lemma.get)
# Test with a sentence
# Hard-coded, already-normalized review text used as a smoke test for get_synonym_polar
sentence = 'one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort viewingthat get touch darker side'
tokens = tokenizer.tokenize(sentence)

# Substitute each token independently, then rebuild the sentence with single spaces
augmented_tokens = [get_synonym_polar(token) for token in tokens]
augmented_sentence = ' '.join(augmented_tokens)

# Print both versions so they can be compared in the cell output
print("Original Sentence:", sentence)
print("Augmented Sentence:", augmented_sentence)

Original Sentence: one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack stree

In [14]:
# Lexical augmentation: every token of every normalized review is replaced by
# get_synonym(token); one augmented string is produced per review.
augmented_train_reviews = []
for review in norm_train_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_train_reviews.append(' '.join(map(get_synonym, tokens)))

# Same substitution for the test reviews
augmented_test_reviews = []
for review in norm_test_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_test_reviews.append(' '.join(map(get_synonym, tokens)))
    

In [50]:
# Polarity-aware augmentation: every token is replaced by get_synonym_polar(token).
# Results go into the dedicated *_p lists. Appending to augmented_train_reviews /
# augmented_test_reviews, as before, left the *_p lists empty and mixed
# polarity-based rows into the lexical augmentation lists.
augmented_train_reviews_p = []
for review in norm_train_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_train_reviews_p.append(' '.join([get_synonym_polar(token) for token in tokens]))

augmented_test_reviews_p = []
for review in norm_test_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_test_reviews_p.append(' '.join([get_synonym_polar(token) for token in tokens]))


0

In [66]:
# Show the first five augmented training reviews
augmented_train_reviews[0:5]

['matchless reappraisal hour_angle citation lookout one oz episod youll bait right_field thi exactli find methamphetamine beginning matter strike oz Washington barbarous unflinch view violenc stage_set right_field news Adam faith thi display dim center diffident thi display wrench punch attentiveness drug sexual_activity violenc hardcor authoritative function wordit Call oz nicknam give Oswald utmost secur state_of_matter penitentari concentrate mainli emerald citi experiment part prison cellular_telephone field_glass front_man expression inbound privaci senior_high_school agenda em citi dwelling manyaryan Muslim gangsta Latin_American Christian Italian Irish moreso scuffl Death gaze dodgi batch shadi agreement never Army_for_the_Liberation_of_Rwanda awayi would state chief entreaty display ascribable fact goe display wouldnt make_bold forget pretti pictur key mainstream audienc forget appeal forget romanceoz doesnt fix about beginning episod always proverb strike nasti Washington phan

In [40]:
# --- Normalized (non-augmented) reviews, written without the index column ---
output_path = "normal_train_reviews.csv"
aug_train_df = norm_train_reviews.to_frame()
aug_train_df.to_csv(output_path, index=False)

output_path = "normal_test_reviews.csv"
aug_test_df = norm_test_reviews.to_frame()
aug_test_df.to_csv(output_path, index=False)

# --- Lexically augmented reviews: list -> Series named 'review' -> one-column frame ---
output_path = "augmented_train_reviews.csv"
aug = pd.Series(augmented_train_reviews, name='review')
print(aug.shape)
aug_train_df = aug.to_frame()
print(aug_train_df.shape)
aug_train_df.to_csv(output_path, index=False)

output_path = "augmented_test_reviews.csv"
aug = pd.Series(augmented_test_reviews, name='review')
aug_test_df = aug.to_frame()
aug_test_df.to_csv(output_path, index=False)

(40000,)
(40000, 1)


In [51]:
# --- Polarity-aware augmented reviews: list -> Series named 'review' -> one-column frame ---
output_path = "augmented_train_reviews_p.csv"
aug = pd.Series(augmented_train_reviews_p, name='review')
print(aug.shape)
aug_train_df = aug.to_frame()
print(aug_train_df.shape)
aug_train_df.to_csv(output_path, index=False)

output_path = "augmented_test_reviews_p.csv"
aug = pd.Series(augmented_test_reviews_p, name='review')
aug_test_df = aug.to_frame()
aug_test_df.to_csv(output_path, index=False)

(40000,)
(40000, 1)


In [34]:
# Preview `aug`. The name is reassigned by each CSV-export step, so what is
# shown depends on which export cell ran last.
aug.head()

0    matchless recapitulation hour_angle note deter...
1    curiosity littl intersection movie techniqu ve...
2    remember thi Washington curiosity direction sp...
3    BASIC famili littl male_child jake remember au...
4    petter mattei sleep_together fourth_dimension ...
Name: review, dtype: object

In [32]:
# Preview the normalized training reviews, for comparison with the augmented text
norm_train_reviews.head()

0    one review ha mention watch 1 oz episod youll ...
1    wonder littl product film techniqu veri unassu...
2    thought thi wa wonder way spend time hot summe...
3    basic famili littl boy jake think zombi hi clo...
4    petter mattei love time money visual stun film...
Name: review, dtype: object

In [57]:
# Preview `aug` again. Same expression as the earlier preview; the content
# depends on which export cell assigned `aug` most recently.
aug.head()

0    beginning privation state tilt liber polit pla...
1    Washington excit understand situation_comedy w...
2    expression screen take material entir disagree...
3    wish mani consider look denni hop-picker brand...
4    thi movi Washington television sidereal_day di...
Name: review, dtype: object

In [60]:
# Inspect the synsets WordNet returns for 'one'. The last entry
# (matchless.s.01) appears to be why 'one' becomes 'matchless' in the
# lexically augmented reviews.
wordnet.synsets('one')


[Synset('one.n.01'),
 Synset('one.n.02'),
 Synset('one.s.01'),
 Synset('one.s.02'),
 Synset('one.s.03'),
 Synset('one.s.04'),
 Synset('one.s.05'),
 Synset('one.s.06'),
 Synset('matchless.s.01')]

In [19]:
# Ask the shell to echo a BEL character; presumably an audible "run finished" cue (TODO confirm).
# NOTE(review): `echo -e` is a Unix-shell idiom; behaviour under other shells has not been verified here.
os.system("echo -e '\a'")

0