**Sentiment Analysis of IMDB Movie Reviews**

**Problem Statement:**

In this, we have to predict the number of positive and negative reviews based on sentiments by using different classification models.

**Import necessary libraries**

In [39]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
print(os.listdir("../Documents/input"))
import warnings
warnings.filterwarnings('ignore')

# Download necessary corpora
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')
nltk.download('stopwords')

['IMDB Dataset.csv']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngame\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Import the training dataset**

In [40]:
#importing the training data
imdb_data=pd.read_csv('../Documents/input/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


**Exploratery data analysis**

In [41]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


**Sentiment count**

In [42]:
#sentiment count
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

We can see that the dataset is balanced.

**Spliting the training dataset**

In [43]:
#split the dataset  
#train dataset
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]
#test dataset
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


**Text normalization**

In [44]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

**Removing html strips and noise text**

In [45]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

**Removing special characters**

In [46]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

**Text stemming**

In [47]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

**Removing stopwords**

In [48]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

{'this', 'needn', 'hasn', 'any', 'his', 'too', 'these', 'ain', 'were', 'under', 'had', 'than', "you're", 'yourself', 'all', 'while', 'have', 'below', 'ourselves', 'who', "hadn't", 'how', 'himself', 'm', "mightn't", 'such', 'should', 'i', 'hadn', 'mightn', 'above', 'not', 'shan', 'those', 'd', 'only', 'against', 'that', 'hers', "don't", "you'll", 'yours', 'being', 'as', 'our', 'up', "couldn't", 'some', "wasn't", 'be', 'did', 'couldn', 'it', 'won', 'themselves', 're', 'weren', 'same', 'your', 've', 'or', 'from', 'off', 'when', "aren't", 'yourselves', 'most', 'few', 'until', 'haven', "she's", 'out', 'then', 'can', 'wouldn', 'she', 'again', "you'd", 'both', 'didn', "it's", 'we', 'am', 's', 'an', 'so', "you've", 'but', 'once', 'nor', "won't", 'of', 'for', "doesn't", 'before', 'isn', 'don', 'by', 'through', "weren't", 'herself', 'with', 'during', 'if', 'on', "shouldn't", 'no', 'y', 'you', 't', 'was', "hasn't", 'is', 'why', 'about', "wouldn't", 'aren', 'what', 'o', 'just', 'myself', 'does', '

**Normalized train reviews**

In [49]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]
#convert dataframe to string
#norm_train_string=norm_train_reviews.to_string()
#Spelling correction using Textblob
#norm_train_spelling=TextBlob(norm_train_string)
#norm_train_spelling.correct()
#Tokenization using Textblob
#norm_train_words=norm_train_spelling.words
#norm_train_words

'one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

**Normalized test reviews**

In [57]:
#Normalized test reviews
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45005]
#convert dataframe to string
norm_test_string=norm_test_reviews.to_string()
# spelling correction using Textblob
norm_test_spelling=TextBlob(norm_test_string)
print(norm_test_spelling.correct())
# Tokenization using Textblob
norm_test_words=norm_test_spelling.words
norm_test_words

KeyboardInterrupt: 

In [51]:
sid = SentimentIntensityAnalyzer()
import Levenshtein

def most_different_word_lexical(control_word, word_list):
    differences = {}
    for word in word_list:
        distance = Levenshtein.distance(control_word, word)
        differences[word] = distance

    # Return the word with the highest distance
    return max(differences, key=differences.get)
def get_synonym(word):
    """Get a synonym for a word while preserving its sentiment."""
    synonyms = wordnet.synsets(word)
    if not synonyms:
        return word 

    syns = []
    for synonym in synonyms:
        lemma = synonym.lemmas()[0].name()
        if lemma != word:  
            syns.append(lemma)
    if len(syns) == 0:
        return word
    return most_different_word_lexical(word, syns)

# Test with a sentence
sentence = "The movie is absolutely fantastic and thrilling."
tokens = tokenizer.tokenize(sentence)

augmented_tokens = [get_synonym(token) for token in tokens]
augmented_sentence = ' '.join(augmented_tokens)

print("Original Sentence:", sentence)
print("Augmented Sentence:", augmented_sentence)

Original Sentence: The movie is absolutely fantastic and thrilling.
Augmented Sentence: The movie constitute absolutely antic and shudder .


In [59]:
augmented_reviews = []
for review in norm_train_reviews:
    tokens = tokenizer.tokenize(review)
    augmented_reviews.append([get_synonym(token) for token in tokens])



KeyboardInterrupt: 

In [60]:
' '.join(augmented_reviews[0])

'matchless recapitulation hour_angle note determine one oz episod youll overcharge veracious thi exactli find methamphetamine beginning matter fall_upon oz Washington barbarous unflinch picture violenc hardening veracious discussion function confidence thi appearance dim affection diffident thi appearance perpetrate punch attentiveness drug sexual_activity violenc hardcor authoritative manipulation wordit margin_call oz nicknam establish Oswald utmost secur Department_of_State penitentari concentrate mainli emerald citi experiment department prison cellular_telephone methamphetamine battlefront expression inbound privaci senior_high_school agenda em citi dwelling manyaryan Muslim gangsta Latin_American Christian Italian Irish moreso scuffl end gaze dodgi distribute shadi agreement never Army_for_the_Liberation_of_Rwanda awayi would pronounce independent solicitation appearance ascribable fact goe appearance wouldnt make_bold forget pretti pictur key mainstream audienc forget appeal for

In [62]:
norm_train_reviews[0]

'one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

**Bags of words model**

It is used to convert text documents to numerical vectors or bag of words.

In [None]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

**Term Frequency-Inverse Document Frequency model (TFIDF)**

It is used to convert text documents to  matrix of  tfidf features.

In [None]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)