In [12]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import nltk

### 1. Load Data and Preprocessing

In [2]:
# Load the raw IMDB reviews; later cells read and overwrite the `review` column.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [3]:
# Remove HTML markup (e.g. <br />) from every review.
# - regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
#   literal string by default, which would silently leave every tag in place.
# - Tags are replaced with a space rather than the empty string so that the
#   words on either side of a tag ("end.<br /><br />Next") are not fused
#   together; the stop-word cell's split()/join() collapses extra whitespace.
df['review'] = df['review'].str.replace(r'<.*?>', ' ', regex=True)

In [4]:
# Remove all punctuation and symbols, i.e. every character that is neither a
# word character nor whitespace.
# - The pattern is a raw string so that "\w" / "\s" are not parsed as
#   (invalid) Python string escapes.
# - regex=True is passed explicitly: pandas >= 2.0 defaults to literal
#   matching, which would make this line a silent no-op.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)

In [5]:
# Normalise case: convert every review to lowercase
df['review'] = df.review.str.lower()

In [None]:
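# One-time setup: uncomment and run once to download the spaCy model that the
# stop-word cell below loads with spacy.load('en_core_web_sm').
# !spacy download en_core_web_sm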

In [7]:
# Build the stop-word set used to filter the reviews
import spacy

# The pipeline is loaded here only to reach its default stop-word set.
sp = spacy.load('en_core_web_sm')

# Extra domain words added after inspecting the word counts.
# Note: `|=` extends spaCy's default set in place.
sp.Defaults.stop_words |= {'movie', 'film', 'like'}

# Same set object as sp.Defaults.stop_words, so it includes the extra words.
all_stopwords = sp.Defaults.stop_words

In [8]:
# Keep only the whitespace-separated tokens that are not stop words and
# re-join them with single spaces.
df['review'] = df['review'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in all_stopwords)
)

In [13]:
# Helpers for tokenizing and lemmatizing the reviews.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus
# to be installed (nltk.download('wordnet')) - confirm on a fresh environment.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [14]:
def lemmatize_text(text):
    """Tokenize `text` on whitespace and return the list of lemmatized tokens.

    Each token is passed to `lemmatizer.lemmatize` with its default settings.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [15]:
# Store the lemmatized token list of each cleaned review in a new column
df['lemma_review'] = df['review'].apply(lemmatize_text)

In [16]:
# Preview the first rows to check the cleaned text and the new lemma_review column
df.head()

Unnamed: 0,review,sentiment,lemma_review
0,reviewers mentioned watching 1 oz episode youl...,positive,"[reviewer, mentioned, watching, 1, oz, episode..."
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn..."
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically theres family little boy jake thinks...,negative,"[basically, there, family, little, boy, jake, ..."
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,..."


### 2. Sentiment Score

In [17]:
#Helper functions that expose TextBlob's subjectivity and polarity scores
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def getPolarity(text):
    """Return the TextBlob polarity score of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [18]:
#Add the TextBlob scores as two new columns, 'Subjectivity' and 'Polarity'
# NOTE(review): the scores are computed on the cleaned text (punctuation, case
# and stop words already removed by earlier cells); stop-word removal
# presumably drops negations such as "not" - confirm this is intended.
df['Subjectivity'] = df.review.apply(getSubjectivity)
df['Polarity'] = df.review.apply(getPolarity)

In [19]:
# Preview the first rows with the new Subjectivity and Polarity columns
df.head()

Unnamed: 0,review,sentiment,lemma_review,Subjectivity,Polarity
0,reviewers mentioned watching 1 oz episode youl...,positive,"[reviewer, mentioned, watching, 1, oz, episode...",0.522282,0.025685
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...",0.592222,0.122778
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...",0.692381,0.349048
3,basically theres family little boy jake thinks...,negative,"[basically, there, family, little, boy, jake, ...",0.471429,-0.105357
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...",0.442848,0.228697


**Note** - Subjectivity ranges from 0 (objective) to 1 (subjective); Polarity ranges from -1 (negative) to 1 (positive).

In [20]:
#Create a function to get sentiment scores
# Build the analyzer once and reuse it for every call: constructing a
# SentimentIntensityAnalyzer loads its lexicon, so creating a new one per
# review repeats that work for every row of the dataset.
_sia = SentimentIntensityAnalyzer()


def getSIA(text):
    """Return the VADER polarity scores for `text`.

    The result is the dict produced by `polarity_scores`; the notebook reads
    its 'compound', 'neg', 'neu' and 'pos' entries.
    """
    return _sia.polarity_scores(text)

In [22]:
#Get the sentiment scores for each review
# Iterate over the column's values directly instead of df['review'][i]:
# that form is a label lookup driven by a positional counter, so it only works
# while the index is the default 0..n-1 range and fails (or reads the wrong
# row) as soon as rows have been filtered, sorted or re-indexed.
sia_scores = [getSIA(review) for review in df['review']]

# compound: the sum of all lexicon ratings, normalized to lie between -1 and 1
compound = [scores['compound'] for scores in sia_scores]
neg = [scores['neg'] for scores in sia_scores]
pos = [scores['pos'] for scores in sia_scores]
neu = [scores['neu'] for scores in sia_scores]

In [23]:
#Attach the VADER scores to the data set, one column per score
# (targets are assigned left to right, so the column order is
# Compound, Negative, Neutral, Positive)
df['Compound'], df['Negative'], df['Neutral'], df['Positive'] = compound, neg, neu, pos

In [24]:
# Preview the first rows with the Compound, Negative, Neutral and Positive columns
df.head()

Unnamed: 0,review,sentiment,lemma_review,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,reviewers mentioned watching 1 oz episode youl...,positive,"[reviewer, mentioned, watching, 1, oz, episode...",0.522282,0.025685,-0.9948,0.347,0.568,0.085
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...",0.592222,0.122778,0.9153,0.091,0.66,0.249
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...",0.692381,0.349048,0.9666,0.143,0.508,0.348
3,basically theres family little boy jake thinks...,negative,"[basically, there, family, little, boy, jake, ...",0.471429,-0.105357,-0.945,0.271,0.663,0.066
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...",0.442848,0.228697,0.9871,0.034,0.675,0.291


In [28]:
#Write the scored data set to a CSV file, leaving out the row index
df.to_csv(path_or_buf='IMDB Dataset with Sentiment Score.csv', index=False)