# Sentiment Analysis

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [3]:
# Location of the IMDB reviews CSV, relative to the notebook's working directory.
data_path = './res/data/imdb_dataset.csv'

# Read the file into a DataFrame and preview its first rows.
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# list the distinct labels that occur in the sentiment column
{label for label in df['sentiment']}

{'negative', 'positive'}

In [6]:
# Encode the sentiment labels as integers: 'positive' -> 1, anything else -> 0.
#
# The column is overwritten in place, so the conversion is skipped when it
# already holds integers. Without this guard, re-running the cell would compare
# the 0/1 values against 'positive' and silently reset every label to 0.
if not pd.api.types.is_integer_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].eq('positive').astype('int64')

# Same values as a plain Python list, kept under the name this cell has
# always defined.
liked = df['sentiment'].tolist()

In [8]:
# preview the first five rows after the label encoding
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
# Load NLTK's English stop-word list. If the corpus has never been downloaded
# on this machine the lookup raises LookupError, so fetch it once and retry.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

# de-duplicated stop words
stopset = set(english_stopwords)

# instantiate vectorizer
# stop_words is documented as 'english', a list, or None; scikit-learn
# releases that validate constructor parameters reject a set, so pass a list
# (sorted, to keep the order stable between runs).
vector = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset))

In [13]:
# turn the raw review text into a TF-IDF feature matrix
# NOTE(review): the vectorizer is fitted on every review here, before the
# train/test split, so its vocabulary and IDF weights also reflect the
# held-out reviews.
x = vector.fit_transform(df['review'])

# the label column is the prediction target
y = df['sentiment']

In [15]:
# target shape first, then feature-matrix shape, one per line
print(y.shape, x.shape, sep='\n')

(50000,)
(50000, 101865)


In [16]:
# create train/test datasets
# - random_state pins the shuffle so the split, and every metric computed from
#   it, is the same on each run of the notebook.
# - test_size=0.25 spells out the default hold-out fraction.
# - stratify=y keeps the proportion of each label equal in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y)

In [17]:
# build a multinomial naive Bayes classifier and train it on the training split
clf = MultinomialNB().fit(xtrain, ytrain)

# show the fitted estimator as the cell output
clf

MultinomialNB()

In [19]:
# predicted class probabilities for the test split (one column per class)
yprob = clf.predict_proba(xtest)

In [25]:
# second probability column; with the 0/1 label encoding this is the score for label 1
yprob[:, 1]

array([0.54294545, 0.58956153, 0.16126841, ..., 0.31515244, 0.53462388,
       0.20959944])

In [27]:
# area under the ROC curve on the test split, scored from the label-1
# probabilities (a ranking metric, not classification accuracy)
roc_auc_score(y_true=ytest, y_score=yprob[:, 1])

0.9419779450317185

In [28]:
# a few hand-written reviews to sanity-check the trained model
my_review = [
    'The Godfather was an amazing movie.',
    'I was a bit lukewarm about The Godfather 3.',
    'Marvel movies are generally received well.',
]

# reuse the already-fitted vectorizer so the new texts are mapped onto the
# same feature columns the classifier was trained on
myVector = vector.transform(my_review)

# one predicted label per review (1 = positive, 0 = negative)
print('prediction:', clf.predict(myVector))

prediciton: [1 0 1]
