# Sentiment Analyzer
Download the data from this [review](http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html), [stop words](http://www.lextek.com/manuals/onix/stopwords1.html)
Inspired from this [NLP Course](https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python)

In [1]:
# Import packages
from future.utils import iteritems
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,LinearRegression
from bs4 import BeautifulSoup

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
# Load the stopwords
# You can also use an alternative source of stopwords:
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stopwords = stopwords.words('english')
stopwords = set(w.rstrip() for w in open('./tmp/dataset/stopwords.txt'))

In [4]:
# Load the reviews
positive_reviews = BeautifulSoup(open('./tmp/dataset/sorted_data_acl/electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

In [5]:
negative_reviews = BeautifulSoup(open('./tmp/dataset/sorted_data_acl/electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [6]:
# There arte more positive reviews than negative reviews
# So let's take a random sample so we have balanced classes
# np.random.shuffle(positive_reviews)
# positive_reviews = positive_reviews[:len(negative_reviews)]

In [7]:
# We can also oversample the negative reviews
diff = len(positive_reviews) - len(negative_reviews)
idxs = np.random.choice(len(negative_reviews), size=diff)
extra = [negative_reviews[i] for i in idxs]
negative_reviews += extra
print("Length of positive reviews is: ", len(positive_reviews))
print("Length of negative reviiews is: ", len(negative_reviews))
print('Diff is: ', diff)

Length of positive reviews is:  1000
Length of negative reviiews is:  1000
Diff is:  0


In [8]:
t = positive_reviews[0]
nltk.tokenize.word_tokenize(t.text)

['I',
 'purchased',
 'this',
 'unit',
 'due',
 'to',
 'frequent',
 'blackouts',
 'in',
 'my',
 'area',
 'and',
 '2',
 'power',
 'supplies',
 'going',
 'bad',
 '.',
 'It',
 'will',
 'run',
 'my',
 'cable',
 'modem',
 ',',
 'router',
 ',',
 'PC',
 ',',
 'and',
 'LCD',
 'monitor',
 'for',
 '5',
 'minutes',
 '.',
 'This',
 'is',
 'more',
 'than',
 'enough',
 'time',
 'to',
 'save',
 'work',
 'and',
 'shut',
 'down',
 '.',
 'Equally',
 'important',
 ',',
 'I',
 'know',
 'that',
 'my',
 'electronics',
 'are',
 'receiving',
 'clean',
 'power',
 '.',
 'I',
 'feel',
 'that',
 'this',
 'investment',
 'is',
 'minor',
 'compared',
 'to',
 'the',
 'loss',
 'of',
 'valuable',
 'data',
 'or',
 'the',
 'failure',
 'of',
 'equipment',
 'due',
 'to',
 'a',
 'power',
 'spike',
 'or',
 'an',
 'irregular',
 'power',
 'supply',
 '.',
 'As',
 'always',
 ',',
 'Amazon',
 'had',
 'it',
 'to',
 'me',
 'in',
 '<',
 '2',
 'business',
 'days']

In [9]:
# let's create a function that does some pre-processing for us
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

In [10]:
# create a word-to-index map so that we can create our word-frequency vectors later
# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [11]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            
print(len(orig_reviews))
print(len(positive_tokenized))

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

1000
1000


In [12]:
print("len(word_index_map):", len(word_index_map))
print(len(orig_reviews))

len(word_index_map): 11078
2000


In [13]:
# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [14]:

N = len(positive_tokenized) + len(negative_tokenized)
print(N)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

2000


In [15]:

# shuffle the data and create train/test splits
# try it multiple times!
print(len(orig_reviews))
print(len(data))
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

2000
2000


In [16]:
model = LogisticRegression()
# from sklearn.ensemble import AdaBoostClassifier
# model = AdaBoostClassifier()
model.fit(Xtrain, Ytrain)
print("Train Accuracy: ", model.score(Xtrain, Ytrain))
print("Test Accuracy: ", model.score(Xtest, Ytest))



Train Accuracy:  0.7894736842105263
Test Accuracy:  0.73


In [17]:
# let's look at the weights for each word
# try it with different threshold values! 
# This is used for logistic regresssion
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

unit -0.6857095404511567
bad -0.7751270487824288
cable 0.6617912018414011
time -0.7441274137679607
've 0.7887614816721545
month -0.7443900978808807
sound 1.0635737835662251
lot 0.7265657696167179
you 0.9648821927170496
n't -1.886744270201093
easy 1.8075354817028313
quality 1.3865465316642431
company -0.5264810190187412
item -0.9285575988458221
wa -1.5588968153619203
perfect 1.0168164134407585
fast 0.8925081978859608
ha 0.8231246504425964
price 2.768678345029652
value 0.507267675189047
money -1.1146868675682107
memory 0.9914582512972566
picture 0.5977102546395987
buy -0.8834390753979323
bit 0.6199125029091249
happy 0.6574491982865284
pretty 0.7171489748201827
doe -1.2405713999845693
highly 0.9655071461824608
recommend 0.6051997407180466
fit 0.533200648559974
customer -0.6813954386768624
support -0.8635750048213835
little 0.9555952489722748
sent -0.5056945005642455
returned -0.7989581125273378
excellent 1.382061291735626
love 1.2161679527203537
home 0.5673471842933068
week -0.72799208012

In [18]:
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.3597615153406063, pred = 0.0):

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review (prob = 0.6009433857348249, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

