Building Our Own Sentiment Analyzer  
Data  
https://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html  

To read  
 - Interpretation of Logistic Coefficients

In [1]:
import nltk 
import numpy as np 
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression  
import pandas as pd 

In [2]:
#importing beautifulSoup for XML parsing
from bs4 import BeautifulSoup

In [3]:
#Creating one WordNet lemmatizer instance (the class itself was imported in the first cell);
#the tokenizer defined later calls wordnet_lemmatizer.lemmatize on each token
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
#Loading the stopword list: every line of the file becomes one entry of the set
#(trailing whitespace/newline stripped). The context manager closes the file
#handle as soon as the set has been built instead of leaving it open.
with open('./data/stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

In [7]:
#Reading positive reviews: parse the file, then keep only the <review_text> elements.
#The context manager closes the file handle once its contents have been read.
with open('./data/sentiment_analyzer/electronics/positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read(), "lxml")
#find_all is the current name of this method; findAll is only kept as a legacy alias.
positive_reviews = positive_reviews.find_all('review_text')

In [8]:
#Sample of positive reviews: slicing with [:1] displays at most the first element
positive_reviews[:1]

[<review_text>
 I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.
 
 I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.
 
 As always, Amazon had it to me in &lt;2 business days
 </review_text>]

In [9]:
#Reading negative reviews: parse the file, then keep only the <review_text> elements.
#The context manager closes the file handle once its contents have been read.
with open('./data/sentiment_analyzer/electronics/negative.review') as review_file:
    negative_reviews = BeautifulSoup(review_file.read(), "lxml")
#find_all is the current name of this method; findAll is only kept as a legacy alias.
negative_reviews = negative_reviews.find_all('review_text')

In [10]:
#Sample of negative reviews: slicing with [:1] displays at most the first element
negative_reviews[:1]

[<review_text>
 cons
 tips extremely easy on carpet and if you have a lot of cds stacked at the top
 
 poorly designed, it is a vertical cd rack that doesnt have individual slots for cds, so if you want a cd from the bottom of a stack you have basically pull the whole stack to get to it
 
 putting it together was a pain, the one i bought i had to break a piece of metal just to fit it in its guide holes.
 
 again..poorly designed... doesnt even fit cds that well, there are gaps, and the cd casses are loose fitting
 
 pros
 ..........
 i guess it can hold a lot of cds....
 </review_text>]

In [11]:
#Printing the container type of each review collection (positive first, then negative)
for reviews in (positive_reviews, negative_reviews):
    print(type(reviews))

<class 'bs4.element.ResultSet'>
<class 'bs4.element.ResultSet'>


In [12]:
#Shuffling the positive reviews in place and truncating them to the number of
#negative reviews, so both classes are the same size and the classifier is not biased.
#A dedicated, seeded RandomState keeps the selected subset identical on every
#Restart & Run All without touching NumPy's global random state.
np.random.RandomState(42).shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]


In [13]:
#Creating Dictionary(Most important Code)

def my_tokenizer(s):
    """Turn a raw review string into a list of normalized tokens.

    Steps, in order: lowercase the text, split it with NLTK's word
    tokenizer, drop tokens of length 2 or less, lemmatize each remaining
    token with the module-level ``wordnet_lemmatizer``, and finally drop
    every lemma that is present in the module-level ``stopwords`` set.

    Parameters
    ----------
    s : str
        Review text to tokenize.

    Returns
    -------
    list of str
        Surviving lemmas in their original order (duplicates are kept).
    """
    lemmas = (
        wordnet_lemmatizer.lemmatize(word)
        for word in nltk.tokenize.word_tokenize(s.lower())
        if len(word) > 2
    )
    # The stopword check is applied to the lemma, not to the raw token.
    # NOTE(review): a word shortened by lemmatization may no longer match its
    # stopword entry -- confirm against the contents of stopwords.txt.
    return [lemma for lemma in lemmas if lemma not in stopwords]

positive_tokenized = []
negative_tokenized = []

word_index_map = {}
current_index = 0

# Tokenize every review (positive ones first, then negative ones), remember the
# token list of each review, and give every previously unseen token the next
# free integer index in word_index_map.
for reviews, tokenized in (
    (positive_reviews, positive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in reviews:
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue  # already has an index
            word_index_map[token] = current_index
            current_index += 1
            

In [14]:
# Vectorizing all the reviews according to the word -> index map (word_index_map)
def tokens_to_vector(tokens, label):
    """Convert one tokenized review into a normalized bag-of-words row.

    The returned vector has ``len(word_index_map) + 1`` entries: one relative
    frequency per vocabulary word (count divided by the total number of
    tokens), followed by the class label in the last position.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review; every token must be a key of the
        module-level ``word_index_map``.
    label : number
        Class label stored in the last element of the vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(word_index_map) + 1``.

    Raises
    ------
    KeyError
        If a token is not present in ``word_index_map``.
    """
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    # The label slot is still 0 here, so the sum equals the number of tokens.
    total = x.sum()
    # A review with no tokens would otherwise be divided by zero and produce a
    # row full of NaN; leave its frequencies at zero instead.
    if total > 0:
        x = x / total
    x[-1] = label
    return x
    
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0

# Fill the matrix one row per review: all positive reviews (label 1) first,
# followed by all negative reviews (label 0). `i` is the next row to write.
for label, tokenized in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized:
        data[i, :] = tokens_to_vector(tokens, label)
        i += 1

In [15]:
# Rows = reviews; columns = one per vocabulary word plus the final label column
data.shape

(2000, 11092)

In [16]:
# Preview the first rows of the feature matrix; the last column holds the class label
pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11082,11083,11084,11085,11086,11087,11088,11089,11090,11091
0,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
#Shuffling the dataset rows in place so positive and negative reviews are mixed
#before splitting. A dedicated, seeded RandomState makes the split (and therefore
#the reported accuracy) reproducible across runs.
np.random.RandomState(42).shuffle(data)

#The last column is the label; every column before it is a feature.
X = data[:, :-1]
Y = data[:, -1]

#Holding out the last N_TEST shuffled rows for evaluation.
N_TEST = 100
Xtrain, Ytrain = X[:-N_TEST], Y[:-N_TEST]
Xtest, Ytest = X[-N_TEST:], Y[-N_TEST:]

In [24]:
#Fitting Logistic Regression on the training rows and printing the accuracy on the
#held-out rows. LogisticRegression is already imported in the first cell, so the
#duplicate import that used to sit here is not needed.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification Rate", model.score(Xtest, Ytest))

Classification Rate 0.78


In [25]:
#Listing every vocabulary word whose logistic-regression coefficient lies further
#than `threshold` from zero, in either direction
threshold = 0.5
coefficients = model.coef_[0]
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)
    

try -0.632834489656
quality 1.48316681818
bit 0.631458153886
love 1.20224822236
ha 0.692653358215
you 0.927154411809
returned -0.798111049778
comfortable 0.67699433972
returning -0.538321093051
paper 0.610456451134
stopped -0.531356631512
waste -0.997220626004
cable 0.57562817939
easy 1.71028892862
space 0.514029674704
this -0.505372769283
value 0.510722942708
n't -1.91667978498
laptop 0.519164633743
pretty 0.758727773898
fast 0.942121103523
wa -1.73732892904
return -1.18556940429
excellent 1.40768104498
junk -0.534164294678
tried -0.724317348768
money -1.09459245024
week -0.702914264907
highly 0.888158812949
unit -0.589855093288
expected 0.557581753837
called -0.505371445335
card -0.522243217123
happy 0.58221948179
little 0.994786240633
bad -0.766422975291
speaker 0.883976815401
item -0.949091620661
poor -0.784861264155
customer -0.637930421638
perfect 1.01623949028
warranty -0.612449647495
recommend 0.532743856313
then -1.1604944055
refund -0.575304636178
price 2.85886502676
using 0.

In [None]:
# Clear the interactive namespace once the analysis is done.
# NOTE(review): without the -f flag IPython asks for confirmation, which pauses a
# non-interactive "Run All" -- confirm whether this cell should stay in the notebook.
%reset