#### Multi-Domain Sentiment Dataset
For More Details See: [Website](http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html)



In [1]:
import nltk 
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

# D:\Random ML Projects\Simple Sentiment Analysis Logistic Model\sorted_data_acl\electronics

In [2]:
# lemma: base word
wordnet_lemmatizer = WordNetLemmatizer()

# Sample Stop words
# from http://www.lextek.com/manuals/onix/stopwords1.html
# rstrip See Docs
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

stopwords

{'',
 'a',
 'about',
 'above',
 'across',
 'after',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'among',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyone',
 'anything',
 'anywhere',
 'are',
 'area',
 'areas',
 'around',
 'as',
 'ask',
 'asked',
 'asking',
 'asks',
 'at',
 'away',
 'b',
 'back',
 'backed',
 'backing',
 'backs',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'been',
 'before',
 'began',
 'behind',
 'being',
 'beings',
 'best',
 'better',
 'between',
 'big',
 'both',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'case',
 'cases',
 'certain',
 'certainly',
 'clear',
 'clearly',
 'come',
 'could',
 'd',
 'did',
 'differ',
 'different',
 'differently',
 'do',
 'does',
 'done',
 'down',
 'downed',
 'downing',
 'downs',
 'during',
 'e',
 'each',
 'early',
 'either',
 'end',
 'ended',
 'ending',
 'ends',
 'enough',
 'even',
 'evenly',
 'ever',
 'every',
 'everybody',
 'everyone',
 'everything',
 'e

### +ve and -ve Reviews

In [3]:


# load the reviews
positive_reviews = BeautifulSoup(open('D:\Random ML Projects\Simple Sentiment Analysis Logistic Model\sorted_data_acl\electronics/positive.review').read(),'lxml')
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('D:\Random ML Projects\Simple Sentiment Analysis Logistic Model\sorted_data_acl\electronics/negative.review').read(),'lxml')
negative_reviews = negative_reviews.findAll('review_text')


In [4]:
# there are more positive reviews than negative reviews
# so let's take a random sample so we have balanced classes
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [5]:
positive_reviews[:3]

[<review_text>
 This Uniden phone/answering system has been trouble free since we bought it three months ago.  Here's a list of the other things about this phone that I like most:
 1.  The corded handset base unit can be used to make and receive calls even when the electricity is off.  This, along with price, was the main reason I bought this phone system.  This feature is unusual for a cordless phone system.
 2.  The controls on the answering machine are simple and intuitive enough so that you don't have to refer to the manual when you need to set the clock or change the outgoing recording.
 3.  Also, the answering machine has operated flawlessly so far.
 4.  The battery life of the cordless handset has been excellent between charges.  At least a week, sometimes more.
 5.  Voice quality of the cordless handset and the corded base handset are very good.
 
 However, there is something about this phone system that I don't like.
 --The cordless handset has a pretty limited range.  I had t

*first let's just try to tokenize the text using nltk's tokenizer
let's take the first review for example:*

In [6]:
t = positive_reviews[0]
nltk.tokenize.word_tokenize(t.text)


['This',
 'Uniden',
 'phone/answering',
 'system',
 'has',
 'been',
 'trouble',
 'free',
 'since',
 'we',
 'bought',
 'it',
 'three',
 'months',
 'ago',
 '.',
 'Here',
 "'s",
 'a',
 'list',
 'of',
 'the',
 'other',
 'things',
 'about',
 'this',
 'phone',
 'that',
 'I',
 'like',
 'most',
 ':',
 '1',
 '.',
 'The',
 'corded',
 'handset',
 'base',
 'unit',
 'can',
 'be',
 'used',
 'to',
 'make',
 'and',
 'receive',
 'calls',
 'even',
 'when',
 'the',
 'electricity',
 'is',
 'off',
 '.',
 'This',
 ',',
 'along',
 'with',
 'price',
 ',',
 'was',
 'the',
 'main',
 'reason',
 'I',
 'bought',
 'this',
 'phone',
 'system',
 '.',
 'This',
 'feature',
 'is',
 'unusual',
 'for',
 'a',
 'cordless',
 'phone',
 'system',
 '.',
 '2',
 '.',
 'The',
 'controls',
 'on',
 'the',
 'answering',
 'machine',
 'are',
 'simple',
 'and',
 'intuitive',
 'enough',
 'so',
 'that',
 'you',
 'do',
 "n't",
 'have',
 'to',
 'refer',
 'to',
 'the',
 'manual',
 'when',
 'you',
 'need',
 'to',
 'set',
 'the',
 'clock',
 'o

notice how it doesn't downcase, so It != it
not only that, but do we really want to include the word "it" anyway?
you can imagine it wouldn't be any more common in a positive review than a negative review
so it might only add noise to our model.
so let's create a function that does all this pre-processing for us

In [7]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens


## word-to-index map
so that we can create our word-frequency vectors later
let's also save the tokenized versions so we don't have to tokenize again later

In [8]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

In [9]:
# Reviews Tokenized

# +ve 

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

print('+ veindexmap: ',len(word_index_map))
print('After +ve:',current_index)
print(len(positive_tokenized))
# -ve 
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1


print('-ve indexmap: ',len(word_index_map))
print('After -ve:',current_index)
print(len(negative_tokenized))

+ veindexmap:  7560
After +ve: 7560
1000
-ve indexmap:  11082
After -ve: 11082
1000


In [10]:
print(current_index)

11082


In [11]:
# now let's create our input matrices

def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        #print(i)
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    
    return x

In [12]:
# total tokens
N = len(positive_tokenized) + len(negative_tokenized)
print(N)
print(len(word_index_map))

2000
11082


In [13]:
data = np.zeros((N, len(word_index_map) + 1))

In [14]:
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [17]:
# shuffle the data and create train/test splits
# try it multiple times!
np.random.shuffle(data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))

model.coef_[0]

Classification rate: 0.71


array([ 0.04040749,  0.0052488 ,  0.33425694, ..., -0.02437702,
       -0.02437702, -0.01010531])

In [16]:
# let's look at the weights for each word
# try it with different threshold values!



threshold = 0.5
for word, index in (word_index_map).items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

ha 0.590195505418
month -0.824051177174
unit -0.919491741945
price 2.83978614139
wa -1.8034481463
feature 0.50771967563
n't -2.11379150262
excellent 1.44632167659
week -0.786199047561
quality 1.53000486545
pretty 0.657576034538
buy -0.921493043647
video 0.57280657481
time -0.645708033936
sound 1.17261671991
home 0.537151645027
easy 1.85511908527
comfortable 0.680849055742
picture 0.658036564498
look 0.616684030444
little 0.957516284817
bad -0.761206004103
lot 0.792064055857
fit 0.514990602141
doe -1.27757172535
maybe -0.515748189939
try -0.679413341228
poor -0.809874315774
piece -0.519202889412
fast 0.969309515197
bit 0.682294511208
cheap -0.514401572359
've 0.763995303521
item -0.976915258409
company -0.557044456454
perfect 1.09572442808
highly 1.03688119928
paper 0.585026481305
using 0.759546336123
minute -0.509269938493
recommend 0.730217556076
speaker 0.892461221779
laptop 0.62819448549
customer -0.72262255593
happy 0.61533482338
returning -0.580717015395
hour -0.714652957748
sent 