#  Amazon Product Review Sentiment Analysis

In [19]:
import gensim
import os
import re
import csv
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument
import datetime

##### We need a CSV file with two columns, one actual reviews and sentiment related (Positive, Negative)

In [71]:
def get_doc_list(folder_name):    
    with open(folder_name+'\\'+ 'ReviewSentiment.csv') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        doc_list = []
        sentiments = []
        i = 0
        for row in readCSV:
            if i > 0:
                doc_list.append(row[1])
                sentiments.append(row[2])
            i+=1
        print ('Found %s documents'%(len(doc_list)))
    return doc_list,sentiments

##### Below code will clean the reviews 

In [20]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
def clean_text(text):
 
    tokenizer = TweetTokenizer()
    en_stop = get_stop_words('en')
    stemmer = SnowballStemmer("english")

    wordslist = []
    tagslist = []

    # clean and tokenize document string
    raw = text.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # remove numbers
    number_tokens = [re.sub(r'[\d]', ' ', i) for i in stopped_tokens]
    number_tokens = ' '.join(number_tokens).split()

    # stem tokens
    stemmed_tokens = [stemmer.stem(i) for i in number_tokens]

    # remove empty
    length_tokens = [i for i in stemmed_tokens if len(i) > 1]

    # remove punctuations
    punct_tokens  = [''.join(c for c in s if c not in string.punctuation) for s in length_tokens]


    return punct_tokens

#### Testing the function

In [4]:
clean_text("""PS: Here I renamed the labeled data as "Tweets_NAg.csv" for simplisity.""")

['ps', 'renam', 'label', 'data', 'tweetsnag', 'csv', 'simplis']

#### Passing the directory of our file and loading the data

In [90]:
import pandas as pd
import random

rev, senti = get_doc_list('C:\\Users\\lalit\\Dropbox\\NEU_Curriculum\\SEM5-Spring2017\\BigData-Analytics\\Final_Project\\SentimentAnalysis\\reviews\\')

Found 3000 documents


In [91]:
df = pd.DataFrame({"Reviews":rev,"Sentiments":senti})
df = df[[u'Sentiments',u'Reviews']]
df.loc[:,'Reviews'] = df.loc[:,'Reviews'].map(clean_text)

In [70]:
#df.head

In [92]:
from gensim.models.doc2vec import TaggedDocument,Doc2Vec

## Traning and Testing
TotalNum = int(df.size/2)

TestNum = round(0.2 * TotalNum)
TrainNum=TotalNum-TestNum

documents = [TaggedDocument(list(df.loc[i,'Reviews']),[i]) for i in range(0,TotalNum)]

Doc2VecTrainID = list(range(0,TotalNum))

In [75]:
documents[1:10]

[TaggedDocument(words=['cool', 'grandkid', 'five', 'seven', 'love', 'sometim', 'talk', 'littl', 'fast', 'understood'], tags=[1]),
 TaggedDocument(words=['instrument', 'might', 'greatest', 'thing', 'sinc', 'slice', 'bread', 'app', 'make', 'useless', 'lucki', 'might', 'recogn', 'password', 'router', 'tell', 'menus', 'dead', 'end', 'go', 'back', 'amazon', 'music', 'default', 'readi', 'yo', 'send', 'back', 'appl', 'devic', 'look', 'rate', 'app'], tags=[2]),
 TaggedDocument(words=['love', 'expect'], tags=[3]),
 TaggedDocument(words=['echo', 'awesom', 'cant', 'wait', 'get', 'connect', 'devic', 'light', 'light', 'thermostat', 'etc', '', 'thing', 'great', 'use', 'time', 'lot', 'fun', 'actual', 'quit', 'surpris', 'well', 'take', 'command', 'answer', 'question', 'expect', 'understand', 'can', 'even', 'alexa', 'start', 'car', 'now', 'note', 'think', 'manufactur', 'support', 'yet'], tags=[4]),
 TaggedDocument(words=['like', 'product', 'last', 'now', 'gone', 'dark', 'unrespons', 'tri', 'unplug', 'p

In [93]:
random.shuffle(Doc2VecTrainID)

trainDoc = [documents[id] for id in Doc2VecTrainID]
Labels = df.loc[:,'Sentiments']

In [94]:
import multiprocessing
cores = multiprocessing.cpu_count()
model_DM = Doc2Vec(size=400, window=8, min_count=1, sample=1e-4, negative=5, workers=cores,  dm=1, dm_concat=1 )
model_DBOW = Doc2Vec(size=400, window=8, min_count=1, sample=1e-4, negative=5, workers=cores, dm=0)

In [95]:
model_DM.build_vocab(trainDoc)
model_DBOW.build_vocab(trainDoc)

In [96]:
for it in range(0,10):
    random.shuffle(Doc2VecTrainID)
    trainDoc = [documents[id] for id in Doc2VecTrainID]
    model_DM.train(trainDoc,total_examples=model_DM.corpus_count,epochs=model_DM.iter)
    model_DBOW.train(trainDoc,total_examples=model_DBOW.corpus_count,epochs=model_DBOW.iter)

In [38]:
# DBOW predicts a random group of words in a paragraph given only its paragraph vector 

model_DBOW.similar_by_word("music")

[('sheer', 0.2054240107536316),
 ('elsewher', 0.17939051985740662),
 ('involv', 0.1777307391166687),
 ('disast', 0.16461707651615143),
 ('predefin', 0.1643693447113037),
 ('headway', 0.1566038727760315),
 ('freeway', 0.15547911822795868),
 ('credit', 0.1547764390707016),
 ('plethora', 0.15466900169849396),
 ('regard', 0.15037207305431366)]

In [39]:
# DM attempts to predict a word given its previous words and a paragraph vector. 
# Even though the context window moves across the text, the paragraph vector does not (hence distributed memory)
# and allows for some word-order to be captured

model_DM.similar_by_word("music")

[('amazonmus', 0.7000535726547241),
 ('aerosmith', 0.6887083649635315),
 ('attenu', 0.6389185190200806),
 ('iheartradio', 0.6302682757377625),
 ('goof', 0.6256952285766602),
 ('catalog', 0.625339925289154),
 ('lf', 0.6242313981056213),
 ('pandora', 0.6158062815666199),
 ('spotifi', 0.613024115562439),
 ('beyonc', 0.6058138012886047)]

In [397]:
model_DM.most_similar(positive=['biggest','small'], negative=['big'], topn=5)

[('fuller', 0.6294357180595398),
 ('altitud', 0.5691038370132446),
 ('speed', 0.5684019923210144),
 ('qualiti', 0.5675342082977295),
 ('adequ', 0.5570660829544067)]

In [48]:
model_DBOW.wv.save_word2vec_format('trained.word2vec')

# Logistic Regression

In [398]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm

random.seed(45906)
newindex = random.sample(range(0,TotalNum),TotalNum)
testID = newindex[-TestNum:]
trainID = newindex[:-TestNum]

In [399]:

train_targets_LR, train_regressors_LR = zip(*[(Labels[id], list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id])) for id in trainID])
train_regressors_LR = sm.add_constant(train_regressors_LR)
#predictor = LogisticRegression(multi_class='multinomial',solver='lbfgs')
predictor = LogisticRegression()
predictor.fit(train_regressors_LR,train_targets_LR)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [400]:
accus=[]
accu=0
test_regressors_LR = [list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id]) for id in testID]
test_regressors_LR = sm.add_constant(test_regressors_LR)
test_predictions_LR = predictor.predict(test_regressors_LR)
for i in range(0,TestNum):
    if test_predictions_LR[i]==df.loc[testID[i],u'Sentiments']:
        accu=accu+1
accus=accus+[1.0*accu/TestNum]
confusionM = confusion_matrix(test_predictions_LR,(df.loc[testID,u'Sentiments']))

In [458]:
from collections import Counter
Counter(elem[0] for elem in (df.loc[testID,u'Sentiments']))

Counter({'N': 284, 'P': 2021})

In [401]:
confusionM


array([[   4,   10],
       [ 255, 2036]])

In [402]:
accus

[0.8850325379609545]

In [407]:
accus_train =[]
accu=0
train_predictions_LR = predictor.predict(train_regressors_LR)
for i in range(0,len(train_targets_LR)):
    if train_predictions_LR[i]==train_targets_LR[i]:
        accu=accu+1
accus_train =accus_train+[1.0*accu/len(train_targets_LR)]
confusionM_Train = confusion_matrix(train_predictions_LR,train_targets_LR)

In [459]:
from collections import Counter
Counter(elem[0] for elem in train_targets_LR)

Counter({'N': 1043, 'P': 8176})

In [408]:
confusionM_Train

array([[  17,   40],
       [1026, 8136]])

In [409]:
accus_train

[0.8843692374444083]

# Support Vector Machine (SVM)

In [410]:
from sklearn import svm
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm

random.seed(5705)
newindex = random.sample(range(0,TotalNum),TotalNum)
testID = newindex[-TestNum:]
trainID = newindex[:-TestNum]

In [411]:
train_targets_SVM, train_regressors_SVM = zip(*[(Labels[id], list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id])) for id in trainID])
train_regressors_SVM = sm.add_constant(train_regressors_SVM)

In [443]:
clf = svm.SVC()
#clf = svm.SVC(gamma=0.001, C=100)
clf = svm.SVC(C=1.0, kernel='linear', gamma='auto')
svmmodel = clf.fit(train_regressors_SVM,train_targets_SVM)

In [444]:
accus=[]
accu=0
test_regressors_SVM = [list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id]) for id in testID]
test_regressors_SVM = sm.add_constant(test_regressors_SVM)
test_predictions_SVM = svmmodel.predict(test_regressors_SVM)
for i in range(0,TestNum):
    if test_predictions_SVM[i]==df.loc[testID[i],u'Sentiments']:
        accu=accu+1
accus=accus+[1.0*accu/TestNum]
confusionM = confusion_matrix(test_predictions_SVM,(df.loc[testID,u'Sentiments']))

In [457]:
from collections import Counter
Counter(elem[0] for elem in (df.loc[testID,u'Sentiments']))

Counter({'N': 284, 'P': 2021})

In [445]:
confusionM

array([[   0,    0],
       [ 284, 2021]])

In [448]:
accus

[0.8767895878524946]

In [456]:
from collections import Counter
Counter(elem[0] for elem in train_targets_SVM)

Counter({'N': 1018, 'P': 8201})

In [449]:
accus_train =[]
accu=0
train_predictions_SVM = svmmodel.predict(train_regressors_SVM)
for i in range(0,len(train_targets_SVM)):
    if train_predictions_SVM[i]==train_targets_SVM[i]:
        accu=accu+1
accus_train =accus_train+[1.0*accu/len(train_targets_SVM)]
confusionM_Train = confusion_matrix(train_predictions_SVM,train_targets_SVM)

In [450]:
confusionM_Train

array([[   0,    0],
       [1018, 8201]])

In [451]:
accus_train

[0.8895758759084499]

# SGD Classifier

In [460]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm

random.seed(49526)
newindex = random.sample(range(0,TotalNum),TotalNum)
testID = newindex[-TestNum:]
trainID = newindex[:-TestNum]

In [461]:
train_targets_SGD, train_regressors_SGD = zip(*[(Labels[id], list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id])) for id in trainID])
train_regressors_SGD = sm.add_constant(train_regressors_SGD)

In [462]:
lr = SGDClassifier(loss='log', penalty='l1')
modelSGD = lr.fit(train_regressors_SGD, train_targets_SGD)

#print 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test)

In [470]:
test_regressors_SGD = [list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id]) for id in testID]
test_regressors_SGD = sm.add_constant(test_regressors_SGD)
test_predictions_SGD = modelSGD.predict(test_regressors_SGD)

In [472]:
accus=[]
accu=0
test_predictions_SGD = modelSGD.predict(test_regressors_SGD)
for i in range(0,TGD
        accu=accu+1
accus=accus+[1.0*accu/TestNum]
confusionM = confusion_matrix(test_predictions_SGD,(df.loc[testID,u'Sentiments']))

In [473]:
confusionM

array([[   7,   22],
       [ 258, 2018]])

In [474]:
accus

[0.8785249457700651]

In [480]:
accus_train =[]
accu=0
train_predictions_SGD = svmmodel.predict(train_regressors_SGD)
for i in range(0,len(train_targets_SGD)):
    if train_predictions_SGD[i]==train_targets_SGD[i]:
        accu=accu+1
accus_train =accus_train+[1.0*accu/len(train_targets_SGD)]
confusionM_Trai = confusion_matrix(train_predictions_SGD,train_targets_SGD)

In [481]:
confusionM_Trai

array([[   0,    0],
       [1037, 8182]])

In [483]:
accus_train

[0.8875149148497667]

# Deep Belief Network

In [80]:
import pickle

In [97]:
target_DBN, dataset_DBN = zip(*[(Labels[id], list(model_DM.docvecs[id])+list(model_DBOW.docvecs[id])) for id in range(TotalNum)])

In [98]:
w = open('val', "wb")#Open the file
pickle.dump(target_DBN, w,protocol=2)#Dump the dictionary bok, the first parameter into the file object w.
w.close()

In [99]:
w = open('data', "wb")#Open the file
pickle.dump(dataset_DBN, w,protocol=2)#Dump the dictionary bok, the first parameter into the file object w.
w.close()

#### Due to compatability issues we will pass these data into a Python Virtual Environment with Python 2.7.3 and binaries installed
#### Below code is for the code completeness purpose and work only with Oython 2.7.3

In [None]:
#python -m idlelib.idle
import pickle

## loading the data
w1=open("C:\\Users\\lalit\\Documents\\val", 'rb') #Open the file
targets =pickle.load(w1) #Assign the recreated object to bok	
print "Loaded the targets"

w2=open("C:\\Users\\lalit\\Documents\\data", 'rb') #Open the file
dataset = pickle.load(w2) #Assign the recreated object to bok
print "Loaded the dataset"


from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from nolearn.dbn import DBN
import numpy as np
#import cv2

(trainX, testX, trainY, testY) = train_test_split(dataset, targets, test_size = 0.30)
X = np.array(trainX)
text_X = np.array(testX)

# train the Deep Belief Network with number of columns of all reviews (800) in our case (len(trainx[1]), 400 hidden units, 2 output units 
dbn = DBN(
	[X.shape[1], 400, 2],
	learn_rates = 0.01,
	learn_rate_decays = 0.2,
	epochs = 12,
	verbose = 1)
dbn.fit(X, trainY)

# compute the predictions for the test data and show a classification
# report
preds = dbn.predict(text_X)
print classification_report(testY, preds)