In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re

### Data Preparation 

In [49]:
data = pd.read_csv('Amazon Book Reviews.csv')
#data['sentiment'] = data.apply(lambda x: -1 if x['overall'] <= 2 else 0 if x['overall'] == 3 else 1, axis = 1)
data['sentiment'] = data.apply(lambda x: 0 if x['overall'] <= 3 else 1, axis = 1)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A15Q7ABIU9O9YZ,60554800,Larry Scantlebury,"[2,3]",This is my first GM Ford book and I will read ...,3,"Let's keep it real, not personal",1127606400,"09 25, 2005",0
1,AUIJDXNYVTEA8,60554800,Les Stockton,"[0,2]",I liked the story. I thought the book added a...,4,I liked it,1361923200,"02 27, 2013",1
2,A20N5GOON55TE9,60554800,lila,"[0,2]","As always, G.M. Ford does not disappoint. I st...",5,Good reading,1366761600,"04 24, 2013",1
3,A1CT8ENDZSYTX3,60554800,Lisa B.,"[1,2]",I love Ford's Leo Waterman series and the firs...,3,Science Fiction or Mystery?,1122249600,"07 25, 2005",0
4,A2SI6BNK5SWSMD,60554800,L. J. Roberts,"[2,2]",It was nice to see Corso working with the poli...,3,3.5 stars - Needed a better end.,1113004800,"04 9, 2005",0


In [50]:
comment = list(data['reviewText'])
sent = list(data['sentiment'])
comment_train, comment_test, sent_train, sent_test = train_test_split(comment, sent, test_size=0.2)

In [51]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [52]:
tokenizer = RegexpTokenizer(r'\w+')
nltk.download('stopwords')
en_stopwords = set(stopwords.words('english'))
#ps = PorterStemmer()
ps = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
def getCleanComment(review):
    review = str(review)
    review = review.lower()
    review = review.replace('"','')
    review = review.replace(';','')
    review = review.replace('_','')
    review = review.replace('-','')
    review = review.replace(',','')
    review = re.sub('\d', '', review)
    tokens = tokenizer.tokenize(review)
    new_tokens = [i for i in tokens if i not in en_stopwords]
    stemmed_tokens = [ps.stem(i) for i in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [54]:
comment_train = [getCleanComment(i) for i in comment_train]

### Unigram

In [55]:
cv = CountVectorizer()
comment_train_vec = cv.fit_transform(comment_train)
print(comment_train_vec.shape)
#cv.get_feature_names()

(5948, 31695)


### Multinomial NB

In [56]:
mnb = MultinomialNB()
mnb.fit(comment_train_vec,sent_train)

MultinomialNB()

#### Testing NB

In [57]:
comment_test = [getCleanComment(i) for i in comment_test]

In [58]:
comment_test_vec = cv.transform(comment_test)
print(comment_test_vec.shape)

(1488, 31695)


In [59]:
sentiment_prediction = mnb.predict(comment_test_vec)

In [60]:
from sklearn.metrics import accuracy_score
accuracy_score(sent_test, sentiment_prediction)

0.8057795698924731

### Bernaulli NB

In [61]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(comment_train_vec,sent_train)

BernoulliNB()

In [62]:
sentiment_prediction_bnb = bnb.predict(comment_test_vec)
accuracy_score(sent_test, sentiment_prediction_bnb)

0.7439516129032258

### SVM

In [63]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC(dual=False)
lsvm.fit(comment_train_vec,sent_train)



LinearSVC(dual=False)

In [64]:
sentiment_prediction_svm = lsvm.predict(comment_test_vec)
accuracy_score(sent_test, sentiment_prediction_svm)

0.7869623655913979

## Unigram + Bigram

In [65]:
cv = CountVectorizer(ngram_range=(1,2))
comment_train_vec = cv.fit_transform(comment_train)
print(comment_train_vec.shape)
#cv.get_feature_names()

(5948, 448955)


### Multinomial NB

In [66]:
mnb = MultinomialNB()
mnb.fit(comment_train_vec,sent_train)

MultinomialNB()

#### Testing NB

In [67]:
comment_test = [getCleanComment(i) for i in comment_test]
comment_test_vec = cv.transform(comment_test)
print(comment_test_vec.shape)

(1488, 448955)


In [68]:
from sklearn.metrics import accuracy_score

sentiment_prediction = mnb.predict(comment_test_vec)
accuracy_score(sent_test, sentiment_prediction)

0.7573924731182796

### Bernaulli NB

In [69]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(comment_train_vec,sent_train)
sentiment_prediction_bnb = bnb.predict(comment_test_vec)
accuracy_score(sent_test, sentiment_prediction_bnb)

0.7553763440860215

### SVM

In [70]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC(dual=False)
lsvm.fit(comment_train_vec,sent_train)
sentiment_prediction_svm = lsvm.predict(comment_test_vec)
accuracy_score(sent_test, sentiment_prediction_svm)



0.803763440860215