In [24]:
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
from textblob.sentiments import NaiveBayesAnalyzer
import nltk

In [25]:
import os
os.chdir(os.getcwd())

In [26]:
import pytreebank
import sys

dataset = pytreebank.load_sst()

In [27]:
out_path = os.path.join(sys.path[0], 'sst_{}.txt')
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]
            ))

In [28]:
train = pd.read_csv('sst_train.txt', sep='\t',header=None, names=['sentiment_label', 'sentence'])
train.head()

Unnamed: 0,sentiment_label,sentence
0,__label__4,The Rock is destined to be the 21st Century 's...
1,__label__5,The gorgeously elaborate continuation of `` Th...
2,__label__4,Singer/composer Bryan Adams contributes a slew...
3,__label__3,You 'd think by now America would have had eno...
4,__label__4,Yet the act is still charming here .


### Train Model

In [69]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity

In [70]:
train['textblob_score'] = train['sentence'].apply(textblob_score)
train['textblob_pred'] = pd.cut(train['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
train = train.drop('textblob_score', axis=1)
train.head()

Unnamed: 0,sentiment_label,sentence,textblob_pred
0,__label__4,The Rock is destined to be the 21st Century 's...,4
1,__label__5,The gorgeously elaborate continuation of `` Th...,4
2,__label__4,Singer/composer Bryan Adams contributes a slew...,3
3,__label__3,You 'd think by now America would have had eno...,3
4,__label__4,Yet the act is still charming here .,5


In [86]:
data = pd.read_csv('Amazon Book Reviews.csv', usecols=[0,1,2,3,5,6])
data.head()

Unnamed: 0,reviewerID,asin,Title,reviewerName,reviewText,overall
0,A15Q7ABIU9O9YZ,60554800,Red Tide,Larry Scantlebury,This is my first GM Ford book and I will read ...,3
1,AUIJDXNYVTEA8,60554800,Red Tide,Les Stockton,I liked the story. I thought the book added a...,4
2,A20N5GOON55TE9,60554800,Red Tide,lila,"As always, G.M. Ford does not disappoint. I st...",5
3,A1CT8ENDZSYTX3,60554800,Red Tide,Lisa B.,I love Ford's Leo Waterman series and the firs...,3
4,A2SI6BNK5SWSMD,60554800,Red Tide,L. J. Roberts,It was nice to see Corso working with the poli...,3


In [87]:
reviews = list(data['reviewText'])

In [88]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [89]:
tokenizer = RegexpTokenizer(r'\w+')
nltk.download('stopwords')
en_stopwords = set(stopwords.words('english'))
ps = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [90]:
def getCleanComment(review):
    review = str(review)
    review = review.lower()
    review = review.replace('"','')
    review = review.replace(';','')
    review = review.replace('_','')
    review = review.replace('-','')
    review = review.replace(',','')
    review = re.sub('\d', '', review)
    tokens = tokenizer.tokenize(review)
    new_tokens = [i for i in tokens if i not in en_stopwords]
    stemmed_tokens = [ps.stem(i) for i in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [91]:
preprocessed_reviews = [getCleanComment(i) for i in reviews]

In [95]:
data['preprocessed_reviews'] = preprocessed_reviews
data['textblob_score'] = data["preprocessed_reviews"].apply(textblob_score)
data['textblob_pred'] = pd.cut(data['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
data.tail()

Unnamed: 0,reviewerID,asin,Title,reviewerName,reviewText,overall,preprocessed_reviews,textblob_score,textblob_pred
7431,A27D52RUOADQVR,60582006,"A Good Yarn (Blossom Street, No. 2)",Mikew,Love getting to know each of their lives. She ...,5,lov getting know liv creat charact mind help b...,0.0,3
7432,A93B8XMH93D1R,60582006,"A Good Yarn (Blossom Street, No. 2)","miss demeaner ""BOOK SMARTS""","UNLIKE SOME OF REVIEWERS ON HERE, I WON'T END ...",2,unlik review end typing ent chapt opinion get ...,0.321667,4
7433,A2JEMVYOM5IFU0,60582006,"A Good Yarn (Blossom Street, No. 2)",Norah E. Sempelsz,"I could not put this book down, I was so mired...",5,could put book mired liv peopl visit blossom s...,0.125,3
7434,AX64AMAD1RTMI,60582006,"A Good Yarn (Blossom Street, No. 2)",Robin Wren,"I have read Debbie Macomber's ""The Shop on Blo...",4,read debbi macomb shop blossom street thought ...,0.421157,4
7435,A1Y0Y74B0POAA6,60582006,"A Good Yarn (Blossom Street, No. 2)",Sharon Galligar Chance,There's something so soothing about knitting -...,5,something soothing knitting sweet clicking nee...,0.299351,4


### Using Vader

In [96]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [97]:
data['vader_score'] = data["preprocessed_reviews"].apply(lambda Description: sid.polarity_scores(Description))
data.tail()

Unnamed: 0,reviewerID,asin,Title,reviewerName,reviewText,overall,preprocessed_reviews,textblob_score,textblob_pred,vader_score
7431,A27D52RUOADQVR,60582006,"A Good Yarn (Blossom Street, No. 2)",Mikew,Love getting to know each of their lives. She ...,5,lov getting know liv creat charact mind help b...,0.0,3,"{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'comp..."
7432,A93B8XMH93D1R,60582006,"A Good Yarn (Blossom Street, No. 2)","miss demeaner ""BOOK SMARTS""","UNLIKE SOME OF REVIEWERS ON HERE, I WON'T END ...",2,unlik review end typing ent chapt opinion get ...,0.321667,4,"{'neg': 0.049, 'neu': 0.694, 'pos': 0.257, 'co..."
7433,A2JEMVYOM5IFU0,60582006,"A Good Yarn (Blossom Street, No. 2)",Norah E. Sempelsz,"I could not put this book down, I was so mired...",5,could put book mired liv peopl visit blossom s...,0.125,3,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp..."
7434,AX64AMAD1RTMI,60582006,"A Good Yarn (Blossom Street, No. 2)",Robin Wren,"I have read Debbie Macomber's ""The Shop on Blo...",4,read debbi macomb shop blossom street thought ...,0.421157,4,"{'neg': 0.04, 'neu': 0.661, 'pos': 0.299, 'com..."
7435,A1Y0Y74B0POAA6,60582006,"A Good Yarn (Blossom Street, No. 2)",Sharon Galligar Chance,There's something so soothing about knitting -...,5,something soothing knitting sweet clicking nee...,0.299351,4,"{'neg': 0.043, 'neu': 0.728, 'pos': 0.229, 'co..."


In [98]:
data['compound'] = data['vader_score'].apply(lambda score_dict: score_dict['compound'])
data['vader_pred'] = pd.cut(data['compound'], bins=5, labels=[1, 2, 3, 4, 5])
data.tail()

Unnamed: 0,reviewerID,asin,Title,reviewerName,reviewText,overall,preprocessed_reviews,textblob_score,textblob_pred,vader_score,compound,vader_pred
7431,A27D52RUOADQVR,60582006,"A Good Yarn (Blossom Street, No. 2)",Mikew,Love getting to know each of their lives. She ...,5,lov getting know liv creat charact mind help b...,0.0,3,"{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'comp...",0.4019,4
7432,A93B8XMH93D1R,60582006,"A Good Yarn (Blossom Street, No. 2)","miss demeaner ""BOOK SMARTS""","UNLIKE SOME OF REVIEWERS ON HERE, I WON'T END ...",2,unlik review end typing ent chapt opinion get ...,0.321667,4,"{'neg': 0.049, 'neu': 0.694, 'pos': 0.257, 'co...",0.9682,5
7433,A2JEMVYOM5IFU0,60582006,"A Good Yarn (Blossom Street, No. 2)",Norah E. Sempelsz,"I could not put this book down, I was so mired...",5,could put book mired liv peopl visit blossom s...,0.125,3,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",0.6908,5
7434,AX64AMAD1RTMI,60582006,"A Good Yarn (Blossom Street, No. 2)",Robin Wren,"I have read Debbie Macomber's ""The Shop on Blo...",4,read debbi macomb shop blossom street thought ...,0.421157,4,"{'neg': 0.04, 'neu': 0.661, 'pos': 0.299, 'com...",0.9493,5
7435,A1Y0Y74B0POAA6,60582006,"A Good Yarn (Blossom Street, No. 2)",Sharon Galligar Chance,There's something so soothing about knitting -...,5,something soothing knitting sweet clicking nee...,0.299351,4,"{'neg': 0.043, 'neu': 0.728, 'pos': 0.229, 'co...",0.9982,5


In [104]:
deselectlist =['reviewerID', 'asin' , 'Title', 'reviewerName', 'reviewText', 'overall', 'vader_pred']
#selectlist =[x for x in data.columns if x not in deselectlist]
#datatowrite = data[selectlist]
data.loc[:,deselectlist].to_csv('newAmazonBooksRatings.csv', index=False)
#datatowrite.to_csv('newAmazonBooksRatings.csv')