In [24]:
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
from textblob.sentiments import NaiveBayesAnalyzer
import nltk

In [25]:
import os
os.chdir(os.getcwd())

In [26]:
import pytreebank
import sys

dataset = pytreebank.load_sst()

In [27]:
out_path = os.path.join(sys.path[0], 'sst_{}.txt')
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]
            ))

In [28]:
train = pd.read_csv('sst_train.txt', sep='\t',header=None, names=['sentiment_label', 'sentence'])
train.head()

Unnamed: 0,sentiment_label,sentence
0,__label__4,The Rock is destined to be the 21st Century 's...
1,__label__5,The gorgeously elaborate continuation of `` Th...
2,__label__4,Singer/composer Bryan Adams contributes a slew...
3,__label__3,You 'd think by now America would have had eno...
4,__label__4,Yet the act is still charming here .


### Train Model

In [69]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity

In [70]:
train['textblob_score'] = train['sentence'].apply(textblob_score)
train['textblob_pred'] = pd.cut(train['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
train = train.drop('textblob_score', axis=1)
train.head()

Unnamed: 0,sentiment_label,sentence,textblob_pred
0,__label__4,The Rock is destined to be the 21st Century 's...,4
1,__label__5,The gorgeously elaborate continuation of `` Th...,4
2,__label__4,Singer/composer Bryan Adams contributes a slew...,3
3,__label__3,You 'd think by now America would have had eno...,3
4,__label__4,Yet the act is still charming here .,5


In [71]:
data = pd.read_csv('Amazon Book Reviews.csv')
data.head()

Unnamed: 0,reviewerID,asin,Title,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A15Q7ABIU9O9YZ,60554800,Red Tide,Larry Scantlebury,"[2,3]",This is my first GM Ford book and I will read ...,3,"Let's keep it real, not personal",1127606400,"09 25, 2005"
1,AUIJDXNYVTEA8,60554800,Red Tide,Les Stockton,"[0,2]",I liked the story. I thought the book added a...,4,I liked it,1361923200,"02 27, 2013"
2,A20N5GOON55TE9,60554800,Red Tide,lila,"[0,2]","As always, G.M. Ford does not disappoint. I st...",5,Good reading,1366761600,"04 24, 2013"
3,A1CT8ENDZSYTX3,60554800,Red Tide,Lisa B.,"[1,2]",I love Ford's Leo Waterman series and the firs...,3,Science Fiction or Mystery?,1122249600,"07 25, 2005"
4,A2SI6BNK5SWSMD,60554800,Red Tide,L. J. Roberts,"[2,2]",It was nice to see Corso working with the poli...,3,3.5 stars - Needed a better end.,1113004800,"04 9, 2005"


In [72]:
reviews = list(data['reviewText'])

In [73]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [74]:
tokenizer = RegexpTokenizer(r'\w+')
nltk.download('stopwords')
en_stopwords = set(stopwords.words('english'))
ps = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
def getCleanComment(review):
    review = str(review)
    review = review.lower()
    review = review.replace('"','')
    review = review.replace(';','')
    review = review.replace('_','')
    review = review.replace('-','')
    review = review.replace(',','')
    review = re.sub('\d', '', review)
    tokens = tokenizer.tokenize(review)
    new_tokens = [i for i in tokens if i not in en_stopwords]
    stemmed_tokens = [ps.stem(i) for i in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [76]:
preprocessed_reviews = [getCleanComment(i) for i in reviews]

In [77]:
data['preprocessed_reviews'] = preprocessed_reviews
data['textblob_score'] = data["preprocessed_reviews"].apply(textblob_score)
data['textblob_pred'] = pd.cut(data['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
data.head(n=10)

Unnamed: 0,reviewerID,asin,Title,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,preprocessed_reviews,textblob_score,textblob_pred
0,A15Q7ABIU9O9YZ,60554800,Red Tide,Larry Scantlebury,"[2,3]",This is my first GM Ford book and I will read ...,3,"Let's keep it real, not personal",1127606400,"09 25, 2005",first gm ford book read well written mysteryad...,0.030049,3
1,AUIJDXNYVTEA8,60554800,Red Tide,Les Stockton,"[0,2]",I liked the story. I thought the book added a...,4,I liked it,1361923200,"02 27, 2013",liked story thought book added bit much leftle...,0.6,4
2,A20N5GOON55TE9,60554800,Red Tide,lila,"[0,2]","As always, G.M. Ford does not disappoint. I st...",5,Good reading,1366761600,"04 24, 2013",alway g ford disappoint still lik leo waterman...,1.0,5
3,A1CT8ENDZSYTX3,60554800,Red Tide,Lisa B.,"[1,2]",I love Ford's Leo Waterman series and the firs...,3,Science Fiction or Mystery?,1122249600,"07 25, 2005",lov ford leo waterman seri first frank cors st...,0.05119,3
4,A2SI6BNK5SWSMD,60554800,Red Tide,L. J. Roberts,"[2,2]",It was nice to see Corso working with the poli...,3,3.5 stars - Needed a better end.,1113004800,"04 9, 2005",nic see cors working polic chang coupl good ch...,0.08,3
5,A1SSYYL2WTAK4Y,60554800,Red Tide,Mtnhi 5,"[0,0]",Bet you can't put this one down. Ford's writin...,5,Excellent,1397952000,"04 20, 2014",bet put one ford writing excellentexciting poi...,0.0,3
6,AVZ1LIW6EUQIK,60554800,Red Tide,"Robert J. Unger ""Dr. Gates""","[0,2]",I started with his first books and have now re...,5,All his books are great,1359849600,"02 3, 2013",started first book read everyon recommend star...,0.246667,4
7,A3QJQQZTKFV7BJ,60554800,Red Tide,"Sandy ""WR Gma""","[1,1]",I was worried through almost the first half of...,3,"I'm still a loyal reader, but this series seem...",1373241600,"07 8, 2013",worried almost first half book fearing ford de...,0.093981,3
8,A3OMPCA27U6WL2,60554800,Red Tide,Teri Tipton,"[2,2]",I really enjoy G.M. Ford's writing. I love hi...,2,I'm...Trying....To.....Finish......It.....,1169251200,"01 20, 2007",really enjoy g ford writing lov charact lov se...,0.11798,3
9,A31WHFXF6T06DR,60554800,Red Tide,"Watson McFestus ""Watson McFestus""","[0,2]",Sort of a medical thriller - which is a type o...,4,Pretty good 3.5 stars,1360713600,"02 13, 2013",sort medic thrill typ thrill usually lik cors ...,0.2875,4


### Using Vader

In [78]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


In [80]:
data['vader_score'] = data["preprocessed_reviews"].apply(lambda Description: sid.polarity_scores(Description))
data.head()

Unnamed: 0,reviewerID,asin,Title,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,preprocessed_reviews,textblob_score,textblob_pred,vader_score
0,A15Q7ABIU9O9YZ,60554800,Red Tide,Larry Scantlebury,"[2,3]",This is my first GM Ford book and I will read ...,3,"Let's keep it real, not personal",1127606400,"09 25, 2005",first gm ford book read well written mysteryad...,0.030049,3,"{'neg': 0.134, 'neu': 0.71, 'pos': 0.156, 'com..."
1,AUIJDXNYVTEA8,60554800,Red Tide,Les Stockton,"[0,2]",I liked the story. I thought the book added a...,4,I liked it,1361923200,"02 27, 2013",liked story thought book added bit much leftle...,0.6,4,"{'neg': 0.0, 'neu': 0.592, 'pos': 0.408, 'comp..."
2,A20N5GOON55TE9,60554800,Red Tide,lila,"[0,2]","As always, G.M. Ford does not disappoint. I st...",5,Good reading,1366761600,"04 24, 2013",alway g ford disappoint still lik leo waterman...,1.0,5,"{'neg': 0.151, 'neu': 0.615, 'pos': 0.235, 'co..."
3,A1CT8ENDZSYTX3,60554800,Red Tide,Lisa B.,"[1,2]",I love Ford's Leo Waterman series and the firs...,3,Science Fiction or Mystery?,1122249600,"07 25, 2005",lov ford leo waterman seri first frank cors st...,0.05119,3,"{'neg': 0.057, 'neu': 0.876, 'pos': 0.066, 'co..."
4,A2SI6BNK5SWSMD,60554800,Red Tide,L. J. Roberts,"[2,2]",It was nice to see Corso working with the poli...,3,3.5 stars - Needed a better end.,1113004800,"04 9, 2005",nic see cors working polic chang coupl good ch...,0.08,3,"{'neg': 0.085, 'neu': 0.769, 'pos': 0.146, 'co..."


In [83]:
data['compound'] = data['vader_score'].apply(lambda score_dict: score_dict['compound'])
data['vader_pred'] = pd.cut(data['compound'], bins=5, labels=[1, 2, 3, 4, 5])
data.tail(n=10)

Unnamed: 0,reviewerID,asin,Title,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,preprocessed_reviews,textblob_score,textblob_pred,vader_score,compound,vader_pred
7426,ALAAIPGIFF98F,60582006,"A Good Yarn (Blossom Street, No. 2)",LH,"[1,2]","If you are a knitter, you will enjoy reading t...",4,"An easy, feel good read",1163030400,"11 9, 2006",knitt enjoy reading light story knitting shop ...,0.4,4,"{'neg': 0.0, 'neu': 0.758, 'pos': 0.242, 'comp...",0.4939,4
7427,AH7CBIWDTHXWA,60582006,"A Good Yarn (Blossom Street, No. 2)",Lois Lain,"[3,3]",I have to admit I feel a little nerdy reading ...,4,A Good Book,1129334400,"10 15, 2005",admit feel littl nerdy reading novel set yarn ...,0.14,3,"{'neg': 0.064, 'neu': 0.756, 'pos': 0.18, 'com...",0.7184,5
7428,A3MLDUWII68NKB,60582006,"A Good Yarn (Blossom Street, No. 2)",Lorraine Janice Lee,"[0,0]","Never thought I'd see this. Good story, but I ...",4,Books about knitting,1383264000,"11 1, 2013",nev thought see good story read shop blossom s...,0.611111,5,"{'neg': 0.0, 'neu': 0.613, 'pos': 0.387, 'comp...",0.9001,5
7429,A3GWM1FHA3EQPM,60582006,"A Good Yarn (Blossom Street, No. 2)",Marianne O. Schmidt,"[8,8]","I love all of Debbie's books - they are sweet,...",5,Debbie Does It Again,1130630400,"10 30, 2005",lov debbi book sweet almost alway happy ending...,0.432386,4,"{'neg': 0.013, 'neu': 0.665, 'pos': 0.322, 'co...",0.998,5
7430,A1XJNL666U1E16,60582006,"A Good Yarn (Blossom Street, No. 2)",Melissa,"[0,0]",Always great to read Debbie! She lets you esc...,5,Another feel good story!,1364860800,"04 2, 2013",alway great read debbi let escap nuanc lif ano...,0.4,4,"{'neg': 0.0, 'neu': 0.709, 'pos': 0.291, 'comp...",0.8481,5
7431,A27D52RUOADQVR,60582006,"A Good Yarn (Blossom Street, No. 2)",Mikew,"[0,0]",Love getting to know each of their lives. She ...,5,Heartwarming,1362268800,"03 3, 2013",lov getting know liv creat charact mind help b...,0.0,3,"{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'comp...",0.4019,4
7432,A93B8XMH93D1R,60582006,"A Good Yarn (Blossom Street, No. 2)","miss demeaner ""BOOK SMARTS""","[5,13]","UNLIKE SOME OF REVIEWERS ON HERE, I WON'T END ...",2,A LONG YARN,1149811200,"06 9, 2006",unlik review end typing ent chapt opinion get ...,0.321667,4,"{'neg': 0.049, 'neu': 0.694, 'pos': 0.257, 'co...",0.9682,5
7433,A2JEMVYOM5IFU0,60582006,"A Good Yarn (Blossom Street, No. 2)",Norah E. Sempelsz,"[0,0]","I could not put this book down, I was so mired...",5,Warm and Inviting Yarn Shop,1203811200,"02 24, 2008",could put book mired liv peopl visit blossom s...,0.125,3,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",0.6908,5
7434,AX64AMAD1RTMI,60582006,"A Good Yarn (Blossom Street, No. 2)",Robin Wren,"[0,0]","I have read Debbie Macomber's ""The Shop on Blo...",4,Just a warm and touching Story,1186444800,"08 7, 2007",read debbi macomb shop blossom street thought ...,0.421157,4,"{'neg': 0.04, 'neu': 0.661, 'pos': 0.299, 'com...",0.9493,5
7435,A1Y0Y74B0POAA6,60582006,"A Good Yarn (Blossom Street, No. 2)",Sharon Galligar Chance,"[54,57]",There's something so soothing about knitting -...,5,A treasure! Very heartwarming!,1114732800,"04 29, 2005",something soothing knitting sweet clicking nee...,0.299351,4,"{'neg': 0.043, 'neu': 0.728, 'pos': 0.229, 'co...",0.9982,5
