 <h2>Load SST-5 dataset</h2>

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

In [5]:
import os
os.chdir(os.getcwd())

In [8]:
import pytreebank
import sys

dataset = pytreebank.load_sst()

<h2>Convert tree data to tabular format</h2>

In [9]:
out_path = os.path.join(sys.path[0], 'sst_{}.txt')
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]
            ))

In [14]:
train = pd.read_csv('sst_train.txt', sep='\t',header=None, names=['sentiment_label', 'sentence'])
train.head()

Unnamed: 0,sentiment_label,sentence
0,__label__4,The Rock is destined to be the 21st Century 's...
1,__label__5,The gorgeously elaborate continuation of `` Th...
2,__label__4,Singer/composer Bryan Adams contributes a slew...
3,__label__3,You 'd think by now America would have had eno...
4,__label__4,Yet the act is still charming here .


<h2>Train model</h2>

In [15]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity

In [16]:
train['textblob_score'] = train['sentence'].apply(textblob_score)
train['textblob_pred'] = pd.cut(train['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
train = train.drop('textblob_score', axis=1)
train.head()

Unnamed: 0,sentiment_label,sentence,textblob_pred
0,__label__4,The Rock is destined to be the 21st Century 's...,4
1,__label__5,The gorgeously elaborate continuation of `` Th...,4
2,__label__4,Singer/composer Bryan Adams contributes a slew...,3
3,__label__3,You 'd think by now America would have had eno...,3
4,__label__4,Yet the act is still charming here .,5


In [30]:
import re
data = pd.read_csv("Amazon Book Reviews.csv", usecols = [0,1,5])

In [31]:
data.head()

Unnamed: 0,reviewerID,asin,reviewText
0,A15Q7ABIU9O9YZ,60554800,This is my first GM Ford book and I will read ...
1,AUIJDXNYVTEA8,60554800,I liked the story. I thought the book added a...
2,A20N5GOON55TE9,60554800,"As always, G.M. Ford does not disappoint. I st..."
3,A1CT8ENDZSYTX3,60554800,I love Ford's Leo Waterman series and the firs...
4,A2SI6BNK5SWSMD,60554800,It was nice to see Corso working with the poli...


In [44]:
reviews = list(data['reviewText'])

<h3>Preprocess Amazon reviews for testing</h3>

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [41]:
tokenizer = RegexpTokenizer(r'\w+')
nltk.download('stopwords')
en_stopwords = set(stopwords.words('english'))
ps = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VAIBHAV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
def getCleanComment(review):
    review = str(review)
    review = review.lower()
    review = review.replace('"','')
    review = review.replace(';','')
    review = review.replace('_','')
    review = review.replace('-','')
    review = review.replace(',','')
    review = re.sub('\d', '', review)
    tokens = tokenizer.tokenize(review)
    new_tokens = [i for i in tokens if i not in en_stopwords]
    stemmed_tokens = [ps.stem(i) for i in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [45]:
preprocessed_reviews = [getCleanComment(i) for i in reviews]

<h2>Predicting sentiment score</h2>

In [46]:
data['preprocessed_reviews'] = preprocessed_reviews
data['textblob_score'] = data["preprocessed_reviews"].apply(textblob_score)
data['textblob_pred'] = pd.cut(data['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
data.head()

Unnamed: 0,reviewerID,asin,reviewText,preprocessed_reviews,textblob_score,textblob_pred
0,A15Q7ABIU9O9YZ,60554800,This is my first GM Ford book and I will read ...,first gm ford book read well written mysteryad...,0.007964,3
1,AUIJDXNYVTEA8,60554800,I liked the story. I thought the book added a...,liked story thought book added bit much leftle...,0.6,4
2,A20N5GOON55TE9,60554800,"As always, G.M. Ford does not disappoint. I st...",alway g ford disappoint still lik leo waterman...,1.0,5
3,A1CT8ENDZSYTX3,60554800,I love Fords Leo Waterman series and the first...,lov ford leo waterman seri first frank cors st...,0.05119,3
4,A2SI6BNK5SWSMD,60554800,It was nice to see Corso working with the poli...,nic see cors working polic chang coupl good ch...,0.08,3
