In [362]:
# imports and setup
import pandas as pd
import nltk
import stanza
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from operator import itemgetter
import time
from tqdm import tqdm

pattern = r'[^A-Za-z0-9]+'

In [None]:
# import corpora
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

# initialise nlp pipeline
nlp = stanza.Pipeline()
sid = SentimentIntensityAnalyzer()

In [5]:
reviews = pd.read_csv('data/Books_rating.csv')

In [87]:
sample = 'The book is too long, but the characters were good. Who knew Darth Vader was such a caring father?'
# sample = 'The book is too long, but the characters were good.'

In [250]:
# check first n rows of review text
reviews[['review/text']].iloc[:500]

Unnamed: 0,review/text
0,This is only for Julie Strain fans. It's a col...
1,I don't care much for Dr. Seuss but after read...
2,"If people become the books they read and if ""t..."
3,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Philip Nel - Dr. Seuss: American IconThis is b...
...,...
495,I'm writng again to say that this book has so ...
496,This book is exciting It's a turn pager you ca...
497,"Ash. Ahh. Such a great guy. Mary-Lynette, she'..."
498,We've fallen head over heels in love with Ash!...


In [None]:
# set number of rows:
n = 100

In [317]:
# lowercase and tokenise
data = reviews[['review/text']].iloc[:n].apply(lambda x: x.astype(str).str.lower())
data = data['review/text'].apply(nltk.sent_tokenize)
data.head()
# text = reviews
# text = text.lower()
# text = nltk.sent_tokenize(text)
# text

0    [this is only for julie strain fans., it's a c...
1    [i don't care much for dr. seuss but after rea...
2    [if people become the books they read and if "...
3    [theodore seuss geisel (1904-1991), aka &quot;...
4    [philip nel - dr. seuss: american iconthis is ...
Name: review/text, dtype: object

In [366]:
test_data = data[55]
for sent in test_data:
    print(re.sub(pattern, ' ', sent))

tells the wonderful story of how st hyacinth and his fellow dominicans planted the holy catholic faith in poland lithuania russia and all over northern europe 
many were the remarkable events in this saint s life including the raising of the dead 
for children ages 10 and up 
17 illustrations 
189pp 
pb 
imprimatur 


In [303]:
# score sentiment 
def sentiment_score(finalcluster):
    scores = []
    for pair in finalcluster:
        # only look at valid pairs
        if len(pair[1]) != 0:
            score = sid.polarity_scores(''.join(pair[1]))
            if score['compound'] != 0.0:
                pair_score = [pair, score['compound']]
                scores.append(pair_score)
    return(scores)

In [370]:
def clean_sentence(sentence):
    clean_sentence = re.sub(pattern, ' ', sentence)
    token_clean = nltk.word_tokenize(clean_sentence)
    pos_clean = nltk.pos_tag(token_clean)
    return(pos_clean, clean_sentence, token_clean)

In [374]:
def aspect_sentiment(data):
    totalfeatureList = []
    cnt = 0
    for review in tqdm(data):
        sent_cnt = 0
        sentence_scores = []
        for sent in review:
            try:
                clean_sentence(sent)
                sent = re.sub(pattern, ' ', sent)
                # print(sent_cnt, sent)
                # clean
                sentence_pos, sentence_clean, sentence_token = clean_sentence(sent)

                if len(sentence_clean.strip()) == 0:
                    continue
                else:
                    # relationship parser
                    doc = nlp(sent)
                    dep_node = []
                    if doc.sentences[0].dependencies:
                        for dep in doc.sentences[0].dependencies:
                            dep_node.append([dep[2].text, dep[0].id, dep[1]])
                        for i in range(0, len(dep_node)):
                            if (int(dep_node[i][1]) != 0):
                                dep_node[i][1] = sentence_token[(int(dep_node[i][1]) - 1)]

                        # possible features
                        featureList = []
                        categories = []
                        for i in sentence_pos:
                            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                                featureList.append(list(i))
                                totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                                categories.append(i[0])

                        # cluster together features and descriptors
                        fcluster = []
                        for i in featureList:
                            filist = []
                            for j in dep_node:
                                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                                    if(j[0]==i[0]):
                                        filist.append(j[1])
                                    else:
                                        filist.append(j[0])
                            fcluster.append([i[0], filist])

                        # select only nouns
                        finalcluster = []
                        dic = {}
                        for i in featureList:
                            dic[i[0]] = i[1]
                        for i in fcluster:
                            if(dic[i[0]]=="NN"):
                                finalcluster.append(i)

                        # get sentence scores  
                        sentence_sentiment = sentiment_score(finalcluster) 
                        
                        for score in sentence_sentiment:
                            sentence_scores.append(score)
            except Exception as e:
                print('review #:', cnt, ' -- skipping sentence', sent_cnt)
                continue
            sent_cnt += 1

        sentence_scores = sorted(sentence_scores, key=itemgetter(1), reverse = True)
        # print('review #:', cnt, sentence_scores)
        cnt += 1
    return(sentence_scores)




In [375]:
# run code
print(aspect_sentiment(data))


  0%|          | 1/500 [00:01<09:20,  1.12s/it]

review #: 0 [[['section', ['nice']], 0.4215], [['book', ['want']], 0.0772], [['julie', ['strain']], -0.0516]]


  0%|          | 2/500 [00:04<20:57,  2.53s/it]

review #: 1 [[['book', ['great']], 0.6249], [['work', ['recommend']], 0.3612], [['rel', ['plays']], 0.25], [['poet', ['serious']], -0.0772]]


  1%|          | 3/500 [00:08<27:45,  3.35s/it]

review #: 2 [[['doctor', ['good']], 0.4404], [['daddy', ['treat']], 0.4019], [['treatment', ['serious']], -0.0772]]


  1%|          | 4/500 [00:16<42:53,  5.19s/it]

review #: 3 [[['lifelong', ['thrill']], 0.3612], [['nel', ['recommends']], 0.2263], [['lorax', ['protest']], -0.25], [['semitism', ['anti']], -0.3182]]


  1%|          | 5/500 [00:21<39:26,  4.78s/it]

review #: 4 [[['background', ['enjoy']], 0.4939]]


  1%|          | 6/500 [00:25<38:21,  4.66s/it]

review #: 5 [[['everyone', ['interested']], 0.4019], [['book', ['thoughtful']], 0.3818], [['i', ['recommend']], 0.3612], [['book', ['recommend']], 0.3612], [['disneyification', ['threatens']], -0.3818]]


  1%|▏         | 7/500 [00:26<29:09,  3.55s/it]

review #: 6 [[['giesel', ['created']], 0.25], [['philip', ['argues']], -0.3818]]


  1%|▏         | 7/500 [00:27<32:42,  3.98s/it]


KeyboardInterrupt: 