In [1]:
# imports and setup
import pandas as pd
import nltk
import stanza
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from operator import itemgetter
import time
from tqdm import tqdm

pattern = r'[^A-Za-z0-9]+'

In [2]:
# import corpora
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-06 20:29:24 INFO: Downloading default packages for language: en (English) ...
2022-11-06 20:29:25 INFO: File exists: C:\Users\krish\stanza_resources\en\default.zip
2022-11-06 20:29:29 INFO: Finished downloading models and saved to C:\Users\krish\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# initialise nlp pipeline
nlp = stanza.Pipeline()
sid = SentimentIntensityAnalyzer()

2022-11-06 20:29:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-06 20:29:31 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-11-06 20:29:31 INFO: Use device: cpu
2022-11-06 20:29:31 INFO: Loading: tokenize
2022-11-06 20:29:31 INFO: Loading: pos
2022-11-06 20:29:31 INFO: Loading: lemma
2022-11-06 20:29:31 INFO: Loading: depparse
2022-11-06 20:29:31 INFO: Loading: sentiment
2022-11-06 20:29:31 INFO: Loading: constituency
2022-11-06 20:29:32 INFO: Loading: ner


KeyboardInterrupt: 

In [None]:
reviews = pd.read_csv('data/Books_rating.csv')

In [None]:
sample = 'The book is too long, but the characters were good. Who knew Darth Vader was such a caring father?'
# sample = 'The book is too long, but the characters were good.'

In [None]:
# check first n rows of review text
reviews[['review/text']].iloc[:500]

In [None]:
# set number of rows:
n = 100

In [None]:
# lowercase and tokenise
data = reviews[['review/text']].iloc[:n].apply(lambda x: x.astype(str).str.lower())
data = data['review/text'].apply(nltk.sent_tokenize)
# text

0    [this is only for julie strain fans., it's a c...
1    [i don't care much for dr. seuss but after rea...
2    [if people become the books they read and if "...
3    [theodore seuss geisel (1904-1991), aka &quot;...
4    [philip nel - dr. seuss: american iconthis is ...
Name: review/text, dtype: object

In [None]:
data.head()

0    [this is only for julie strain fans., it's a c...
1    [i don't care much for dr. seuss but after rea...
2    [if people become the books they read and if "...
3    [theodore seuss geisel (1904-1991), aka &quot;...
4    [philip nel - dr. seuss: american iconthis is ...
Name: review/text, dtype: object

In [None]:
test_data = data[0]
for sent in test_data:
    print(re.sub(pattern, ' ', sent))

this is only for julie strain fans 
it s a collection of her photos about 80 pages worth with a nice section of paintings by olivia if you re looking for heavy literary content this isn t the place to find it there s only about 2 pages with text and everything else is photos bottom line if you only want one book the six foot one is probably a better choice however if you like julie like i like julie you won t go wrong on this one either 


In [None]:
# score sentiment 
def sentiment_score(finalcluster):
    scores = []
    for pair in finalcluster:
        # only look at valid pairs
        if len(pair[1]) != 0:
            score = sid.polarity_scores(''.join(pair[1]))
            if score['compound'] != 0.0:
                pair_score = [pair, score['compound']]
                scores.append(pair_score)
    return(scores)

In [None]:
def clean_sentence(sentence):
    clean_time = time.time()
    clean_sentence = re.sub(pattern, ' ', sentence)
    token_clean = nltk.word_tokenize(clean_sentence)
    pos_clean = nltk.pos_tag(token_clean)
    # print(round(time.time() - clean_time, 5))
    return(pos_clean, clean_sentence, token_clean)

In [35]:
def aspect_sentiment(data):
    totalfeatureList = []
    cnt = 0
    for review in tqdm(data):
        sent_cnt = 0
        sentence_scores = []
        if sent_cnt > 1:
            continue
        for sent in review:
            try:
                clean_sentence(sent)
                sent = re.sub(pattern, ' ', sent)
                # print(sent_cnt, sent)
                # clean sentence
                sentence_pos, sentence_clean, sentence_token = clean_sentence(sent)

                if len(sentence_clean.strip()) == 0:
                    continue
                else:
                    # relationship parser
                    doc = nlp(sentence_clean)
                    dep_node = []

            #         if doc.sentences[0].dependencies:
            #             for dep in doc.sentences[0].dependencies:
            #                 dep_node.append([dep[2].text, dep[0].id, dep[1]])
            #             for i in range(0, len(dep_node)):
            #                 if (int(dep_node[i][1]) != 0):
            #                     dep_node[i][1] = sentence_token[(int(dep_node[i][1]) - 1)]

            #             # possible features
            #             featureList = []
            #             categories = []
            #             for i in sentence_pos:
            #                 if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
            #                     featureList.append(list(i))
            #                     totalfeatureList.append(list(i)) # This list will store all the features for every sentence
            #                     categories.append(i[0])

            #             # cluster together features and descriptors
            #             fcluster = []
            #             for i in featureList:
            #                 filist = []
            #                 for j in dep_node:
            #                     if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
            #                         if(j[0]==i[0]):
            #                             filist.append(j[1])
            #                         else:
            #                             filist.append(j[0])
            #                 fcluster.append([i[0], filist])

            #             # select only nouns
            #             finalcluster = []
            #             dic = {}
            #             for i in featureList:
            #                 dic[i[0]] = i[1]
            #             for i in fcluster:
            #                 if(dic[i[0]]=="NN"):
            #                     finalcluster.append(i)

            #             # get sentence scores  
            #             sentence_sentiment = sentiment_score(finalcluster) 
                        
            #             for score in sentence_sentiment:
            #                 sentence_scores.append(score)
            except Exception as e:
                print('review #:', cnt, ' -- skipping sentence', sent_cnt)
                continue

            

            sent_cnt += 1

        # sentence_scores = sorted(sentence_scores, key=itemgetter(1), reverse = True)
        # print('review #:', cnt, sentence_scores)
        # print('review #:', cnt, 'sentence time: ', round(time.time()-review_time, 5))
        cnt += 1
    return(sentence_scores)




In [36]:
# run code
absa = aspect_sentiment(data)
# absa.head()
#

 39%|███▉      | 39/100 [01:27<02:17,  2.25s/it]


KeyboardInterrupt: 

In [29]:
absa

[]