In [65]:
# imports and setup
import pandas as pd
# import numpy as np
import nltk
import stanza
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from operator import itemgetter
import time
from tqdm import tqdm

pattern = r'[^A-Za-z0-9]+'

In [None]:
# import corpora
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

In [None]:
# initialise nlp pipeline
nlp = stanza.Pipeline()
sid = SentimentIntensityAnalyzer()

In [5]:
reviews = pd.read_csv('data/Books_rating.csv')

In [None]:
# check first n rows of review text
reviews[['review/text']].iloc[:500]

In [11]:
# set number of rows:
n = 10000

In [12]:
# lowercase and tokenise
data = reviews[['review/text']].iloc[:n].apply(lambda x: x.astype(str).str.lower())
sentence_tokenized = data['review/text'].apply(nltk.sent_tokenize)

In [15]:
def clean_sentence(sentence):
    clean_sentence = re.sub(pattern, ' ', sentence)
    token_clean = nltk.word_tokenize(clean_sentence)
    pos_clean = nltk.pos_tag(token_clean)
    return(pos_clean, clean_sentence, token_clean)

In [13]:
review_list = []

for review in sentence_tokenized:
    sentence_clean = []
    sentence_pos = []
    sentence_token = []
    for sentence in review:
        pos, clean, token = clean_sentence(sentence)
        sentence_pos.append(pos)
        sentence_clean.append(clean)
        sentence_token.append(token)
    review_dict = {"sentence": sentence_clean, "token": sentence_token, "pos": sentence_pos}
    review_list.append(review_dict)

In [99]:
tokenized_data = pd.DataFrame(review_list)   
tokenized_data['scores'] = None
tokenized_data.head()

Unnamed: 0,sentence,token,pos,scores
0,"[this is only for julie strain fans , it s a c...","[[this, is, only, for, julie, strain, fans], [...","[[(this, DT), (is, VBZ), (only, RB), (for, IN)...",
1,[i don t care much for dr seuss but after read...,"[[i, don, t, care, much, for, dr, seuss, but, ...","[[(i, JJ), (don, VBP), (t, EX), (care, NN), (m...",
2,[if people become the books they read and if t...,"[[if, people, become, the, books, they, read, ...","[[(if, IN), (people, NNS), (become, VBP), (the...",
3,[theodore seuss geisel 1904 1991 aka quot dr s...,"[[theodore, seuss, geisel, 1904, 1991, aka, qu...","[[(theodore, RB), (seuss, JJ), (geisel, NN), (...",
4,[philip nel dr seuss american iconthis is basi...,"[[philip, nel, dr, seuss, american, iconthis, ...","[[(philip, NN), (nel, NNS), (dr, VBP), (seuss,...",


In [24]:
# score sentiment 
def sentiment_score(finalcluster):
    scores = []
    for pair in finalcluster:
        # only look at valid pairs
        if len(pair[1]) != 0:
            score = sid.polarity_scores(''.join(pair[1]))
            if score['compound'] != 0.0:
                pair_score = [pair, score['compound']]
                scores.append(pair_score)
    return(scores)

In [49]:
def find_relationships(doc, token, pos):
    sentence_scores = []
    if doc.sentences[0].dependencies:
        dep_node = []
        # print(dep_node)
        for dep in doc.sentences[0].dependencies:
            dep_node.append([dep[2].text, dep[0].id, dep[1]])
        for i in range(0, len(dep_node)):
            if (int(dep_node[i][1]) != 0):
                dep_node[i][1] = token[(int(dep_node[i][1]) - 1)]
                
        # possible features
        featureList = []
        for i in pos:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))

        # cluster together features and descriptors
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])

        # select only nouns
        finalcluster = []
        dic = {}
        for i in featureList:
            dic[i[0]] = i[1]
        for i in fcluster:
            if(dic[i[0]]=="NN"):
                finalcluster.append(i)

        # get sentence scores  
        sentence_sentiment = sentiment_score(finalcluster) 
        # for score in sentence_sentiment:
        #     sentence_scores.append(score)
    return(sentence_sentiment)

In [33]:
tokenized_data['sentence'].loc[0][0]

'this is only for julie strain fans '

In [102]:
# loop through data
review_scores_list = []

#reviews
for i in range(0, n - 1):
    # sentences
    review_scores = []
    for j in range(0, len(tokenized_data['sentence'].loc[i]) - 1):
        sentence_scores = []
        sentence = tokenized_data['sentence'].loc[i][j]
        pos = tokenized_data['pos'].loc[i][j]
        token = tokenized_data['token'].loc[i][j] 
        if len(sentence.strip()) == 0:
            continue
        else:
            # print(sentence)
            doc = nlp(sentence)
            scores = find_relationships(doc, token, pos)
            # print(scores)
            if len(scores) != 0:
                review_scores.append(scores)
        tokenized_data['scores'].iloc[i] = review_scores 
    break
    

  arr_value = np.asarray(value)


In [103]:
tokenized_data['scores'].iloc[:2]

0                 [[[['julie', ['strain']], -0.0516]]]
1    [[[['rel', ['plays']], 0.25], [['poet', ['seri...
Name: scores, dtype: object

In [87]:
review_scores

[[[['rel', ['plays']], 0.25], [['poet', ['serious']], -0.0772]]]

In [74]:
def aspect_sentiment(data):
    totalfeatureList = []
    cnt = 0
    for review in tqdm(data):
        sent_cnt = 0
        sentence_scores = []
        if sent_cnt > 1:
            continue
        for sent in review:
            try:
                clean_sentence(sent)
                sent = re.sub(pattern, ' ', sent)
                # print(sent_cnt, sent)
                # clean sentence
                sentence_pos, sentence_clean, sentence_token = clean_sentence(sent)

                if len(sentence_clean.strip()) == 0:
                    continue
                else:
                    # relationship parser
                    doc = nlp(sentence_clean)
                    dep_node = []

                    if doc.sentences[0].dependencies:
                        for dep in doc.sentences[0].dependencies:
                            dep_node.append([dep[2].text, dep[0].id, dep[1]])
                        for i in range(0, len(dep_node)):
                            if (int(dep_node[i][1]) != 0):
                                dep_node[i][1] = sentence_token[(int(dep_node[i][1]) - 1)]

                        # possible features
                        featureList = []
                        categories = []
                        for i in sentence_pos:
                            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                                featureList.append(list(i))
                                # totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                                # categories.append(i[0])

                        # cluster together features and descriptors
                        fcluster = []
                        for i in featureList:
                            filist = []
                            for j in dep_node:
                                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                                    if(j[0]==i[0]):
                                        filist.append(j[1])
                                    else:
                                        filist.append(j[0])
                            fcluster.append([i[0], filist])

                        # select only nouns
                        finalcluster = []
                        dic = {}
                        for i in featureList:
                            dic[i[0]] = i[1]
                        for i in fcluster:
                            if(dic[i[0]]=="NN"):
                                finalcluster.append(i)

                        # get sentence scores  
                        sentence_sentiment = sentiment_score(finalcluster) 
                        
                        for score in sentence_sentiment:
                            sentence_scores.append(score)
            except Exception as e:
                print('review #:', cnt, ' -- skipping sentence', sent_cnt)
                continue

            

            sent_cnt += 1

        # sentence_scores = sorted(sentence_scores, key=itemgetter(1), reverse = True)
        # print('review #:', cnt, sentence_scores)
        # print('review #:', cnt, 'sentence time: ', round(time.time()-review_time, 5))
        cnt += 1
    return(sentence_scores)




In [36]:
# run code
absa = aspect_sentiment(data)
# absa.head()
#

 39%|███▉      | 39/100 [01:27<02:17,  2.25s/it]


KeyboardInterrupt: 

In [29]:
absa

[]