In [1]:
# imports and setup
import pandas as pd
import numpy as np
import nltk
import stanza
import re
import os
# from nltk.corpus import stopwords, wordnet
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from operator import itemgetter
import time
from tqdm import tqdm
import matplotlib.pyplot as plt

# Matches every run of characters that are not ASCII letters or digits;
# clean_sentence() replaces each such run with a single space.
pattern = r'[^A-Za-z0-9]+'

In [2]:
# import corpora
# Download the English Stanza models and the NLTK data packages named below.
# NOTE(review): the stopwords import at the top of the notebook is commented out and no
# visible cell references stopwords - this download may be unnecessary (confirm).
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-07 22:14:22 INFO: Downloading default packages for language: en (English) ...
2022-11-07 22:14:23 INFO: File exists: C:\Users\krish\stanza_resources\en\default.zip
2022-11-07 22:14:27 INFO: Finished downloading models and saved to C:\Users\krish\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# initialise nlp pipeline
# The Stanza pipeline is built with its default arguments; find_relationships() later
# reads only `doc.sentences[0].dependencies` from the documents it produces.
# NOTE(review): the load log saved with this cell also lists sentiment, constituency
# and ner processors, none of which are read by code visible in this notebook -
# passing an explicit `processors=` list may cut load/inference time (confirm first).
nlp = stanza.Pipeline()
# VADER analyser; sentiment_score() calls sid.polarity_scores() on descriptor words.
sid = SentimentIntensityAnalyzer()

2022-11-07 22:14:36 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-07 22:14:46 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-11-07 22:14:46 INFO: Use device: cpu
2022-11-07 22:14:46 INFO: Loading: tokenize
2022-11-07 22:14:46 INFO: Loading: pos
2022-11-07 22:14:46 INFO: Loading: lemma
2022-11-07 22:14:46 INFO: Loading: depparse
2022-11-07 22:14:47 INFO: Loading: sentiment
2022-11-07 22:14:47 INFO: Loading: constituency
2022-11-07 22:14:47 INFO: Loading: ner
2022-11-07 22:14:48 INFO: Done loading processors!


In [4]:
# define some nice plot colours
# Seven hex colour codes, ordered from dark blue to pale green.
# NOTE(review): no cell visible in this notebook references `colors`.
colors = ['#142459', '#176BA0', '#19AADE', '#1AC9E6', '#1DE4BD', '#60F0D2', '#c7F9EE']

In [5]:
# Load the book reviews table from disk.
# NOTE(review): later cells visible here read only the 'review/text' and 'review/score'
# columns of the first `n` rows; passing usecols=/nrows= to read_csv would avoid loading
# the rest (the output saved with this cell is a KeyboardInterrupt).
reviews = pd.read_csv('data/Books_rating.csv')

KeyboardInterrupt: 

In [None]:
# preview the text and rating of the first ten reviews
reviews[['review/text', 'review/score']].head(10)

Unnamed: 0,review/text,review/score
0,This is only for Julie Strain fans. It's a col...,4.0
1,I don't care much for Dr. Seuss but after read...,5.0
2,"If people become the books they read and if ""t...",5.0
3,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",4.0
4,Philip Nel - Dr. Seuss: American IconThis is b...,4.0
5,"""Dr. Seuss: American Icon"" by Philip Nel is a ...",4.0
6,Theodor Seuss Giesel was best known as 'Dr. Se...,5.0
7,When I recieved this book as a gift for Christ...,5.0
8,Trams (or any public transport) are not usuall...,5.0
9,"As far as I am aware, this is the first book-l...",4.0


In [None]:
# set number of rows:
# `n` is how many leading reviews are tokenised and analysed; it is also the default
# value of absa()'s `n` parameter.
n = 100
# Default value of absa()'s `aspects` parameter: once this many sentences of a review
# have yielded scores, its remaining sentences are not analysed.
aspects = 8

In [None]:
# lowercase and tokenise
# Take the first n reviews, stringify both columns and lower-case them
# (this also turns the numeric rating into text such as '4.0').
first_reviews = reviews[['review/text', 'review/score']].iloc[:n]
data = first_reviews.apply(lambda column: column.astype(str).str.lower())
# split every review into a list of sentences
sentence_tokenized = data['review/text'].apply(nltk.sent_tokenize)

In [None]:
def clean_sentence(sentence):
    """Normalise one sentence, then tokenize and POS-tag it.

    Every run of characters matched by the module-level ``pattern`` is replaced
    with a single space before the text is handed to NLTK.

    Returns a tuple ``(tagged, cleaned, tokens)``: the POS-tagged tokens, the
    cleaned sentence string, and the plain token list.
    """
    cleaned = re.sub(pattern, ' ', sentence)
    tokens = nltk.word_tokenize(cleaned)
    tagged = nltk.pos_tag(tokens)
    return tagged, cleaned, tokens

In [None]:
# tokenize data
# One dict per review; each value is a list with one entry per sentence.
review_list = []

for review in sentence_tokenized:
    # clean_sentence returns (pos tags, cleaned text, tokens) for each sentence
    processed = [clean_sentence(sentence) for sentence in review]
    review_list.append({
        "sentence": [cleaned for _, cleaned, _ in processed],
        "token": [tokens for _, _, tokens in processed],
        "pos": [tagged for tagged, _, _ in processed],
    })

In [None]:
# put tokenized data in dataframe
# 'scores' and 'duration' start as placeholders that absa() fills in later.
tokenized_frame = pd.DataFrame(review_list).assign(scores=None, duration=0)
# attach the rating column; rows are matched on index and only shared labels are kept
tokenized_data = pd.concat([tokenized_frame, data['review/score']], axis=1, join='inner')
tokenized_data.head()

Unnamed: 0,sentence,token,pos,scores,duration,review/score
0,"[this is only for julie strain fans , it s a c...","[[this, is, only, for, julie, strain, fans], [...","[[(this, DT), (is, VBZ), (only, RB), (for, IN)...",,0,4.0
1,[i don t care much for dr seuss but after read...,"[[i, don, t, care, much, for, dr, seuss, but, ...","[[(i, JJ), (don, VBP), (t, EX), (care, NN), (m...",,0,5.0
2,[if people become the books they read and if t...,"[[if, people, become, the, books, they, read, ...","[[(if, IN), (people, NNS), (become, VBP), (the...",,0,5.0
3,[theodore seuss geisel 1904 1991 aka quot dr s...,"[[theodore, seuss, geisel, 1904, 1991, aka, qu...","[[(theodore, RB), (seuss, JJ), (geisel, NN), (...",,0,4.0
4,[philip nel dr seuss american iconthis is basi...,"[[philip, nel, dr, seuss, american, iconthis, ...","[[(philip, NN), (nel, NNS), (dr, VBP), (seuss,...",,0,4.0


In [None]:
def sentiment_score(finalcluster):
    """Score each aspect's descriptor words with VADER and keep the non-neutral ones.

    Parameters
    ----------
    finalcluster : list
        ``[aspect, descriptors]`` pairs, where ``descriptors`` is a list of
        words (the shape built by ``find_relationships``).

    Returns
    -------
    list
        ``[pair, compound]`` entries, one for every pair that has at least one
        descriptor and whose VADER compound score is not exactly 0.0.
    """
    scores = []
    for pair in finalcluster:
        descriptors = pair[1]
        # only look at valid pairs (aspects with at least one descriptor)
        if not descriptors:
            continue
        # Join with a space so every descriptor stays a separate word; gluing
        # them together ("verygood") yields a token the lexicon cannot match.
        compound = sid.polarity_scores(' '.join(descriptors))['compound']
        if compound != 0.0:
            scores.append([pair, compound])
    return scores

In [None]:
def find_relationships(doc, token, pos):
    """Pair the nouns of a sentence with the words that describe them and score the pairs.

    Parameters
    ----------
    doc : object
        Parsed document; only ``doc.sentences[0].dependencies`` is read. Each
        dependency is indexed as ``(head_word, relation, dependent_word)``,
        using ``head_word.id`` and ``dependent_word.text``.
    token : list of str
        Tokens of the same sentence; a non-zero head id ``k`` is translated to
        ``token[k - 1]``.
        NOTE(review): this assumes the parser and ``token`` split the sentence
        identically - a mismatch raises IndexError or pairs the wrong words.
    pos : list of (word, tag)
        POS-tagged tokens of the same sentence.

    Returns
    -------
    list
        The result of ``sentiment_score`` for the ``[noun, descriptors]``
        clusters, or ``[]`` when the document has no sentences or the first
        sentence has no dependencies.
    """
    # Nothing to analyse: return an empty result instead of falling through to
    # an unbound local (no dependencies) or an IndexError (no sentences).
    if not doc.sentences or not doc.sentences[0].dependencies:
        return []

    # [dependent text, head, relation]; the head id is swapped for its token
    # text unless it is 0, in which case the id itself is kept.
    dep_node = []
    for dep in doc.sentences[0].dependencies:
        head = dep[0].id
        if int(head) != 0:
            head = token[int(head) - 1]
        dep_node.append([dep[2].text, head, dep[1]])

    # possible features: adjectives, nouns and adverbs
    feature_tags = ('JJ', 'NN', 'JJR', 'NNS', 'RB')
    featureList = [list(tagged) for tagged in pos if tagged[1] in feature_tags]

    # cluster together features and descriptors
    relations = ("nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod",
                 "neg", "prep_of", "acomp", "xcomp", "compound")
    fcluster = []
    for feature in featureList:
        word = feature[0]
        related = []
        for node in dep_node:
            if node[2] not in relations:
                continue
            if node[0] == word:
                # the feature is the dependent: collect its head
                related.append(node[1])
            elif node[1] == word:
                # the feature is the head: collect the dependent
                related.append(node[0])
        fcluster.append([word, related])

    # select only nouns; when a word occurs with several tags the last one wins
    tag_of = {feature[0]: feature[1] for feature in featureList}
    finalcluster = [cluster for cluster in fcluster if tag_of[cluster[0]] == "NN"]

    # get sentence scores
    return sentiment_score(finalcluster)

In [None]:
def get_aspects(scores):
    """Flatten per-sentence score lists into ``[aspect, first_descriptor, score]`` rows.

    ``scores`` is a list of per-sentence lists whose entries look like
    ``[[aspect, descriptors], score]``. Only the first descriptor of each
    entry is kept.
    """
    return [
        [entry[0][0], entry[0][1][0], entry[1]]
        for sentence_scores in scores
        for entry in sentence_scores
    ]

In [None]:
def absa(tokenized_data, aspects = aspects, n = n):
    """Run the aspect/sentiment extraction over the first ``n`` reviews.

    For every review the sentences are parsed one by one with ``nlp`` and
    passed to ``find_relationships``; each non-empty result is collected until
    ``aspects`` sentences have produced scores. The collected list is written
    to the review's 'scores' cell and the elapsed wall-clock seconds to its
    'duration' cell.

    Parameters
    ----------
    tokenized_data : pandas.DataFrame
        Needs the columns 'sentence', 'token', 'pos' (parallel per-sentence
        lists), 'scores' and 'duration'. The frame is modified in place.
    aspects : int
        Maximum number of scored sentences kept per review.
    n : int
        Number of leading rows to process; capped at the length of the frame.

    Returns
    -------
    pandas.DataFrame
        ``tokenized_data.iloc[:n]``.
    """
    # never iterate past the end of the frame
    n = min(n, len(tokenized_data))
    # Durations are fractional seconds: make the column float up front so the
    # per-row writes below do not rely on an implicit int -> float upcast.
    tokenized_data['duration'] = tokenized_data['duration'].astype(float)

    # reviews
    for i in tqdm(range(0, n)):
        start_review = time.time()
        row = tokenized_data.index[i]
        sentences = tokenized_data.at[row, 'sentence']
        pos_tags = tokenized_data.at[row, 'pos']
        tokens = tokenized_data.at[row, 'token']

        review_scores = []
        # sentences - every one of them, including the last
        for sentence, pos, token in zip(sentences, pos_tags, tokens):
            # limit number of aspects processed: nothing after the cap is kept,
            # so stop instead of walking over the remaining sentences
            if len(review_scores) >= aspects:
                break
            if len(sentence.strip()) == 0:
                continue
            doc = nlp(sentence)
            try:
                scores = find_relationships(doc, token, pos)
            except Exception:
                # Best effort: a sentence that cannot be analysed is skipped.
                # KeyboardInterrupt is deliberately not caught, so a long run
                # can still be stopped.
                continue
            if len(scores) != 0:
                review_scores.append(scores)

        # single-step .at writes avoid chained-indexing assignment, which may
        # silently write to a temporary copy
        tokenized_data.at[row, 'scores'] = review_scores
        tokenized_data.at[row, 'duration'] = time.time() - start_review
    return tokenized_data.iloc[:n]

In [None]:
def predict(scores):
    """Turn a review's aspect rows into an overall sentiment label.

    The third element of every row is added up; the sign of the total decides
    between 'Positive', 'Negative' and 'Neutral'. An empty ``scores`` gives
    ``None``.
    """
    total = 0
    seen_any = False
    for row in scores:
        total += float(row[2])
        seen_any = True

    if not seen_any:
        return None
    if total > 0:
        return 'Positive'
    if total < 0:
        return 'Negative'
    return 'Neutral'

In [20]:
def measure(output, n, aspects):
    """Label, predict and score the analysed reviews, then write them to CSV.

    Parameters
    ----------
    output : pandas.DataFrame
        Needs the columns 'review/score' (numeric, or text such as '4.0') and
        'scores'. The frame is modified in place.
    n, aspects : int
        Only used to build the file name 'output/output(n<n>_a<aspects>).csv'.

    Returns
    -------
    pandas.DataFrame
        ``output`` with the added columns 'label' (from the rating: above 3
        Positive, exactly 3 Neutral, below 3 Negative), 'prediction' (from
        ``predict``) and 'correct' (1 where the two agree, else 0).
    """
    # format and measure output and print to csv
    # Going through float accepts '4.0' strings and already-numeric ratings
    # alike, so the function can be re-run on a frame it has already converted.
    output['review/score'] = output['review/score'].astype(float).astype(int)
    output['label'] = np.where(output['review/score'] >= 3, 'Positive', 'Negative')
    output['label'] = np.where(output['review/score'] == 3, 'Neutral', output['label'])
    try:
        output['scores'] = output['scores'].apply(get_aspects)
    except Exception:
        # get_aspects fails on rows that are already flattened; keep them as-is
        print('aspects already found!')
    output['prediction'] = output['scores'].apply(predict)
    output['correct'] = np.where(output['label'] == output['prediction'], 1, 0)
    # to_csv does not create missing folders
    os.makedirs('output', exist_ok=True)
    output.to_csv('output/output(n' + str(n) + '_a' + str(aspects) + ').csv')
    return output

In [1]:
# Repeat the analysis once per aspect limit; measure() writes one CSV per run.
for a in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]:
    print('n =', n, 'a =', a)
    # pass the limit of THIS run - the module-level `aspects` would make every
    # run identical while the files are still labelled with different `a`
    output = absa(tokenized_data = tokenized_data, aspects = a, n = n)
    measure(output, n, a)
print('done!')

NameError: name 'n' is not defined

In [None]:
# read output into comparative time_series
# Result files are named 'output(n<rows>_a<aspects>).csv'. The aspect limit is read
# from the file name itself: os.listdir() returns entries in arbitrary order, so a
# running counter cannot be relied on to match the file being read (and it can never
# produce the a=15 / a=20 labels).
# NOTE(review): files with the same aspect limit but a different n overwrite each
# other here - keep a single n per output folder.
run_durations = {}
for filename in os.listdir('output/'):
    name_match = re.fullmatch(r'output\(n(\d+)_a(\d+)\)\.csv', filename)
    if name_match is None:
        # not a result file (e.g. a checkpoint folder) - skip it
        continue
    output_file = pd.read_csv('output/' + filename)
    # cumulative seconds spent after each review
    run_durations[int(name_match.group(2))] = output_file['duration'].cumsum()

# one column per aspect limit, in numeric order (a=1, a=2, ..., a=20)
time_series = pd.DataFrame({'a=' + str(limit): run_durations[limit] for limit in sorted(run_durations)})



In [None]:
# show the cumulative-duration table built in the previous cell
time_series

In [None]:
# Time series of daily reviews
fig, ax = plt.subplots(figsize=(10, 5))
# one line per column of time_series (cumulative duration of one run)
lines = ax.plot(time_series)
# name every line after its column; without a legend the runs cannot be told apart
ax.legend(lines, list(time_series.columns), title='Aspect limit')
ax.set_ylabel('Duration')
# NOTE(review): the x values are row positions of the processed reviews, which equal
# percentages only when exactly 100 reviews were processed - confirm the label.
ax.set_xlabel('Percentage of dataset processed')
ax.set_title('Duration for different number of aspects')