# Implementing Rule-based Model

In [180]:
import numpy as np
import pandas as pd
import textacy
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from spacy import displacy

- universal dependencies: https://universaldependencies.org/en/dep/index.html
- aspect-based opinion mining: https://devopedia.org/aspect-based-opinion-mining
- Apect extration GT: https://achyutjoshi.github.io/aspect_extraction/aspectextraction
    - https://github.com/ishikaarora/Aspect-Sentiment-Analysis-on-Amazon-Reviews/blob/master/src/models/aspect_extraction.py
- https://www.researchgate.net/publication/327707096_Aspect_Extraction_Performance_with_POS_Tag_Pattern_of_Dependency_Relation_in_Aspect-based_Sentiment_Analysis


In [181]:
lang_module = 'en_core_web_sm'
corpus = textacy.Corpus.load(lang=lang_module, filepath='./data/bank_reviews.bin.gz')

Notes:
- This approach may not be context aware. When two different products are beind discussed the Aspect/Modifier may not point to the correct product.
- Implicit vs. Explicit, our rule-based approach can only deal with explicit. Implicit aspect-opinion is missed
- Attention on aspects can be missed if not directly connected by dependency
- Punctuation changes meanings of A/M -> line 14 "Good bank, great service" vs. without comma. Co-operation split into two "co" and "operation"
- Very restrictive, can not keep up with the many ways people can write text

-> biggest concern how to evaluate un-labeled data

# Writting dependency rules to extract Aspects & Modifiers

In [182]:
# Start analyzer
analyzer = SentimentIntensityAnalyzer()

In [223]:
ITEM = 3
doc = corpus[ITEM]

product_references = ['it','this','they','these']


def add_compound(token, aspect):
    for child in token.children:
        # if child is compound then add as prefix
        if child.dep_ == 'compound' and aspect != '99999':
            aspect = child.norm_ + " " + aspect
        # if child clause modifier add to Aspect
        if child.dep_ == 'relcl':
            aspect = child.norm_ + " " + aspect
    return aspect



# for i, doc in enumerate(corpus[:40]):

def get_aspect_level_sentiments(doc:object):
    # print(doc._.meta['product'])
    # print(i,doc)

    ## FIRST RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## RULE = M is child of A with a relationship of amod
    rule_pairs = []

    for token in doc:
        A = "99999"               # aspect
        M = "99999"               # modifier
        if token.dep_ == "amod" and not token.is_stop:
            # if token.pos_ in ()
            M = token.norm_
            A = token.head.norm_
            if token.head.dep_ == 'dobj':
                # print(token.head.head)
                if token.head.head.dep_ == 'advcl':
                    # print(token.head.head.head)
                    M = token.head.head.head.norm_ + " " + token.head.head.norm_ + " " + M
                else:
                    M = token.head.head.norm_ + " " + M

            # advervial modifiers (most refreshing lotion)
            M_children = token.children
            for child_m in M_children:
                if (child_m.dep_ == "advmod"):
                    M_hash = child_m.norm_
                    M = M_hash + " " + M
                    break

            # negation in adjective, the "no" keyword is a 'det' of the noun (e.g. no interesting characters)
            A = add_compound(token.head, A)
            # A_children = token.head.children
            # for child_a in A_children:
            #     # if child is compound then add as prefix or suffix(try to determine rule for this)
            #     if child_a.dep_ == 'compound':
            #         A = child_a.norm_ + " " + A
                
            #     # if child clause modifier add to Aspect
            #     if child_a.dep_ == 'relcl':
            #         A = child_a.norm_ + " " + A

                # # add negative
                # if(child_a.dep_ == "det" and child_a.norm_ == 'no'):
                #     neg_prefix = 'not'
                #     M = neg_prefix + " " + M
                #     break

        if(A != "99999" and M != "99999"):
            rule_pairs.append((A, M, analyzer.polarity_scores(M + " " + A)['compound'],1))



    ## TWO RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## Adjectival Complement - A is a child of something with relationship of nsubj, while
    ## M is a child of the same something with relationship of acomp
    ## Assumption - A verb will have only one NSUBJ and DOBJ
    ## "The sound of the speakers would be better. The sound of the speakers could be better" - handled using AUX dependency
    for token in doc:

        children = token.children
        A = "99999"
        M = "99999"
        add_neg_pfx = False
        for child in children :
            if(child.dep_ == "nsubj" and not child.is_stop):
                A = child.norm_
                # check_spelling(child.norm_)
                for child_two in child.children:
                    if child_two.dep_ == "compound":
                        A = child_two.norm_ + " " + A

            if(child.dep_ == "acomp" and not child.is_stop):
                children_two = child.children
                for child_two in children_two:
                    if child_two.dep_ == 'advmod':
                        M = child_two.norm_ + " " + child.norm_
                    else:
                        M = child.norm_

            # example - 'this could have been better' -> (this, not better)
            if(child.dep_ == "aux" and child.tag_ == "MD"):
                neg_prefix = "not"
                add_neg_pfx = True

            if(child.dep_ == "neg"):
                neg_prefix = child.norm_
                add_neg_pfx = True

        if (add_neg_pfx and M != "99999"):
            M = neg_prefix + " " + M
                #check_spelling(child.norm_)

        if(A != "99999" and M != "99999"):
            rule_pairs.append((A, M, analyzer.polarity_scores(M + " " + A)['compound'],2))


    ## THIRD RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## ATTR - link between a verb like 'be/seem/appear' and its complement
    ## Example: 'this is garbage' -> (this, garbage)
    for token in doc:
        children = token.children
        A = "99999"
        M = "99999"
        add_neg_pfx = False
        for child in children :
            if(child.dep_ == "nsubj" and not child.is_stop):
                A = child.lemma_
                # check_spelling(child.norm_)

            if((child.dep_ == "attr") and not child.is_stop):
                M = child.lemma_
                #check_spelling(child.norm_)

            if(child.dep_ == "neg"):
                neg_prefix = child.norm_
                add_neg_pfx = True

        if (add_neg_pfx and M != "99999"):
            M = neg_prefix + " " + M

        if(A != "99999" and M != "99999"):
            rule_pairs.append((A, M,analyzer.polarity_scores(M + " " + A)['compound'],3))        


    ## FOURTH RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect

    #Adverbial modifier to a passive verb - A is a child of something with relationship of nsubjpass, while
    # M is a child of the same something with relationship of advmod

    #Assumption - A verb will have only one NSUBJ and DOBJ
    for token in doc:


        children = token.children
        A = "99999"
        M = "99999"
        add_neg_pfx = False
        for child in children :
            if((child.dep_ == "nsubjpass" or child.dep_ == "nsubj") and not child.is_stop):
                A = child.norm_
                # check_spelling(child.norm_)

            if(child.dep_ == "advmod" and not child.is_stop):
                M = child.norm_
                M_children = child.children
                for child_m in M_children:
                    if(child_m.dep_ == "advmod"):
                        M_hash = child_m.norm_
                        M = M_hash + " " + child.norm_
                        break
                #check_spelling(child.norm_)

            if(child.dep_ == "neg"):
                neg_prefix = child.norm_
                add_neg_pfx = True

        if (add_neg_pfx and M != "99999"):
            M = neg_prefix + " " + M

        if(A != "99999" and M != "99999"):
            rule_pairs.append((A, M,analyzer.polarity_scores(M + " " + A)['compound'],4)) # )

    # FIFTH RULE 
    # A: Direct Object with NOUN type
    # M: Adj or Modifier of some sort
    # # Assumes that all direct objects with adjectives pointing may compose a A-M relationship
    for token in doc:
        A = "99999"               # aspect
        M = "99999"               # modifier
        # Adding aspect
        if token.dep_ == 'dobj' and token.pos_ == 'NOUN':
            A = token.norm_
        # Adding compound to aspect 
        A = add_compound(token=token, aspect=A)
        # Get modifier if matching dep and pos
        for child in token.children:
            if child.dep_ in ('advmod') and child.pos_ in ('ADV'):
                M = child.norm_

        if(A != "99999" and M != "99999"):
            rule_pairs.append((A, M, analyzer.polarity_scores(M + " " + A)['compound'],5))
    
    # Removing pairs that do not have sentiment
    rule_pairs = [(A,M,P,r) for (A,M,P,r) in rule_pairs if P != 0]
    # print(f'\t {rule_pairs}\n')
    return rule_pairs

# Example Row 0
# DONE -> Try adding case for CC "helped" (mod) build-credit (aspect) 

# Example Row 2
# DONE -> Neither should be captured [('credit card', 'new', 0.0, 1), ('future', 'near', 0.0, 1)] -> dismiss if 0 sentiment?

# Row 9
# DONE -> Misses cash back is a bonus 


# TODO:
# 1. IMPROVE ALGO
# Capture negatives correctly - line 30
# Row 10
# M:help, A:credit back-up
# Example Row 3
# Missed 0 percent interest, good for balance -> A: select obj type (add compound if present) M: head -> get adj 

results = list()
for i, doc in enumerate(corpus):
    aspects = get_aspect_level_sentiments(doc)
    results.append(aspects)
len(results)

23287

In [71]:
analyzer.polarity_scores('helped')['compound']

0.0

In [214]:
# Visualize an example of dependency
ITEM = 31
displacy.render(corpus[ITEM], style='dep', jupyter=True, options={'distance':120})

In [216]:
# doc = corpus[ITEM]
# # for doc in corpus[:1]:
# #     print(doc)
# for token in doc:
#     # if token.dep_ == 'ccomp':
#     print(token.text, token.pos_, token.norm_, token.tag_ ,token.dep_, [child.text for child in token.children], [parent.text for parent in token.ancestors])

## Sample for Evaluation

Checking
1603 They try to charge me $5 to replace my debit card that’s not working and my card keeps getting declined everywhere when I have the funds available in my account.
	 []

Credit Cards
3226 This was the first credit card that I have ever received. It was a pleasant surprise to actually be approved for the card in the first place seeing that I was a new college student with no prior credit, and to make matters worse it was shortly after the economic collapse and no one was really giving out credit.After receiving the card it was everything that I could have asked for. I was started on a basic 500$ limit, which was all that I was really looking for. I also did have a points based reward program which turned out to be very beneficial. Every experience with Bank of America has been positive, and there is really nothing negative that I could say.
	 [('approved surprise', 'pleasant', 0.802, 1), ('credit', 'prior', 0.3818, 1), ('collapse', 'economic', -0.4939, 1), ('turned reward pr

# MAP Aspect Based Sentiment to DICTIONARY 
mapping common words captured by algo to match main aspects: Customer service, Interest Rate etc. #.

1. The aspects dictionary should contain all the hardcoded words we should identify for the main categories given.

2. Func aggregates sentiment score per record and puts them into a dictionary

In [236]:
aspects = {
    'Credit Starter':['repair', 'damaged'],
    'Customer Service':['representative','manager', 'teller', 'banker', 'lady', 'customer', 'service', 'management', 'english', 'operations', 'agents'],
    'Interest Rates': ['rate', 'interest', 'accrued'],
    'Online Banking Services': ['bills', 'tool', 'monitor', 'app', 'platform', 'online', 'menu'],
    'Rewards': ['bonus', 'offers', 'categories', 'reward', 'rewards', 'points', 'miles', 'cash', 'back'],
    'Fees': ['fee', 'fees', 'charge', 'charges'],
    'Security': ['fraud', 'dispute', 'security', 'seller'],
    'Retail Branch': ['branch',  'location',  'store', 'distance']
}

In [221]:
aspects.items()

dict_items([('Credit Starter', ['repair', 'damaged']), ('Customer Service', ['representative', 'manager', 'teller', 'banker', 'lady', 'customer', 'service', 'management', 'english', 'operations', 'agents']), ('Interest Rates', ['rate', 'interest', 'accrued']), ('Online Banking Services', ['bills', 'tool', 'monitor', 'app', 'platform', 'online', 'menu']), ('Rewards', ['bonus', 'offers', 'categories', 'reward', 'rewards', 'points', 'miles', 'cash', 'back']), ('Fees', ['fee', 'fees', 'charge', 'charges']), ('Security', ['fraud', 'dispute', 'security', 'seller']), ('Retail Branch', ['branch', 'location', 'store', 'distance'])])

In [268]:
## Create function that will create main category scores
# if aspect falls into selected categories -extract score into records dictionary
sample = results[:500]


def aggregate_sentiment_scores(scores, aspects) -> list[dict]:
    all_scores = list()
    # TODO: what if multiple mentions same aspect?

    for result in results:
        scores = {
            'Credit Starter':np.nan,
            'Customer Service': np.nan,
            'Interest Rates': np.nan,
            'Online Banking Services': np.nan,
            'Rewards':np.nan,
            'Fees':np.nan,
            'Security':np.nan,
            'Retail Branch': np.nan
        }
        # get result pair
        for pair in result:
            item = pair[0]
            # for aspect word in item
            for word in item.split():
                # for overall aspect categories
                for cat, words in aspects.items():
                    # check if word in result pair is in aspect words
                    if word.strip('s') in words:
                        # add score if none existent
                        if scores[cat] is np.nan:
                            scores[cat] = 0
                        scores[cat] = scores.get(cat, 0) + pair[2]
        all_scores.append(scores)
    return all_scores

    
agg_scores = aggregate_sentiment_scores(results, aspects)

# CREATE MASTER DATAFRAME
We create a master dataframe of records and add their corresponding selected scores


| bank | product | date | text | selected aspect scores * |
-

In [253]:
corpus[0]._.meta

{'date': '2021-09-08',
 'year': 2021,
 'bank': 'PNC',
 'product': 'Credit Cards',
 'stars': 5}

In [259]:
def create_df_from_spacy(corpus):
    records = list()
    for doc in corpus:
        base = doc._.meta
        base['text'] = doc.text
        records.append(base)
    return records

df = create_df_from_spacy(corpus)

In [271]:
df = pd.DataFrame(df)
agg_scores = pd.DataFrame(agg_scores)

df = pd.concat([df, agg_scores], axis=1)

Our scoring function is really sparse out of 23,000 records only 6-2.6K values are filled

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23287 entries, 0 to 23286
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     23287 non-null  object 
 1   year                     23287 non-null  int64  
 2   bank                     23287 non-null  object 
 3   product                  23287 non-null  object 
 4   stars                    23287 non-null  int64  
 5   text                     23287 non-null  object 
 6   Credit Starter           6 non-null      float64
 7   Customer Service         2462 non-null   float64
 8   Interest Rates           2692 non-null   float64
 9   Online Banking Services  412 non-null    float64
 10  Rewards                  1748 non-null   float64
 11  Fees                     1490 non-null   float64
 12  Security                 116 non-null    float64
 13  Retail Branch            56 non-null     float64
dtypes: float64(8), int64(2

In [272]:
df

Unnamed: 0,date,year,bank,product,stars,text,Credit Starter,Customer Service,Interest Rates,Online Banking Services,Rewards,Fees,Security,Retail Branch
0,2021-09-08,2021,PNC,Credit Cards,5,The PNC cash rewards card is a great first cre...,,,,,,,,
1,2021-09-07,2021,PNC,Mortgages,1,Stay as far away from this lender as possible....,,,,,,,,
2,2021-09-06,2021,PNC,Credit Cards,5,I like everything about it don't want to chang...,,,,,,,,
3,2021-09-02,2021,PNC,Credit Cards,5,Good card....0 percent interest for first year...,,,,,,,,
4,2021-09-02,2021,PNC,Business Services,2,Local branch very Hard to reach by phone. Gene...,,,,,,,,-0.0552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23282,2012-06-04,2012,TD Bank,Checking,5,TD Bank is the absolute best bank that I have ...,,-0.1027,,,,,,
23283,2012-06-04,2012,TD Bank,Checking,5,I have a personal checking account at TD and h...,,,,,,,,
23284,2012-05-30,2012,TD Bank,Checking,5,I am a customer of TD Bank and I think they ar...,,,,,,,,
23285,2012-05-30,2012,TD Bank,Checking,5,I have been using TD Bank for two years now. I...,,,0.0516,,,-0.2732,,


In [277]:
df.iloc[23282].text

"TD Bank is the absolute best bank that I have ever done business with. The fact that it is the only major bank that is open until nine o clock and open seven days a week should tell you how much they care about their customers. It took me less than fifteen minutes to walk into a branch and start up my first account with them. The customer service is amazing and it is always a pleasure to talk to them. I'm a huge fan of the coin machine that is located in every branch. TD Bank knows how hard its customers work for their money and they make sure that every single red cent is held with the highest regard."

In [290]:
results[23282]

[('done bank', 'best', 0.6369, 1),
 ('fan', 'huge', 0.5574, 1),
 ('customers', 'how hard', -0.1027, 2)]

1. It appears that the algorithm needs to be improved to accurately represent the sentiment regarding each given aspect

2. words mapping needs to be improved to capture more sentiment scores, so that we can have better representation of scores

In [291]:
df.to_csv('example_output.csv')