In [None]:
"""libraries to install for running"""
# !pip install ipywidgets                   # Used to select different product/bank combinations to visualize
# !pip install textacy                      # Used to quickly convert large document corpus to spacy documents
# !pip install selenium                     # Used to scrape wallethub reviews

In [36]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import  Options
import time
import datetime
from random import uniform, choice

from collections import Counter
from wordcloud import WordCloud

import pandas as pd
import numpy as np
import pathlib
import re
import ipywidgets as widgets
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import textacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from spacy import displacy

# Data Sourcing - Scraping
Methods used to scrape WalletHub review data for each bank.

**Note:** WalletHub limits the number of requests allowed to its website, and exceeding that limit will get your IP address blocked. Proxy IP addresses can be used to work around this; even so, scraping was a time-consuming process that took several days because of these blocks.


## Helper Functions

In [2]:
def get_proxies()-> list:
    """Scrape a list of free IP proxies that can be used to avoid scraping blocks.

    Downloads the proxy table published at us-proxy.org and keeps only the
    rows whose 7th cell is 'yes' (HTTPS supported) and whose 5th cell is
    'elite proxy'.

    Returns:
        list: proxies formatted as "ip:port" strings. Empty when the page has
            no table or no row qualifies.

    Raises:
        requests.RequestException: if the listing cannot be downloaded
            (connection error, timeout or HTTP error status).
    """
    url = "https://www.us-proxy.org//"
    # a timeout keeps the notebook from hanging on an unresponsive host
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        # page layout changed or an error page was served
        return []
    proxies = []
    for row in table.find_all("tr"):
        tds = row.find_all("td")
        # header rows (th cells) and malformed rows do not have enough cells
        if len(tds) < 7:
            continue
        # check allows https and is elite class
        if tds[6].text == 'yes' and tds[4].text == 'elite proxy':
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            proxies.append(f"{ip}:{port}")
    return proxies

def select_proxy(proxies:list) -> str:
    """Pick one proxy at random from the candidates.

    Args:
        proxies (list): candidate proxies to pick from

    Returns:
        str: the randomly selected proxy
    """
    return choice(proxies)

In [5]:
def create_driver_session(bank:str="Wells Fargo",headless:bool=True, max_attempts:int=5) -> object:
    """Creates a Chrome driver routed through a working free proxy.

    Each attempt picks a random proxy, starts Chrome with it and loads the
    WalletHub profile page of `bank` (built from the module-level `Bank_urls`
    mapping). The attempt succeeds when the bank name appears in the page
    title. Failed drivers are closed before the next attempt, and retries are
    always shown in a visible browser window (as the original retry did).

    Args:
        bank (str, optional): Bank whose page is used to validate the proxy.
            Must be a key of `Bank_urls`. Defaults to "Wells Fargo".
        headless (bool, optional): Whether to hide the Chrome window on the
            first attempt. Defaults to True.
        max_attempts (int, optional): Maximum number of proxies to try before
            giving up. Defaults to 5.

    Returns:
        object: a live driver session that reached the bank's review page

    Raises:
        RuntimeError: if no working proxy is found within `max_attempts`.
    """
    # validate against the page of the requested bank instead of relying on
    # whatever the notebook-level `url` / `BANK` variables currently hold
    test_url = "https://wallethub.com/profile/" + Bank_urls[bank]
    for attempt in range(1, max_attempts + 1):
        # Get Free Proxy
        proxy = select_proxy(get_proxies()) # IP:PORT or HOST:PORT
        print('Proxy IP', proxy)
        # Set Proxy address to fetch new data
        options = Options()
        options.add_argument('--proxy-server=%s' % proxy)
        options.headless = headless
        # Create driver
        driver = webdriver.Chrome(DRIVER_PATH, options=options)
        try:
            driver.implicitly_wait(10)
            driver.get(test_url)
            if bank in driver.title:
                return driver
            print("Did not reach review page - blocked")
        except Exception as e:
            print('Driver Creation Failed:', e)
        # this proxy did not work: release the browser before trying another
        driver.quit()
        headless = False
    raise RuntimeError(f"No working proxy found after {max_attempts} attempts")

In [8]:
def get_html_file(url:str, driver:object, page:int=None) -> object:
    """Fetches an html page of reviews after a short random pause.

    Args:
        url (str): path for html
        driver (object): driver used to load the page
        page (int, optional): page number of reviews, appended to the url as
            `?p=<page>`. Integers and numeric strings are both accepted.
            Defaults to None (no page parameter).

    Returns:
        BeautifulSoup: soup of html data used for parsing
    """
    if page is not None:
        # str() so that an int page number (as annotated) does not raise TypeError
        url += "?p=" + str(page)
    # inserting random time lapse to try to prevent bot detection
    random_value = uniform(1.5,4.0)
    print('Waited', random_value, 'seconds')
    time.sleep(random_value)
    # fetch website
    driver.get(url)
    # page_source is already decoded text, so no from_encoding hint is passed
    # (BeautifulSoup ignores it for str markup and only emits a warning)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return soup

In [9]:
def fetch_page_reviews(html:object, df:pd.DataFrame) -> pd.DataFrame:
    """Extracts every review found in a page and appends them to a dataframe.

    Each element with class `rvtab-citem` becomes one row holding, in order:
    date, author name, nickname (first character dropped), verified flag,
    product, review_comment flag, star count and review text. Items without a
    product category are flagged with `review_comment=True` and `product=None`.

    Args:
        html (BeautifulSoup): Html soup used for extraction
        df (pd.DataFrame): dataframe the rows are appended to; its columns
            define the column names of the new rows (8 expected)

    Returns:
        pd.DataFrame: `df` followed by the rows extracted from the page
            (index renumbered); `df` itself when the page has no reviews
    """
    rows = []
    for review in html.find_all(class_='rvtab-citem'):
        # Name & UserName
        author_name = review.find(class_='rvtab-ci-name').text.strip()
        nickname = review.find(class_='rvtab-ci-nickname').text[1:]
        # Date of Review
        date = review.find('time')['datetime']
        verified = review.find(class_='rvtab-ci-verified') is not None
        # Count number of stars: filled stars are drawn with this fill colour
        stars = sum(1 for star in review.find_all('path') if star.get('fill') == '#4ae0e1')
        # Product Reviewed
        category = review.find(class_='rvtab-ci-category')
        if category is not None:
            product = category.text.replace('Product:','').strip()
            review_comment = False
        else:
            # prints comments that don't belong to specific product, probably comments to previous review
            print(author_name, nickname, date, verified, stars)
            product = None
            review_comment = True
        # Review Comment
        comment = review.find(class_='rvtab-ci-content').text
        rows.append([date, author_name, nickname, verified, product, review_comment, stars, comment])
    if not rows:
        return df
    # build the page frame once instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
    page_df = pd.DataFrame(rows, columns=df.columns)
    if df.empty:
        return page_df
    return pd.concat([df, page_df], ignore_index=True)

## Main Function: Scraper

In [10]:
def scrape_reviews(url:str, driver:object, start_page:int=None, data:pd.DataFrame=None) -> tuple[pd.DataFrame, str]:
    """Main function used to scrape all the review pages of a bank webpage.

    Pages are fetched one by one until the last page is passed. A failed fetch
    (exception), a blocked page or a page without reviews counts as a failed
    attempt; after more than 3 failed attempts the function stops and returns
    what was gathered so far together with the page it stopped at, so the run
    can be resumed later.

    Args:
        url (str): path for website html
        driver (object): chromedriver used for extraction
        start_page (int, optional): page of reviews to start from. Defaults to None (first page reported by the site).
        data (pd.DataFrame, optional): dataframe the reviews are appended to. Defaults to None (a new empty frame).

    Returns:
        tuple[pd.DataFrame, str]: dataframe with the review data scraped and
            the page number (as a string) that should be fetched next
    """
    if data is None:
        # Create DataFrame for storage
        col_names = ['date', 'name', 'user_id', 'verified', 'product', 'review_comment', 'stars', 'review']
        data = pd.DataFrame(columns=col_names)
    # attempts counter
    attempts = 0
    # fetch starting page
    html = get_html_file(url, driver)
    # get page positions; the 1st and 3rd whitespace-separated tokens are read
    # as current and last page (presumably text like "1 of 25" - verify)
    page_position = html.find(class_='rvtab-pag-pos').text
    if start_page is None:
        current_page = page_position.split()[0]
    else:
        current_page = str(start_page)
    last_page = page_position.split()[2]
    while int(current_page) <= int(last_page):
        print(current_page)
        try:
            html = get_html_file(url, driver, current_page)
        except Exception as e:
            if attempts > 3:
                break
            print('Exception trigerred:', e)
            driver = create_driver_session(headless=False)
            attempts += 1
            # retry the same page with the new driver; without this the stale
            # `html` of the previous page would be parsed (and stored) again
            continue
        # check comment review exists in page - else bot may be blocked
        if html.find_all(class_='rvtab-citem'):
            # add page reviews
            data = fetch_page_reviews(html, data)
            # get new page number
            current_page = html.find(class_='rvtab-pag-pos').text.split()[0]
            # increase page
            current_page = str(int(current_page) + 1)
            continue
        if html.title is not None and html.title.text == 'IP Block':
            print('No more comments retrieved - Bot Blocked')
        else:
            print('No reviews found on page', current_page)
        # every page without reviews counts as a failed attempt, otherwise an
        # unexpected page would keep the loop running forever
        if attempts > 3:
            break
        attempts += 1
    driver.close()
    return data, current_page

## Dictionary of Banks & Chrome Driver

In [11]:
# Dictionary mapping each bank name to the suffix of its WalletHub profile url
# (the scraping cell builds the url as "https://wallethub.com/profile/" + suffix)
Bank_urls = {
    "Wells Fargo":"wells-fargo-13007950i",
    "Bank of America":"bank-of-america-13000450i",
    "Chase": "chase-13001251i",
    "Citi": "citibank-13001291i",
    "Capital One": "capital-one-13001087i",
    "TD Bank": "td-bank-13006307i",
    "PNC":"pnc-13005045i",
    "U.S. Bank":"us-bank-13007637i"}


# Path to the ChromeDriver executable passed to webdriver.Chrome
# NOTE(review): absolute, machine-specific path - must be edited before running elsewhere
DRIVER_PATH = "/Users/jesidacosta/OneDrive - University of South Florida/ISM6930/group_project/chromedriver"

In [12]:
# Create an empty DataFrame for storage, one column per scraped review field
# (same column list that scrape_reviews uses when it is not given a frame)
col_names = ['date', 'name', 'user_id', 'verified', 'product', 'review_comment', 'stars', 'review']
data = pd.DataFrame(columns=col_names)

## Create Driver

In [None]:
# Start driver with a proxy IP for scraping
# NOTE(review): BANK is only assigned in the "Select Bank to Scrape" cell further down,
# so on a fresh kernel that cell has to be run before this one
driver = create_driver_session(bank=BANK,headless=False)

# Scraping Manually
Run the scraping function for each bank under supervision.

If the IP gets blocked before the scrape finishes:
1. Create a new driver that is working.
2. Set the starting page to the last page returned by the scrape function.

In [None]:
# Select Bank to Scrape -- must match one of the keys of the Bank_urls dictionary
BANK = "Chase"
# Page number to start from -- None starts at the first page; after an
# interrupted run set it to the `page` value returned by scrape_reviews
PAGE = None
# Create Url path for extraction
url = "https://wallethub.com/profile/" + Bank_urls[BANK]

# Run scraper. When resuming (PAGE is set) the reviews gathered so far are
# passed back in so they are extended instead of being replaced by a new frame
previous_data = data if PAGE is not None else None
data, page = scrape_reviews(url=url, driver=driver, start_page=PAGE, data=previous_data)

## Save Data 
Saves into a data folder

In [161]:
# Get date
DATE = datetime.datetime.now().date()
# Make sure the output folder exists, otherwise to_json raises an error
pathlib.Path('./data').mkdir(parents=True, exist_ok=True)
# Save Document as JSON file (one file per bank and scrape date)
data.to_json(f'./data/{BANK}_wallethub_reviews_{DATE}.json', orient='records')

# Create Unified Dataframe
Reads every JSON file of reviews stored in the data folder and combines them into a single dataframe.

In [59]:
# Create Dataframe
columns =['date', 'name', 'user_id', 'verified', 'product', 'review_comment','stars', 'review']
df = pd.DataFrame(columns=columns)
# Read every JSON file in ./data (sorted so the row order is reproducible)
# and combine them in one call; DataFrame.append no longer exists in pandas 2.x
review_frames = [pd.read_json(path) for path in sorted((pathlib.Path.cwd() / 'data').glob('*.json'))]
if review_frames:
    df = pd.concat([df, *review_frames])

# START FROM HERE IF NOT SCRAPING DATA

In [61]:
# Read data from the CSV provided; this replaces any `df` built from the JSON files above
df = pd.read_csv('all_bank_reviews.csv')

# Cleaning Data

## Removing non-relevant samples

In [15]:
"""Removing comments posted about reviews and or content posted by Site"""
rows_before = df.shape[0]
print('Size before {}'.format(rows_before))
# keep rows that were not posted by the site itself and are not replies to a review
is_customer_review = df.name.ne('WalletHub') & df.review_comment.ne(True)
df = df[is_customer_review].reset_index(drop=True)
rows_after = df.shape[0]
print('Size after {}'.format(rows_after))

Size before 24732
Size after 23583


## Assigning Bank Labels

In [16]:
banks = ['Wells Fargo', 'Bank of America', 'Citibank', 'Chase', 
         'PNC', 'TD Bank', 'Capital One', 'U.S. Bank']

# If the product mentions any of the banks selected set bank label for each review
for bank in banks:
    # regex=False: match the name literally (the dots in 'U.S. Bank' are not wildcards)
    # na=False: rows without a product get no label instead of breaking the boolean mask
    mentions_bank = df['product'].str.contains(bank, regex=False, na=False)
    df.loc[mentions_bank, 'bank'] = bank
df.head()

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review,bank
0,2021-09-08,katelyn,katelyn_leifert,True,PNC Credit Cards,False,5,The PNC cash rewards card is a great first cre...,PNC
1,2021-09-07,Jessica K,jessicak8652,False,PNC Mortgages,False,1,Stay as far away from this lender as possible....,PNC
2,2021-09-06,Doris,dorish_33,True,PNC Credit Cards,False,5,I like everything about it don't want to chang...,PNC
3,2021-09-02,Virgil,virgilw_7,True,PNC Credit Cards,False,5,Good card....0 percent interest for first year...,PNC
4,2021-09-02,shloymie,shloymie,False,PNC Business Services,False,2,Local branch very Hard to reach by phone. Gene...,PNC


## Cleaning Product Categories
Removes the names of the banks in the product name - standardizes main products

In [17]:
# Removing bank names (each followed by a space) from product column.
# Names are escaped so they match literally, and the vectorised str.replace
# leaves missing products untouched instead of raising like re.sub would.
pattern = re.compile("|".join(re.escape(bank) + " " for bank in banks))
df['product'] = df['product'].str.replace(pattern, "", regex=True)

In [18]:
# See the number of reviews per product category (after bank names were stripped)
df['product'].value_counts()

Credit Cards         17193
Checking              3965
Car Loans              758
Savings & CDs          663
Mortgages              389
Business Services      262
Prepaid Cards          105
Personal Loans          87
Home Equity             49
Savings                 48
Student Loans           33
Investments             22
CDs                      7
CD Rates                 1
Conventional             1
Name: product, dtype: int64

## Combining main product categories
Combines related product categories together

In [19]:

def convert_to_product(old_product:str, new_product:str) -> None:
    """Relabels every review of one product category as another category.

    Works in place on the notebook-level `df` and prints the row counts of
    both categories before and after the change.

    Args:
        old_product (str): category to be relabelled
        new_product (str): new or already existing category that receives the rows
    """
    def count_rows(category):
        # number of reviews currently labelled with `category`
        return df[df['product'].eq(category)].shape[0]

    rows, rows_new = count_rows(old_product), count_rows(new_product)
    print(f'Before -> Rows {rows} in {old_product}, {rows_new} in {new_product}')

    # move to new category
    to_move = df['product'].eq(old_product)
    df.loc[to_move, 'product'] = new_product

    rows, rows_new = count_rows(old_product), count_rows(new_product)
    print(f'After  -> Rows {rows} in {old_product}, {rows_new} in {new_product}')

# Merge the small categories into their main category, in this order:
# Conventional -> Credit Cards, then CDs, CD Rates and Savings -> Savings & CDs
category_moves = [
    ('Conventional', 'Credit Cards'),
    ('CDs', 'Savings & CDs'),
    ('CD Rates', 'Savings & CDs'),
    ('Savings', 'Savings & CDs'),
]
for old_category, new_category in category_moves:
    convert_to_product(old_category, new_category)

Before -> Rows 1 in Conventional, 17193 in Credit Cards
After  -> Rows 0 in Conventional, 17194 in Credit Cards
Before -> Rows 7 in CDs, 663 in Savings & CDs
After  -> Rows 0 in CDs, 670 in Savings & CDs
Before -> Rows 1 in CD Rates, 670 in Savings & CDs
After  -> Rows 0 in CD Rates, 671 in Savings & CDs
Before -> Rows 48 in Savings, 671 in Savings & CDs
After  -> Rows 0 in Savings, 719 in Savings & CDs


## Removing small sample products

Removes products with 250 reviews or fewer

In [20]:
# Keep only product categories with more than 250 reviews (the comparison is
# strict, so a category with exactly 250 is dropped) --> 6 products remain
df = df.groupby('product').filter(lambda x: len(x) > 250)

# See new counts of remaining products
df['product'].value_counts()

Credit Cards         17194
Checking              3965
Car Loans              758
Savings & CDs          719
Mortgages              389
Business Services      262
Name: product, dtype: int64

## Adding year column

In [21]:
# Creating year variable of comment.
# to_datetime makes this work whether `date` already holds datetimes (JSON load)
# or plain strings (CSV load), where the .dt accessor alone would fail
df['year'] = pd.to_datetime(df['date']).dt.year

# Sub-sampling for Analysis

In [None]:
# Selecting small data slice for testing
SUB_SAMPLE = False

if SUB_SAMPLE:
    # fixed random_state so the same 20% slice is drawn on every run
    df = df.sample(frac=.20, random_state=42)

print('Data size: {} rows'.format(df.shape[0]))

Data size: 23287 rows


# Saving Text Corpus to Textacy Corpus
A Textacy Corpus allows us to quickly parse through multiple documents.

Uncomment the `create_corpus` call below to create a new corpus.

In [22]:
# Name of the spaCy pipeline used throughout the notebook
library = 'en_core_web_sm'

# Load the pipeline (spacy.load fails if it has not been installed)
nlp = spacy.load(library)

In [23]:
## function text to spacy/textacy corpus
def create_corpus(df:pd.DataFrame, library:str='en_core_web_sm') -> textacy.Corpus:
    """Creates a textacy corpus with one spacy doc per review and meta `{'date', 'year', 'bank', 'product', 'stars'}`

    The input dataframe is left untouched: the string version of `date`
    needed for serialization only exists in the metadata copy.

    Args:
        df (pd.DataFrame): dataframe with reviews information; must contain the
            columns `review`, `date`, `year`, `bank`, `product` and `stars`
        library (str, optional): spacy pipeline used for text processing. Defaults to 'en_core_web_sm'.

    Returns:
        textacy.Corpus: Set of spacy docs and meta
    """
    # organize records to format (review, {meta: 1, ...})
    columns = ['date', 'year', 'bank', 'product', 'stars']
    # converting date to string for serialization, on a copy so the caller's
    # dataframe keeps its original date column
    meta_df = df[columns].assign(date=df['date'].astype(str))
    meta = meta_df.to_dict(orient='records')
    # convert reviews series to list of reviews
    reviews = df['review'].tolist()
    # zip into records (review, {meta})
    records = list(zip(reviews, meta))
    # creating base corpus
    corpus = textacy.Corpus(lang=library)
    # adding records (batch and n_process speeds loading)
    corpus.add_records(records=records, batch_size=50, n_process=-1)
    return corpus

# creating corpus
# corpus = create_corpus(df=df, library=library)


## Saving

In [24]:
# Saving 
# corpus.save('./data/bank_reviews.bin.gz')

## Loading


In [25]:
# Loading - loading directly from bin file for faster execution.
# Later cells need `corpus` to exist, so it is always defined here: loaded
# from the saved file when present, otherwise rebuilt from the dataframe.
corpus_path = pathlib.Path('./data/bank_reviews.bin.gz')
if corpus_path.exists():
    corpus = textacy.Corpus.load(lang=library, filepath=corpus_path)
else:
    corpus = create_corpus(df=df, library=library)
print(corpus)

Corpus(23287 docs, 2495582 tokens)


In [26]:
# Function to return the cleaned words of a single document
def clean_words(doc:spacy.tokens.doc.Doc, excluded:list=[], pos='None') -> list:
    """Turns a spacy doc into a list of lower-cased lemmas.

    Punctuation, stop words, numbers (POS `NUM`) and tokens whose lemma is in
    `excluded` are dropped. When `pos` is anything other than the string
    'None', only tokens with exactly that part of speech are kept.

    Args:
        doc (spacy.tokens.doc.Doc): Spacy text doc
        excluded (list, optional): lemmas to leave out (compared before lower-casing). Defaults to [].
        pos (str, optional): Part of speech to select. Defaults to 'None' (all parts of speech).

    Returns:
        list: cleaned words
    """
    any_pos = pos == 'None'
    cleaned = []
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        if token.pos_ == 'NUM' or token.lemma_ in excluded:
            continue
        # conditional for POS type
        if not any_pos and token.pos_ != pos:
            continue
        cleaned.append(token.lemma_.lower())
    return cleaned
    

In [32]:
# Function to clean Documents for selected bank and product, extracting POS feature selected
# Example values; `bank` and `product` feed the example call at the end of this cell.
# `pos` is not passed to that call. NOTE(review): clean_words treats only the
# string 'None' as "all parts of speech", so passing this Python None would match no token.
pos = None
bank = 'Bank of America'
product = 'Business Services'

def clean_docs(product:str, bank:str='All', year='None', pos:str='None', corpus:textacy.Corpus=corpus) -> list:
    """Collects the cleaned words of every review matching a product, bank and year.

    Reviews are selected through their metadata, then each one is cleaned with
    `clean_words` using a list of excluded words made of generic banking terms,
    the words of the bank name and the words of the product name.

    Args:
        product (str): bank product to be cleaned
        bank (str, optional): bank selected. Defaults to 'All' (no bank restriction).
        year (str | int): year selected. Defaults to 'None' (no year restriction).
        pos (str, optional): part of speech selected. Defaults to 'None'.
        corpus (textacy.Corpus, optional): bank corpus to be cleaned. Defaults to
            the notebook-level corpus that exists when this function is defined.

    Returns:
        list: cleaned words for all documents of given bank-product
    """
    def is_selected(doc):
        # product always has to match; bank and year only when one was requested
        meta = doc._.meta
        if meta['product'] != product:
            return False
        if bank != 'All' and meta['bank'] != bank:
            return False
        if year != 'None' and meta['year'] != year:
            return False
        return True

    # generic words that carry no information in this domain
    excluded = [' ', 'bank', 'account', 'tell', 'year', 'day', '$','hold', 'know', 'go', 'time', 'month']
    # bank name words are excluded as written
    excluded.extend(bank.split())
    # product name words are lower-cased and lose trailing "s" characters,
    # except 'business' and 'services' which are kept whole
    for name in product.lower().split():
        if name in ('business', 'services'):
            excluded.append(name)
        else:
            excluded.append(name.rstrip('s'))
    if bank == 'Bank of America':
        excluded.append('boa')

    # clean every selected doc and pool the words
    words = []
    for doc in corpus.get(is_selected):
        words.extend(clean_words(doc, excluded=excluded, pos=pos))
    return words

# Example: first five cleaned words for the bank/product selected above (all parts of speech)
clean_docs(product, bank)[:5]

['bad', 'purchase', 'later', 'overdraw', 'night']

# Exploratory Data Analysis

The Bar Chart, Word Cloud and Time-Varying Most Common Words sections require their cells to be run in order to display the visualizations.

## Bar Chart: Top Words/Nouns/Adjs

Allows us to see the most common nouns and adjectives used for each product. We can use NOUNS as proxy for the related `aspects` and ADJECTIVES as proxy for `opinions`

In [33]:
# 1. create word count chart

# List of unique products remaining in the cleaned dataframe
products = df['product'].unique().tolist()

# Creating Dropdown menu with products ('Credit Cards' is preselected and
# therefore has to be one of the products)
product_dropdown = widgets.Dropdown(
    options= [prod for prod in products],
    value='Credit Cards',
    description='Bank Prod:')

# Creating Dropdown menu with banks, plus an 'All' entry that disables the bank filter
bank_dropdown  = widgets.Dropdown(
    options= banks + ['All'],
    value='All',
    description='Bank:')

# Function to update bar chart
def update_plot(bank:str, product:str, pos:str='NOUN', top_words:int=5) -> None:
    """Draws a bar chart of the most frequent words in the reviews of a bank-product.

    Args:
        bank (str): bank from drop down.
        product (str): selected product from drop down.
        pos (str): part of speech selected; 'None' counts every word. Defaults to 'NOUN'.
        top_words (int): number of words to show. Defaults to 5.
    """
    # count the cleaned words and keep the most frequent ones (word -> count)
    cleaned = clean_docs(product=product, bank=bank, pos=pos)
    top_counts = dict(Counter(cleaned).most_common(top_words))
    plt.figure(figsize=(16,8))
    plt.style.use("dark_background")
    sns.barplot(x=list(top_counts), y=list(top_counts.values()))
    # singular label used in the title and, pluralised, on the x axis
    label = 'Word' if pos == 'None' else pos.capitalize()
    plt.title(f'Top {top_words} {label}s for {product}')
    plt.ylabel('Counts')
    plt.xlabel(label + 's')
    plt.show()

# Shows interactive plot to select bank, product, part of speech and number of top words to plot
widgets.interactive(update_plot, bank=bank_dropdown, product=product_dropdown, pos=[ 'NOUN', 'ADJ','None'], top_words=5)

interactive(children=(Dropdown(description='Bank:', index=8, options=('Wells Fargo', 'Bank of America', 'Citib…

## Word Cloud for Words/Nouns/Adj

Allows us to see the most common nouns and adjectives used for each product. We can use NOUNS as proxy for the related `aspects` and ADJECTIVES as proxy for `opinions`

In [34]:
# Creating Dropdown menu with products (new widgets, replacing the ones
# created for the bar chart under the same names)
product_dropdown = widgets.Dropdown(
    options= products,
    value='Credit Cards',
    description='Bank Prod:')

# Creating Dropdown menu with banks, plus an 'All' entry that disables the bank filter
bank_dropdown  = widgets.Dropdown(
    options= banks + ['All'],
    value='All',
    description='Bank:')


def make_cloud(bank:str, product:str, pos:str='NOUN',  word_max:int=100) -> None:
    """Creates a word cloud for given bank, product and POS with a maximum number of words selected.

    When the selection has no words (e.g. a bank without reviews for the
    product) a message is printed instead, because WordCloud cannot be
    generated from empty text.

    Args:
        bank (str): bank selected
        product (str): product selected
        pos (str, optional): part of speech selected. Defaults to 'NOUN'.
        word_max (int, optional): maximum number of words to view. Defaults to 100.
    """
    words = clean_docs(product, bank, pos=pos)
    if not words:
        print(f'No words found for {bank}, {product}')
        return
    # WordCloud works on one text, so the cleaned words are joined back together
    cleaned_text = ' '.join(words)
    plt.figure(figsize=(16,8))
    wordcloud = WordCloud(background_color='white', max_words=word_max).generate(cleaned_text)
    plt.style.use("dark_background")
    name = pos.capitalize() + 's' if pos != 'None' else 'Words'
    plt.title(f'Most used {name}: {bank}, {product}', fontsize=32)
    plt.axis("off")
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()
    

# Interactive word cloud: pick bank, product, part of speech and maximum number of words
widgets.interact(make_cloud, bank=bank_dropdown, product=product_dropdown, pos=['NOUN', 'ADJ', 'None'], word_max=[20,50,100,200])
# -> Make these interactive

interactive(children=(Dropdown(description='Bank:', index=8, options=('Wells Fargo', 'Bank of America', 'Citib…

<function __main__.make_cloud(bank: str, product: str, pos: str = 'NOUN', word_max: int = 100) -> None>

## Time-Varying most common Noun
Which is the most common noun per year?

In [38]:
def plot_overtime(bank:str, product:str, pos:str='NOUN'):
    """Plots, for each year from 2011 to 2021, the most common word and its count.

    One bar is drawn per year that has at least one review word for the
    selection; the bar height is the count of that year's most common word
    and the word itself is written above the bar.

    Args:
        bank (str): bank selected ('All' for every bank)
        product (str): product selected
        pos (str, optional): part of speech selected; 'None' considers every word. Defaults to 'NOUN'.
    """
    years, counts, top_terms = [], [], []
    for year in range(2011, 2022):
        words = clean_docs(product=product, bank=bank, year=year, pos=pos)
        most_common = Counter(words).most_common(1)
        if not most_common:
            # no reviews for this selection in this year
            continue
        term, count = most_common[0]
        years.append(year)
        counts.append(count)
        top_terms.append(term)
    plt.figure(figsize=(16,8))
    # keep the bar containers so each bar can be measured
    bars = plt.bar(years, height=counts, width=.4)
    # one tick per plotted year
    plt.xticks(years, years)
    # plot name on top of each bar
    for bar, term in zip(bars, top_terms):
        plt.text(bar.get_x(), bar.get_height() + 5, term)
    # Set names & Titles
    name = pos.capitalize() + 's' if pos != 'None' else 'Words'
    plt.title(f'Top {name} for {product} Overtime')
    plt.ylabel('Count')
    # the x axis holds the years, not the words
    plt.xlabel('Year')
    plt.show()
    
# Interactive yearly chart; uses the bank/product dropdown widgets created for the word cloud
widgets.interact(plot_overtime, bank=bank_dropdown, product=product_dropdown, pos=['NOUN', 'ADJ', 'None'])

interactive(children=(Dropdown(description='Bank:', index=2, options=('Wells Fargo', 'Bank of America', 'Citib…

<function __main__.plot_overtime(bank: str, product: str, pos: str = 'NOUN')>

# Modeling
## Rule-Based Model

In [40]:
# Start the VADER analyzer used to score every "opinion aspect" phrase
# NOTE(review): presumably requires the nltk `vader_lexicon` resource to be downloaded - verify
analyzer = SentimentIntensityAnalyzer()

def add_compound(token:object, aspect:str) -> str:
    """Prefixes an aspect with the compound and relative-clause children of its token.

    The string '99999' is the "no aspect found" placeholder used by the rule
    extractor; it is returned untouched so that a placeholder is never turned
    into a fake aspect such as 'charged 99999'.

    Args:
        token (object): spacy word whose children are inspected
        aspect (str): already identified aspect text to be extended

    Returns:
        str: aspect with one prefix per `compound` / `relcl` child, or the
            unchanged placeholder
    """
    if aspect == '99999':
        return aspect
    for child in token.children:
        # compound nouns ("credit card") and relative clause modifiers are
        # both added as a prefix of the aspect
        if child.dep_ in ('compound', 'relcl'):
            aspect = child.norm_ + " " + aspect
    return aspect

def add_negative(token:object, modifier:str) ->str:
    """Prefixes a modifier with a negation when the token expresses one.

    Two cases are recognised: a token with dependency `neg` (its normalised
    text is used, with "n't" spelled out as "not") and the determiner "no".
    The placeholder modifier '99999' is never changed.

    Args:
        token (object): token to be examined for negation
        modifier (str): already identified modifier to be extended

    Returns:
        str: negated modifier if applicable, otherwise the modifier unchanged
    """
    # nothing to negate while no modifier has been found
    if modifier == "99999":
        return modifier
    if token.dep_ == "neg":
        prefix = "not" if token.norm_ == "n't" else token.norm_
        modifier = f"{prefix} {modifier}"
    if token.dep_ == "det" and token.norm_ == 'no':
        modifier = f"no {modifier}"
    return modifier


def get_aspect_level_sentiments(doc:object) -> list[tuple]:
    """Main function used to extract aspect nouns and score the sentiment of each opinion-aspect phrase.

        - Rule 1: Opinion is `amod` and Aspect is its head

        - Rule 2: Aspect is `nsubj`, Opinion is `acomp`

        - Rule 3: Aspect is `nsubj`, Opinion is `attr`

        - Rule 4: Aspect is `nsubj` or `nsubjpass`, Opinion is `advmod`

        - Rule 5: Aspect is `dobj` && `NOUN`, Opinion is `advmod`

    Every rule scans all tokens of the review. A pair is recorded when both an
    aspect and an opinion were found, and scored with the notebook-level VADER
    `analyzer` on the text "<opinion> <aspect>". In rules 2-5 negations are
    applied after all children of a token were inspected, so that a negation
    standing before the opinion word ("is not good") is not lost.

    Args:
        doc (object): review to be examined

    Returns:
        list[tuple]: tuples (Aspect, Opinion, SentimentScore, RuleNumber);
            pairs with a compound score of exactly 0 are left out
    """
    # placeholder meaning "not found"; same value the helper functions test for
    UNSET = "99999"
    rule_pairs = []

    def record(aspect, opinion, rule):
        # keep the pair only when both parts were found
        if aspect != UNSET and opinion != UNSET:
            score = analyzer.polarity_scores(opinion + " " + aspect)['compound']
            rule_pairs.append((aspect, opinion, score, rule))

    ## FIRST RULE OF DEPENDENCY PARSE - adjectival modifier and the noun it modifies
    for token in doc:
        A = UNSET               # aspect
        O = UNSET               # opinion
        if token.dep_ == "amod" and not token.is_stop:
            O = token.norm_
            A = token.head.norm_
            if token.head.dep_ == 'dobj':
                # the noun is a direct object: add its governing verb(s) to the opinion
                if token.head.head.dep_ == 'advcl':
                    O = token.head.head.head.norm_ + " " + token.head.head.norm_ + " " + O
                else:
                    O = token.head.head.norm_ + " " + O
            # negations attached to the head of the noun, or one level below
            for child_m in token.head.head.children:
                O = add_negative(child_m, O)
                for child_m2 in child_m.children:
                    O = add_negative(child_m2, O)

            # adverbial modifiers (most refreshing lotion)
            for child_m in token.children:
                if child_m.dep_ == "advmod":
                    O = child_m.norm_ + " " + O
                O = add_negative(child_m, O)

            A = add_compound(token.head, A)

        record(A, O, 1)

    ## SECOND RULE OF DEPENDENCY PARSE - subject and adjectival complement
    for token in doc:
        A = UNSET
        O = UNSET
        children = list(token.children)
        for child in children:
            if child.dep_ == "nsubj" and not child.is_stop:
                A = child.norm_
                for child_two in child.children:
                    if child_two.dep_ == "compound":
                        A = child_two.norm_ + " " + A

            if child.dep_ == "acomp" and not child.is_stop:
                O = child.norm_
                for child_two in child.children:
                    # an adverb intensifies the adjective; other children of
                    # the adjective must not discard that prefix again
                    if child_two.dep_ == 'advmod':
                        O = child_two.norm_ + " " + child.norm_
        for child in children:
            O = add_negative(child, O)

        record(A, O, 2)

    ## THIRD RULE OF DEPENDENCY PARSE - subject and attribute
    for token in doc:
        A = UNSET
        O = UNSET
        children = list(token.children)
        for child in children:
            if child.dep_ == "nsubj" and not child.is_stop:
                A = child.norm_

            if child.dep_ == "attr" and not child.is_stop:
                O = child.norm_
        for child in children:
            O = add_negative(child, O)

        record(A, O, 3)

    ## FOURTH RULE OF DEPENDENCY PARSE - (passive) subject and adverbial modifier
    # Assumption - A verb will have only one NSUBJ and DOBJ
    for token in doc:
        A = UNSET
        O = UNSET
        children = list(token.children)
        for child in children:
            if (child.dep_ == "nsubjpass" or child.dep_ == "nsubj") and not child.is_stop:
                A = child.norm_

            if child.dep_ == "advmod" and not child.is_stop:
                O = child.norm_
                for child_m in child.children:
                    if child_m.dep_ == "advmod":
                        O = child_m.norm_ + " " + child.norm_
        for child in children:
            O = add_negative(child, O)

        record(A, O, 4)

    ## FIFTH RULE - noun direct object and its adverb
    for token in doc:
        A = UNSET
        O = UNSET
        # Adding aspect (with compound prefix); only a real aspect is extended
        # so the placeholder can never end up inside an aspect
        if token.dep_ == 'dobj' and token.pos_ == 'NOUN':
            A = add_compound(token=token, aspect=token.norm_)
        children = list(token.children)
        # Get opinion if matching dep and pos (exact comparison, not substring)
        for child in children:
            if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                O = child.norm_
        for child in children:
            O = add_negative(child, O)

        record(A, O, 5)

    # Removing pairs that do not have sentiment
    return [pair for pair in rule_pairs if pair[2] != 0]


### Predicting ABSA values

In [42]:
def predict_absa(corpus:textacy.Corpus) -> list[set]:
    """Run aspect-based sentiment extraction over every review in a corpus.

    Each document is handed to get_aspect_level_sentiments and the per-document
    outputs are collected in corpus order. The number of processed documents is
    printed as a simple progress confirmation.

    Args:
        corpus (textacy.Corpus): corpus of reviews to make predictions for

    Returns:
        list[set]: one entry per document, each holding the ABSA results for that
            document in format {Aspect, Modifier, SentimentScore, RuleNumber}
    """
    per_doc_pairs = [get_aspect_level_sentiments(review) for review in corpus]
    print('Fitted docs:',len(per_doc_pairs))
    return per_doc_pairs

# Extract the aspect/opinion pairs for every review in the corpus (one list entry per document)
results = predict_absa(corpus)

Fitted docs: 23287


# Selecting Sample for Evaluation
We selected a sample of 30 reviews whose aspects we extracted and scored manually.

The format of the file is as follows:

Index: Review
- list of {Aspect, Opinion, SentimentScore, RuleNumber}

In [49]:
# The evaluation sample was originally drawn with:
#     np.random.seed(42)
#     choices = np.random.choice(range(len(corpus)),size=30)
# Seed did not work in subsequent runs so we manually selected by index for reproduction
choices = [1603, 3226, 13381, 3189, 17654, 19797, 19639, 6077, 5168, 
           7417, 22421, 21576, 428, 501, 1201, 14108, 8679, 22716, 
           9876, 367, 20665, 16677, 15017, 167, 5339, 22228, 16510, 2789, 7403, 12701]

# Saving outputs to text document for manual scoring.
# The encoding is explicit so review text containing non-ASCII characters is written
# the same way on every platform instead of depending on the locale default.
with open("sample_evaluation.txt", "w", encoding="utf-8") as out_file:
    out_file.write('Evaluation Results\n')
    # The loop variable is deliberately not called `choice`: that name is bound to
    # random.choice by the import cell, and rebinding it here would break any later
    # re-run of cells that call it.
    for doc_idx in choices:
        # One line with the index and review text, followed by the extracted pairs
        out_file.write(f"{doc_idx}: {corpus[doc_idx]} \n")
        out_file.write(f"\t {get_aspect_level_sentiments(corpus[doc_idx])} \n\n")

# Map Aspect Based Sentiments to dictionary of categories
We map the key aspect words captured by the algorithm to the main aspect categories: Customer Service, Interest Rates, etc.

1. The `aspects` dictionary should contain all the hardcoded key words that identify each of the main categories.
2. The function aggregates the sentiment scores of each record by category and stores them in a dictionary.

In [50]:
# Key words identifying main aspect categories manually extracted.
# Maps each main category name to the list of key words that assign an extracted
# aspect to that category.
# NOTE(review): aggregate_sentiment_scores looks these up one whitespace-separated
# token at a time, so the two-word entry 'live person' cannot match on its own -
# confirm whether 'person' (or phrase matching) was intended.
aspects = {
    'Credit Starter': ['repair', 'damaged', 'building'],
    'Customer Service': [
        'human', 'computerized', 'banker', 'refund', 'experience', 'they',
        'live person', 'talk', 'speaking', 'representative', 'manager', 'teller',
        'lady', 'customer', 'service', 'management', 'english', 'operation',
        'agent', 'staff',
    ],
    'Interest Rates': ['rate', 'interest', 'accrued'],
    'Online Banking Services': [
        'transactions', 'feature', 'website', 'bills', 'tool', 'monitor',
        'app', 'platform', 'online', 'menu', 'automatic', 'payment',
    ],
    'Rewards': ['bonus', 'offer', 'categorie', 'category', 'reward', 'point', 'mile', 'cash', 'back'],
    'Fees': ['fee', 'charge', 'charges', 'free', 'rebate', 'deal', 'discount'],
    'Security': ['fraud', 'dispute', 'security', 'seller'],
    'Retail Branch': ['branch', 'location', 'store', 'distance', 'branches', 'atm'],
}

In [51]:
## Create function that will create main category scores
def aggregate_sentiment_scores(results:list[set], aspects:dict) -> list[dict]:
    """Aggregates sentiment scores into specified aspect categories

    For every review, each extracted pair is assigned to the categories whose key
    words appear among the whitespace-separated tokens of the pair's aspect. A
    token matches a key word either as-is or after stripping trailing "s"
    characters, so both plural aspects ("fees" -> "fee") and key words that
    themselves end in "s" ("bonus", "transactions") are recognised. The scores of
    all pairs assigned to a category are summed; a single pair contributes at
    most once per category even when several of its tokens match (e.g. the
    compound aspect "customer service").

    Args:
        results (list[set]): list of ABSA results, one collection of
            (Aspect, Opinion, SentimentScore, RuleNumber) tuples per review
        aspects (dict): dictionary of categories and key words matching each category

    Returns:
        list[dict]: list of dictionary with an aggregate score per review category.
            Every dictionary has one key per category in `aspects` (same order);
            categories not mentioned in the review hold np.nan.
    """
    # `aspects` is loop-invariant: build the keyword sets once for O(1) lookups
    keyword_sets = {cat: set(words) for cat, words in aspects.items()}

    all_scores = list()
    for result in results:
        # Running totals for the categories that are actually mentioned in this review
        totals = {}
        for pair in result:
            aspect_tokens = pair[0].split()
            for cat, keywords in keyword_sets.items():
                # Try the raw token first so key words ending in "s" are not
                # destroyed by the plural stripping
                mentioned = any(
                    token in keywords or token.rstrip('s') in keywords
                    for token in aspect_tokens
                )
                if mentioned:
                    totals[cat] = totals.get(cat, 0) + pair[2]
        # Unmentioned categories are reported as NaN rather than 0 so that
        # "no opinion" stays distinguishable from a neutral total
        all_scores.append({cat: totals.get(cat, np.nan) for cat in keyword_sets})
    return all_scores

In [52]:
# Standardize aspect scores into main categories:
# one dictionary of category -> aggregated sentiment score per review in `results`
agg_scores = aggregate_sentiment_scores(results, aspects)

# Dataframe of Aspect Values
We create a master dataframe of review records and add the corresponding category scores to each record.

In [56]:
def create_df_from_spacy(corpus:textacy.Corpus, agg_scores:list[dict]) -> pd.DataFrame:
    """Creates a dataframe with the matching review, product, bank and aspect scores for each review. 
    These are later used to visualize overall scores for each bank and product

    Rows are paired by position: the i-th document in `corpus` is placed next to
    the i-th entry of `agg_scores`. The documents' metadata is only read, never
    modified.

    Args:
        corpus (textacy.Corpus): corpus of reviews; each document's `_.meta`
            dictionary supplies the record columns and its text is added as `text`
        agg_scores (list[dict]): aspect scores for main categories, one dictionary per review

    Returns:
        pd.DataFrame: dataframe of review with accompanying score
    """
    # Copy each metadata dict so that adding the text does not write back into the corpus
    records = [{**doc._.meta, 'text': doc.text} for doc in corpus]
    # create dataframe of records
    reviews_df = pd.DataFrame(records)
    # creating aggregate score dataframe
    scores_df = pd.DataFrame(agg_scores)
    # concatenating records and scores column-wise (both frames share the default positional index)
    return pd.concat([reviews_df, scores_df], axis=1)

# Concatenating reviews and scores: one row per review with its metadata, text and category scores
df = create_df_from_spacy(corpus, agg_scores)

In [None]:
# Saving dataframe to CSV so the review/score table can be used outside the notebook
# (the Next Steps section visualizes the category scores in PowerBI)
df.to_csv('final_output.csv')

# Next Steps 
Our final steps for evaluation were conducted in the following fashion:
1. We manually scored our aspect extraction method by comparing the algorithm's outputs against the manually annotated sample
2. We used PowerBI to visualize the overall aspect category scores and draw insights and recommendations from our analysis