In [1]:
from collections import Counter
from wordcloud import WordCloud

import pandas as pd
import numpy as np
import pathlib
import re
import ipywidgets as widgets
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import textacy

# Sourcing Data

In [2]:
# Directory holding the raw JSON review files, resolved against the current working directory.
# Path.cwd() is used instead of pathlib.os.getcwd(): `os` is not a documented attribute of pathlib.
pathlib.Path.cwd() / 'data'

PosixPath('/Users/jesidacosta/OneDrive - University of South Florida/ISM6930/group_project/data')

## Create Unified Dataframe

In [3]:
# Create Dataframe
columns =['date', 'name', 'user_id', 'verified', 'product', 'review_comment','stars', 'review']
# Read every JSON file in ./data; sorted so the row order is the same on every run
frames = [pd.read_json(path) for path in sorted((pathlib.Path.cwd() / 'data').glob('*.json'))]
# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on each iteration. Row labels of the individual files are kept, as append did.
df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
# Guarantee the declared columns exist and come first; any extra columns from the files follow
df = df.reindex(columns=columns + [col for col in df.columns if col not in columns])

In [4]:
# Summary statistics of the review length in characters
df['review'].str.len().describe()

count    24732.000000
mean       496.538937
std        407.003654
min          2.000000
25%        177.000000
50%        435.000000
75%        664.000000
max       8744.000000
Name: review, dtype: float64

# Cleaning Data

In [5]:
# Count the rows authored by the site itself (name == 'WalletHub') and preview the most recent ones
site_posts = df[df['name'].eq('WalletHub')]
print('Content from site: {}'.format(len(site_posts)))
site_posts.sort_values('date').tail()

Content from site: 70


Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
2,2016-09-28,WalletHub,WalletHub,False,TD Bank Credit Cards,False,4,The TD Bank Cash Visa Card is an average offer...
1,2020-04-20,WalletHub,WalletHub,False,,True,4,TD Bank Personal Loan ReviewTD Bank offers ine...
0,2020-04-20,WalletHub,WalletHub,False,PNC Personal Loans,False,5,PNC Personal Loan ReviewPNC personal loans hav...
1,2020-05-04,WalletHub,WalletHub,False,,True,4,Citibank Personal Loan ReviewCitibank personal...
0,2021-08-11,WalletHub,WalletHub,False,Wells Fargo Personal Loans,False,4,\nWells Fargo Personal Loan Review\n \nWells ...


In [6]:
# Rows flagged review_comment == True; count them and preview the most recent ones
replies = df[df['review_comment'].eq(True)]
print('Number of Replies: {}'.format(len(replies)))
replies.sort_values('date').tail()

Number of Replies: 1143


Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
1406,2021-08-31,lisa,LisaInCali,False,,True,0,Amex is the same....reports 3 mos after openin...
78,2021-08-31,Thomas Garrison,tagarrisonllc,False,,True,0,"Don't be surprised if they, without any commun..."
18,2021-09-01,Rocky Grand,ullas_idol,False,,True,1,Beware of Horrible disputed transactions suppo...
306,2021-09-02,whuser16449284,whuser16449284,False,,True,0,Mine has no annual fee. It is $0.00
306,2021-09-02,arniejonassen,arniejonassen,False,,True,0,"Same here. It said I had a $5,000 credit limi..."


## Removing non-relevant samples

In [7]:
"""Removing comments and or content posted by Site"""
print('Size before {}'.format(len(df)))
# Keep a row only when it is neither authored by the site nor flagged as a reply
is_site_post = df['name'].eq('WalletHub')
is_reply = df['review_comment'].eq(True)
df = df[~(is_site_post | is_reply)].reset_index(drop=True)
print('Size after {}'.format(len(df)))

Size before 24732
Size after 23583


## Assigning Bank Labels

In [8]:
banks = ['Wells Fargo', 'Bank of America', 'Citibank', 'Chase',
         'PNC', 'TD Bank', 'Capital One', 'U.S. Bank']

# Label each review with the bank whose name appears in its product string.
#   regex=False -> names are matched literally ('U.S. Bank' contains '.', a regex wildcard)
#   na=False    -> a missing product gives False instead of NaN, which .loc cannot index with
for bank in banks:
    df.loc[df['product'].str.contains(bank, regex=False, na=False), 'bank'] = bank
df.head()

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review,bank
0,2021-09-08,katelyn,katelyn_leifert,True,PNC Credit Cards,False,5,The PNC cash rewards card is a great first cre...,PNC
1,2021-09-07,Jessica K,jessicak8652,False,PNC Mortgages,False,1,Stay as far away from this lender as possible....,PNC
2,2021-09-06,Doris,dorish_33,True,PNC Credit Cards,False,5,I like everything about it don't want to chang...,PNC
3,2021-09-02,Virgil,virgilw_7,True,PNC Credit Cards,False,5,Good card....0 percent interest for first year...,PNC
4,2021-09-02,shloymie,shloymie,False,PNC Business Services,False,2,Local branch very Hard to reach by phone. Gene...,PNC


In [9]:
# Number of reviews labelled with each bank
df['bank'].value_counts()

Capital One        8585
Chase              4025
Citibank           3419
Bank of America    2797
Wells Fargo        1838
U.S. Bank          1054
PNC                 983
TD Bank             882
Name: bank, dtype: int64

## Cleaning Product Categories

In [10]:
# Remove the bank name (and the space that follows it) from the product label.
# Each name is escaped so it is matched literally ('U.S. Bank' contains regex wildcards).
pattern = re.compile("|".join(re.escape(bank_name) + " " for bank_name in banks))

# Vectorised replace: unlike re.sub through apply, missing products stay NaN instead of raising
df['product'] = df['product'].str.replace(pattern, "", regex=True)

In [11]:
# Number of reviews per product category after the bank names were stripped
df['product'].value_counts()

Credit Cards         17193
Checking              3965
Car Loans              758
Savings & CDs          663
Mortgages              389
Business Services      262
Prepaid Cards          105
Personal Loans          87
Home Equity             49
Savings                 48
Student Loans           33
Investments             22
CDs                      7
CD Rates                 1
Conventional             1
Name: product, dtype: int64

In [12]:
# Helper to merge one product category into another
def convert_to_product(old_product:str, new_product:str, data:pd.DataFrame=None) -> None:
    """Moves every row of one product category into another category, in place.

    Prints the row counts of both categories before and after the move.

    Args:
        old_product (str): name of the product category to be converted
        new_product (str): new or already existing product category to be assigned
        data (pd.DataFrame, optional): frame with a 'product' column, modified in place.
            Defaults to None, which uses the notebook-level `df`.
    """
    if data is None:
        # resolved at call time so the current notebook frame is the one modified
        data = df

    def report(stage:str) -> None:
        # rows currently in each of the two categories
        rows = int(data['product'].eq(old_product).sum())
        rows_new = int(data['product'].eq(new_product).sum())
        print(f'{stage} -> Rows {rows} in {old_product}, {rows_new} in {new_product}')

    report('Before')
    # move to new category
    data.loc[data['product'].eq(old_product), 'product'] = new_product
    report('After ')

# Moving Conventional
# NOTE(review): 'Conventional' may refer to a mortgage type rather than a card — confirm that
# 'Credit Cards' is the intended target category for this single row
convert_to_product('Conventional', 'Credit Cards')

# Moving CDs
convert_to_product('CDs', 'Savings & CDs')

# Moving CD Rates
convert_to_product('CD Rates', 'Savings & CDs')

# Moving Savings
convert_to_product('Savings', 'Savings & CDs')

Before -> Rows 7 in CDs, 663 in Savings & CDs
After  -> Rows 0 in CDs, 670 in Savings & CDs
Before -> Rows 1 in Conventional, 17193 in Credit Cards
After  -> Rows 0 in Conventional, 17194 in Credit Cards
Before -> Rows 1 in CD Rates, 670 in Savings & CDs
After  -> Rows 0 in CD Rates, 671 in Savings & CDs
Before -> Rows 48 in Savings, 671 in Savings & CDs
After  -> Rows 0 in Savings, 719 in Savings & CDs


## Removing small sample products

In [13]:
# Keep only the product categories that have more than 250 reviews --> 6 products remain
reviews_per_product = df['product'].map(df['product'].value_counts())
df = df[reviews_per_product > 250]

# See new counts of remaining products
df['product'].value_counts()

Credit Cards         17194
Checking              3965
Car Loans              758
Savings & CDs          719
Mortgages              389
Business Services      262
Name: product, dtype: int64

# Sub-sampling for Analysis

In [14]:
# Selecting small data slice for testing
SUB_SAMPLE = False

if SUB_SAMPLE:
    # fixed seed so the 20% sub-sample is identical on every run
    df = df.sample(frac=.20, random_state=42)

print('Data size: {} rows'.format(df.shape[0]))

Data size: 23287 rows


# Saving Text Corpus to Textacy Corpus

In [15]:
# name of the spaCy pipeline package; also passed as `lang` when the textacy corpus is loaded below
library = 'en_core_web_sm'

# load the pipeline into the notebook-level `nlp` object
nlp = spacy.load(library)

In [16]:
# TODO: Look at ways of saving as spacy docs
# Solution: https://textacy.readthedocs.io/en/0.11.0/api_reference/lang_doc_corpus.html


## function text to spacy/textacy corpus
def create_corpus(df:pd.DataFrame, library:str='en_core_web_sm') -> textacy.Corpus:
    """Creates a textacy corpus with a spacy doc and meta `{'date', 'bank', 'product', 'stars'}`

    The input dataframe is not modified: dates are converted to strings on a copy
    of the meta columns only.

    Args:
        df (pd.DataFrame): dataframe with reviews information; must contain the columns
            'review', 'date', 'bank', 'product' and 'stars'.
        library (str, optional): spacy pipeline used for text processing. Defaults to 'en_core_web_sm'.

    Returns:
        textacy.Corpus: Set of spacy docs and meta
    """
    columns = ['date', 'bank', 'product', 'stars']
    # converting date to string for serialization, without touching the caller's dataframe
    meta_frame = df[columns].assign(date=df['date'].astype(str))
    # one meta dictionary per review
    meta = meta_frame.to_dict(orient='records')
    # convert reviews series to list of reviews
    reviews = df['review'].tolist()
    # zip into records (review, {meta})
    records = list(zip(reviews, meta))
    # creating base corpus
    corpus = textacy.Corpus(lang=library)
    # adding records (batch and n_process speeds loading)
    corpus.add_records(records=records, batch_size=50, n_process=-1)
    return corpus

# creating corpus
# corpus = create_corpus(df=df, library=library)


## Saving

In [17]:
# Saving 
# corpus.save('./data/bank_reviews.bin.gz')

## Loading

In [18]:
# Load the cached corpus when the file exists; otherwise build it from the reviews and cache it,
# so the notebook still runs top to bottom on a machine that does not have the saved file.
# NOTE(review): a cached file is not rebuilt when `df` changes (e.g. SUB_SAMPLE) — delete it to refresh.
corpus_file = './data/bank_reviews.bin.gz'
if pathlib.Path(corpus_file).exists():
    corpus = textacy.Corpus.load(lang=library, filepath=corpus_file)
else:
    corpus = create_corpus(df=df, library=library)
    corpus.save(corpus_file)
print(corpus)

Corpus(23287 docs, 2495582 tokens)


## Usage of Corpus and docs/meta

In [19]:
# Testing selecting documents: print the 17th Bank of America / Business Services review
def filter_func(doc):
    meta = doc._.meta
    return meta['bank'] == 'Bank of America' and meta['product'] == 'Business Services'

for counter, doc in enumerate(corpus.get(filter_func, limit=18), start=1):
    if counter == 17:
        print(doc.text, doc._.meta)

B of A is the worst bank I have ever dealt with. I have numerous accounts there including my buisness and they are the most unhelpful bank in the buisness. I have been trying to close my accounts and move them elsewhere since before Christmas however there is never anyone in the bank? (Closed) who knows. The other branches around me also have no one working in them? No signs of when they will return. CUSTOMERS BEWARE STAY AWAY AND FIND ANOTHER BANK {'date': '2021-01-12', 'product': 'Business Services', 'stars': 1, 'bank': 'Bank of America'}


In [20]:
# Function to return the cleaned, lower-cased lemmas of a doc
def clean_words(doc:"spacy.tokens.doc.Doc", excluded:list=None, pos='None') -> list:
    """Cleans text into a list of words removing punctuations, stopwords, numbers and list of excluded words.

    Args:
        doc (spacy.tokens.doc.Doc): Spacy text doc
        excluded (list, optional): words to exclude, matched case-insensitively against
            the lemma. Defaults to None (nothing excluded).
        pos (str, optional): Part of speech to select. 'None' (or None) keeps every
            part of speech. Defaults to 'None'.

    Returns:
        list: lower-cased lemmas of the tokens that were kept, in document order
    """
    # the returned lemmas are lower-cased, so the exclusion list is compared in lower case too
    excluded_lower = {word.lower() for word in (excluded or [])}
    # both the string sentinel 'None' and a real None mean "no part-of-speech filter"
    any_pos = pos is None or pos == 'None'

    words = []
    for token in doc:
        if token.is_punct or token.is_stop or token.pos_ == 'NUM':
            continue
        if not any_pos and token.pos_ != pos:    # conditional for POS type
            continue
        lemma = token.lemma_.lower()
        if lemma not in excluded_lower:
            words.append(lemma)
    return words
    

In [25]:
# Example selections used to try out the cleaning function defined below
# 'None' (the string) is the "no part-of-speech filter" sentinel the cleaning functions compare against
pos = 'None'
bank = 'Bank of America'
product = 'Business Services'

def clean_docs(product:str, bank:str='All', pos:str='None', corpus:textacy.Corpus=corpus) -> list:
    """Generates a filter for the bank-product selected, a list of excluded words and iteratively cleans spacy docs in the given corpus.

    Args:
        product (str): bank product to be cleaned
        bank (str, optional): bank selected; 'All' selects every bank. Defaults to 'All'.
        pos (str, optional): part of speech selected. Defaults to 'None'.
        corpus (textacy.Corpus, optional): bank corpus to be cleaned. Defaults to the
            notebook-level `corpus` as it was when this function was defined.

    Returns:
        list: cleaned words for all documents of given bank-product
    """
    # filter review type by product & bank
    if bank == 'All':
        filter_func = lambda doc: doc._.meta['product'] == product                                  # all banks
    else:
        filter_func = lambda doc: doc._.meta['product'] == product and doc._.meta['bank'] == bank   # show only selected bank

    # Generic words excluded for every selection
    excluded = [' ', 'bank', 'account', 'tell', 'year', 'day', '$','hold', 'know', 'go']
    # Add the bank name tokens, as written and lower-cased (the product tokens below are
    # lower-cased as well). 'All' is a selector, not a bank name, so it adds nothing.
    if bank != 'All':
        bank_terms = bank.split()
        excluded += bank_terms + [term.lower() for term in bank_terms]
    # Add the product name tokens in singular form ('business' and 'services' are kept as they are)
    excluded += [name.rstrip('s') if name not in ('business', 'services') else name
                 for name in product.lower().split()]
    if bank == 'Bank of America':
        excluded += ['boa']

    # clean every selected doc and collect the words of all of them
    words = []
    for doc in corpus.get(filter_func):
        words.extend(clean_words(doc, excluded=excluded, pos=pos))
    return words

# example words: first five cleaned words for the example product (bank defaults to 'All')
clean_docs(product)[:5]

['local', 'branch', 'hard', 'reach', 'phone']

# Visualize Word Distributions

## Bar Chart: Top Words/Nouns/Adjs

In [23]:
# 1. create word count chart

# Distinct product categories, in order of first appearance
products = df['product'].drop_duplicates().tolist()

# Dropdown to pick the product
product_dropdown = widgets.Dropdown(
    description='Bank Prod:',
    options=list(products),
    value='Credit Cards')

# Dropdown to pick the bank ('All' selects every bank)
bank_dropdown = widgets.Dropdown(
    description='Bank:',
    options=banks + ['All'],
    value='All')

# Function to update bar chart
def update_plot(bank:str, product:str, pos:str='NOUN') -> None:
    """Plots the 10 most frequent cleaned words (or words of one part of speech) for a given bank-product

    Args:
        bank (str): bank selected from the drop down ('All' selects every bank).
        product (str): product selected from the drop down.
        pos (str, optional): part of speech selected; 'None' keeps every word. Defaults to 'NOUN'.
    """
    # fetch clean words
    words = clean_docs(product=product, bank=bank, pos=pos)
    if not words:
        # nothing to plot for this selection (e.g. a bank without reviews for the product)
        print(f'No words found for {bank}, {product}')
        return
    # top 10 words and their counts, most frequent first
    top_words = Counter(words).most_common(10)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    # the style must be set before the figure is created, otherwise the first figure is not styled
    plt.style.use("dark_background")
    plt.figure(figsize=(20,8))
    sns.barplot(x=labels, y=counts)
    name = pos.capitalize() if pos != 'None' else 'Word'
    plt.title(f'Top 10 {name}s for {product}')
    plt.ylabel('Counts')
    plt.xlabel(name + 's')
    plt.show()

# Shows an interactive plot: select bank, product and part of speech to plot the top 10 words
widgets.interactive(update_plot, bank=bank_dropdown, product=product_dropdown, pos=[ 'NOUN', 'ADJ','None'])

interactive(children=(Dropdown(description='Bank:', index=8, options=('Wells Fargo', 'Bank of America', 'Citib…

## Word Cloud for Words/Nouns/Adj

In [24]:
# Fresh dropdown to pick the product for the word cloud
product_dropdown = widgets.Dropdown(
    description='Bank Prod:',
    options=products,
    value='Credit Cards')

# Fresh dropdown to pick the bank for the word cloud ('All' selects every bank)
bank_dropdown = widgets.Dropdown(
    description='Bank:',
    options=banks + ['All'],
    value='All')


def make_cloud(bank:str, product:str, pos:str='NOUN',  word_max:int=100) -> None:
    """Creates a word cloud for given bank, product and POS with a maximum number of words selected.

    Args:
        bank (str): bank selected
        product (str): product selected
        pos (str, optional): part of speech selected. Defaults to 'NOUN'.
        word_max (int, optional): maximum number of words to view. Defaults to 100.
    """
    # cleaned words of the selection joined into a single text
    cleaned_text = ' '.join(clean_docs(product, bank, pos=pos))
    if not cleaned_text.strip():
        # WordCloud.generate raises ValueError when it is given no words
        print(f'No words found for {bank}, {product}')
        return
    # the style must be set before the figure is created, otherwise the first figure is not styled
    plt.style.use("dark_background")
    plt.figure(figsize=(20,8))
    wordcloud = WordCloud(background_color='white', max_words=word_max).generate(cleaned_text)
    name = pos.capitalize() + 's' if pos != 'None' else 'Words'
    plt.title(f'Most used {name}: {bank}, {product}', fontsize=32)
    plt.axis("off")
    plt.imshow(wordcloud, interpolation="bilinear")
    

# Interactive word cloud: select bank, product, part of speech and maximum number of words.
# The trailing ';' keeps the function returned by interact out of the cell output.
widgets.interact(make_cloud, bank=bank_dropdown, product=product_dropdown, pos=['NOUN', 'ADJ', 'None'], word_max=[20,50,100,200]);

interactive(children=(Dropdown(description='Bank:', index=8, options=('Wells Fargo', 'Bank of America', 'Citib…

<function __main__.make_cloud(bank: str, product: str, pos: str = 'NOUN', word_max: int = 100) -> None>

# Preliminary Analysis

## Time-Varying most common noun/adj
Which is the most common noun/adj per year?

## Programmatically extracting Aspect/Modifier(Opinion) - Jesid
- inspiration: https://achyutjoshi.github.io/aspect_extraction/aspectextraction
- code: https://github.com/ishikaarora/Aspect-Sentiment-Analysis-on-Amazon-Reviews/blob/master/src/models/aspect_extraction.py
- linguistic base: https://universaldependencies.org/en/dep/index.html

# Next Steps

## Inspect for Labelling Libraries/Packages - Rupesh
- i.e. prodigy: https://github.com/explosion/prodigy-recipes

## Models (examples below) to attempt: - Taty
- Cat(RBF Kernel/Aspect Extraction): https://arxiv.org/abs/2004.13580
- BERT-Based:
- Other: