# Webscraping Bank Product Reviews
BoA Datasource = https://wallethub.com/profile/bank-of-america-13000450i

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import  Options
import pandas as pd
import time
import datetime
from random import uniform, choice



In [2]:
def get_proxies():
    url = "https://www.us-proxy.org//"
    # get the HTTP response and construct soup object
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    proxies = []
    for row in soup.find("table").findAll("tr"):
        tds = row.find_all("td")
        # check allows https and is elite class
        try:
            # print(tds[6], tds[4])
            if tds[6].text == 'yes' and tds[4].text == 'elite proxy':
                ip = tds[0].text.strip()
                port = tds[1].text.strip()
                host = f"{ip}:{port}"
                proxies.append(host)
        except IndexError:
            continue
    return proxies

def select_proxy(proxies):
    proxy = choice(proxies)
    return proxy

In [3]:
def create_driver_session(bank="Wells Fargo",headless=True):
    # Get Free Proxy
    proxies = get_proxies()
    proxy = select_proxy(proxies) # IP:PORT or HOST:PORT
    print('Proxy IP', proxy)
    # Set Proxy address to fetch new data
    options = Options()
    options.add_argument('--proxy-server=%s' % proxy)
    # No need to see page
    options.headless = headless
    # Create driver
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    # Testing it is functioning URL
    try:
        driver.implicitly_wait(10)
        driver.get(url)
    except Exception as e:
        print('Driver Creation Failed')
        driver.quit()
        create_driver_session(bank=BANK,headless=False)
    if bank not in driver.title:
        print("Did not reach review page - blocked")
        driver.quit()
        create_driver_session(bank=BANK,headless=False)
    else:    
        return driver

In [4]:
def get_html_file(url, driver, page=None):
    if page != None:
        url += "?p=" + page
    # inserting random time lapse to try to prevent bot detection
    random_value = uniform(2.5,7.0)
    print('Waited', random_value, 'seconds')
    time.sleep(random_value) 
    # fetch website
    driver.get(url)
    # get html
    soup = BeautifulSoup(driver.page_source, 'lxml', from_encoding='UTF-8')
    return soup

In [5]:
def fetch_page_reviews(html, df):
    for review in html.find_all(class_='rvtab-citem'):  
        # Initialize Values
        stars = 0
        verified = True
        review_comment = False
        # Name & UserName
        author_name = review.find(class_='rvtab-ci-name').text.strip()
        nickname = review.find(class_='rvtab-ci-nickname').text[1:]
        # Date of Review
        date = review.find('time')['datetime']
        # Product Reviewed
        if review.find(class_='rvtab-ci-category') is not None:
            product = review.find(class_='rvtab-ci-category').text.replace('Product:','').strip()
        else:
            # prints comments that don't belong to specific product, probably comments to previous review
            print(author_name, nickname, date, verified, stars)
            review_comment = True
            product = None
        # Review Comment
        comment = review.find(class_='rvtab-ci-content').text
        # Count number of stars
        for star in review.find_all('path'):
            if star['fill'] == '#4ae0e1':
                stars += 1
        if review.find(class_='rvtab-ci-verified') is None:
            verified = False
            # print(author_name, nickname, date, verified, stars)
            # print(product)
            # print('\t', comment)
        # temporary to add row by row
        row = [[date, author_name, nickname, verified, product, review_comment, stars, comment]]
        temp = pd.DataFrame(row, columns=df.columns)
        # add to main dataframe
        df = df.append(temp, ignore_index=True)
    return df

In [6]:
# TODO:
# 2 potential breaks: 1) if too many requests 2) no windown exists exception
# IF continues to crash try the clicking option instead of a new request

In [7]:
def scrape_reviews(url, driver, start_page=None, data=None):
    if data is None:
        # Create DataFrame for storage
        col_names = ['date', 'name', 'user_id', 'verified', 'product', 'review_comment', 'stars', 'review']
        data = pd.DataFrame(columns=col_names)
    # attempts counter
    attempts = 0
    # fetch starting page
    html = get_html_file(url, driver)
    # get page positions
    page_position = html.find(class_='rvtab-pag-pos').text
    if start_page is None:
        current_page = page_position.split()[0]
    else:
        current_page = str(start_page)
    last_page = page_position.split()[2]
    while int(current_page) <= int(last_page):
        print(current_page)   
        try:
            html = get_html_file(url, driver, current_page)
        except Exception as e:
            # if attempts > 3:
            #     break
            print('Exception trigerred:', e)
            break
        #     driver = create_driver_session(bank=BANK, headless=False)
        #     attempts += 1
        # check comment review exists in page - else bot may be blocked
        if html.find_all(class_='rvtab-citem'):
            # add page reviews
            data = fetch_page_reviews(html, data)
            # get new page number
            current_page = html.find(class_='rvtab-pag-pos').text.split()[0]
            # increase page
            current_page = str(int(current_page) + 1)
        elif html.title.text == 'IP Block':
            print('No more comments retrieved - Bot Blocked')
            # print('Creating driver with new IP address')
            # driver = create_driver_session(bank=BANK, headless=False)
            # if attempts > 3:
            break
            # attempts += 1
    # driver.close()
    return data, current_page

In [106]:
# Enter Bank Name
BANK = "PNC"

In [107]:
banks = {
    "Wells":"wells-fargo-13007950i",
    "America":"bank-of-america-13000450i",
    "Citibank":"citibank-13001291i",
    "Chase":"chase-13001251i",
    "Capital":"capital-one-13001087i",
    "US": "us-bank-13007637i",
    "PNC": "pnc-13005045i",
    "TD": "td-bank-13006307i"
}
# Select Bank url location
selected_bank = banks[BANK]

In [108]:
# Create Final URL
root_url = "https://wallethub.com/profile/" 
url = root_url + selected_bank
# Use Driver to fetch website
DRIVER_PATH = "/Users/jesidacosta/OneDrive - University of South Florida/ISM6930/group_project/chromedriver"

In [136]:
# Create Headless Session (not GUI)
options = Options()
proxies = get_proxies()
proxy = select_proxy(proxies) # IP:PORT or HOST:PORT
options.add_argument('--proxy-server=%s' % proxy)
options.headless = False
# Create Driver
driver = webdriver.Chrome(DRIVER_PATH, options=options)
print('Proxy IP:', proxy)
driver.get(url)

# # Fetch HTML using Beautiful Soup
# time.sleep(1)
# # soup = BeautifulSoup(driver.page_source, 'lxml', from_encoding='UTF-8')

# # for review in soup.find_all(class_='rvtab-citem'):
# #     author_name = review.find(class_='rvtab-ci-name').text.strip()
# #     date = review.find('time')['datetime']
# #     comment = review.find(class_='rvtab-ci-content').text
# #     print(date, author_name, comment, '\n')

Proxy IP: 162.155.10.150:55443


In [None]:
# TODO: Need to fix this function
# driver = create_driver_session(bank=BANK,headless=False)
# driver.get(url)

In [137]:
data, page = scrape_reviews(url,driver)
data

Waited 6.267297258820578 seconds




1
Waited 3.257563875532276 seconds
2
Waited 2.5271500451768056 seconds
3
Waited 6.925041524740107 seconds
4
Waited 6.716167079412637 seconds
5
Waited 5.103792737497061 seconds
6
Waited 5.820706955708312 seconds
7
Waited 3.472978951221789 seconds
8
Waited 5.664747401730209 seconds
9
Waited 3.5847385861678362 seconds
Mercedes Lawrence alawrence_1828 2021-05-10 True 0
tameekadouglas001 tameekadouglas001 2021-05-11 True 0
10
Waited 5.647820056937355 seconds
11
Waited 6.921718612513953 seconds
bob_dowd58 bob_dowd58 2021-05-12 True 0
12
Waited 3.4714782095933434 seconds
13
Waited 4.999712845092493 seconds
robinc49 robinc49 2021-04-19 True 0
14
Waited 5.740188106765342 seconds
15
Waited 6.411616425625177 seconds
16
Waited 3.4645784115167464 seconds
17
Waited 4.85737986357173 seconds
18
Waited 6.6671546914701665 seconds
19
Waited 6.314862432390676 seconds
20
Waited 6.672205167467997 seconds
Alex Truth alexandrum1 2020-11-17 True 0
Amy Zhou amyzhou007 2020-12-28 True 0
21
Waited 6.9742306956477

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2020-04-20,WalletHub,WalletHub,False,PNC Personal Loans,False,5,PNC Personal Loan ReviewPNC personal loans hav...
1,2021-09-08,katelyn,katelyn_leifert,True,PNC Credit Cards,False,5,The PNC cash rewards card is a great first cre...
2,2021-09-07,Jessica K,jessicak8652,False,PNC Mortgages,False,1,Stay as far away from this lender as possible....
3,2021-09-06,Doris,dorish_33,True,PNC Credit Cards,False,5,I like everything about it don't want to chang...
4,2021-09-02,Virgil,virgilw_7,True,PNC Credit Cards,False,5,Good card....0 percent interest for first year...
...,...,...,...,...,...,...,...,...
1055,2012-05-30,Kevin,GDigital,False,PNC Checking,False,4,I've been a long time customer of PNC Bank and...
1056,2012-05-30,sujatha swaminathan,sujathaswami,False,PNC Checking,False,4,I have had nothing but great experiences with ...
1057,2012-05-17,Rev Wilson,bnkrgrl,False,PNC Checking,False,3,I opened an account at Compass about 10 years ...
1058,2012-05-17,Alanna,alannajean,False,PNC Checking,False,5,I opened an account with PNC Bank after closin...


In [104]:
data, page = scrape_reviews(url,driver,page,data)
data

Waited 6.7706048892544635 seconds




39
Waited 3.979206742562596 seconds
trevans159 trevans159 2020-03-04 True 0
40
Waited 2.535209508975362 seconds
Stephen Baker rslbaction 2020-02-18 True 0
Dan Schlemper danschlemp 2020-02-13 True 0
41
Waited 4.488455364654619 seconds
Patrick patricko_26 2020-02-05 True 0
42
Waited 2.814566065395137 seconds
Raphael Bernstein mesivta 2020-01-21 True 0
43
Waited 6.878730143420604 seconds
44
Waited 3.9958591449765826 seconds
no thanks stopbigdata 2020-01-14 True 0
45
Waited 6.072472314160187 seconds
46
Waited 3.3807725028015465 seconds
Travis travisb_32 2021-02-12 True 0
47
Waited 2.9968702188113285 seconds
48
Waited 5.374724475711078 seconds
49
Waited 4.354857156269487 seconds
50
Waited 2.8319831290427153 seconds
averyjocelyn19991 averyjocelyn19991 2019-07-10 True 0
51
Waited 5.9377817877917956 seconds
Peen Winkle nwttp 2019-04-03 True 0
Peen Winkle nwttp 2019-04-03 True 0
52
Waited 6.383899270170953 seconds
iatwoodf iatwoodf 2019-09-04 True 0
53
Waited 4.309072786011747 seconds
michael w

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2021-09-08,Dora,doram3,True,U.S. Bank Credit Cards,False,5,This card is great just I will be keeping got...
1,2021-09-07,bcpatterson1127,bcpatterson1127,False,U.S. Bank Checking,False,1,US Bank's fraud prevention is a joke. They ha...
2,2021-09-07,amrita_panda,amrita_panda,False,U.S. Bank Mortgages,False,1,I would give 0 or negative stars if there was ...
3,2021-09-04,John Chambers,johnc5,True,U.S. Bank Mortgages,False,3,We paid off this loan in Aug. 2021 . While the...
4,2021-09-03,aaronsmith123,aaronsmith123,False,U.S. Bank Car Loans,False,1,We had a lease with US Bank that we bought out...
...,...,...,...,...,...,...,...,...
1124,2012-02-08,Joe Weider,joe_weider,False,U.S. Bank Credit Cards,False,4,The “Cache” in Cache Credit Card refers to the...
1125,2012-02-06,Joe Weider,joe_weider,False,U.S. Bank Credit Cards,False,2,If you’re in the construction industry or love...
1126,2012-01-11,Joe Weider,joe_weider,False,U.S. Bank Credit Cards,False,2,"If you’re looking to give back, the Children’s..."
1127,2011-12-20,Joe Weider,joe_weider,False,U.S. Bank Credit Cards,False,4,It hasn’t been the best time ever recently to ...


In [78]:
data, page = scrape_reviews(url,driver,page,data)
data

Waited 4.735779158282348 seconds




614
Waited 4.122348063790736 seconds
615
Waited 4.672196379902002 seconds
Trashawn Price trashawnp 2017-11-07 True 0
Ashley Ferguson ashleyf_8 2016-01-09 True 0
Les Cooperman lesc 2015-07-23 True 0
Odysseas  Papadimitriou odysseas 2015-08-25 True 0
Tom Jensen tomj5 2015-09-09 True 0
616
Waited 3.881679963691984 seconds
617
Waited 5.533823891937255 seconds
618
Waited 5.478462556916119 seconds
Ronnie W Wilson ronniew_9 2018-09-24 True 0
619
Waited 2.846685078375131 seconds
Tom Jensen tomj5 2015-09-09 True 0
620
Waited 3.4707963506188344 seconds
621
Waited 5.710418911865451 seconds
622
Waited 6.747486298681603 seconds
623
Waited 5.840682849727132 seconds
Sound  Judgment Soundjudgment 2016-06-18 True 0
624
Waited 4.742898184589604 seconds
Jennifer Smith jennifers_60 2015-07-06 True 0
Les Cooperman lesc 2015-07-23 True 0
625
Waited 4.793405784453603 seconds
Ellyn Polay ellynp 2015-06-07 True 0
626
Waited 5.691812111927259 seconds
627
Waited 5.553813787426268 seconds
Erica Stowers ericas24 2

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2021-09-08,Dolly Kagawa,dollyk_1,True,Capital One Car Loans,False,1,Interest rate is too high! The car payment is ...
1,2021-09-08,Aleta,aletaschmidt,True,Capital One Credit Cards,False,5,Great customer service. Give credit increases ...
2,2021-09-08,Jon Juan,JonJuan,False,Capital One Checking,False,1,I really thought that this bank is a better on...
3,2021-09-07,Shane,shanelinton0,True,Capital One Credit Cards,False,5,"able to use everywhere, this was a great credi..."
4,2021-09-07,Jessica,jessicalanethompson,True,Capital One Credit Cards,False,5,Great credit builder card with rewards. Credit...
...,...,...,...,...,...,...,...,...
8208,2012-11-23,Joelle Rodeen,joeller,False,Capital One Credit Cards,False,3,Capital One has provided me with solid custome...
8209,2012-11-23,Brandon Harvey,brandon.harvey.169,False,Capital One Credit Cards,False,5,I basically got this to build up my credit. Th...
8210,2012-11-23,Al Roker,al.roker.3,False,Capital One Credit Cards,False,5,Me and my wife choose this card as a way to bu...
8211,2012-11-23,Joe III,joe.charltoniii,False,Capital One Credit Cards,False,5,Using this card has been a real pleasure for m...


In [111]:
page

'405'

In [115]:
data.head(20)

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2016-09-21,WalletHub,WalletHub,False,Chase Credit Cards,False,3,If you have good or excellent credit and a fam...
1,2019-10-03,Adam McCann,adam_mccann,False,Chase Credit Cards,False,5,Sapphire Preferred is the card of choice for p...
2,2020-09-25,Andron Rabbit,andron311,False,,True,0,Until they drop your credit limit down to $500...
3,2020-10-30,ADAM SOMERS,adamksomers,False,,True,0,@andron311 Crazy. I know ..right. I have been ...
4,2021-01-08,Clarissa F,clari,False,,True,0,@andron311 same thing happened to me right aft...
5,2021-06-24,Samuel Arroyo,samuel_arroyo26,False,,True,0,All Chase knows how to do is screw customers a...
6,2021-09-06,Margaret,margareth_107,True,Chase Credit Cards,False,5,I love that they monitor your card that way no...
7,2021-09-05,Rebekah,chennoa,True,Chase Credit Cards,False,5,Chase has been very good to me and I will use ...
8,2021-09-05,Thelma,thelmag_7,False,Chase Credit Cards,False,5,I really enjoy the convenience of this card. I...
9,2021-09-05,LAURA,releankanaki,True,Chase Credit Cards,False,5,Chase has given me a chance to obtain a Visa S...


In [138]:
# Save Document as Pandas
# DATE = datetime.datetime.now().date()
# data.to_pickle("wellsfargo_wallethub_reviews_{}.pkl".format(DATE))
data.to_json('data/{}_wallethub_reviews.json'.format(BANK.lower()), orient='records')
# data.to_csv("wellsfargo_wallethub_reviews_{}.csv".format(DATE), index=False, encoding='utf-8')

In [139]:
# Encoding seems to be a problem in excel not in 
data.iloc[26,:].review

'They charged me for their mishandling of a dealer loan payoff. PNC gave the new car dealer a payoff amount (good for a certain number of days) the dealer sent a check for this amount and due to an error on the part of someone at PNC the check got returned to the dealer, time went by, payoff amount changed and they took this money from MY bank acct. They are IDIOTS and THEIVES and refuse to refund the money they stole.'

In [142]:
df = pd.read_json("data/{}_wallethub_reviews.json".format(BANK), orient='records')
df

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2020-04-20,WalletHub,WalletHub,False,PNC Personal Loans,False,5,PNC Personal Loan ReviewPNC personal loans hav...
1,2021-09-08,katelyn,katelyn_leifert,True,PNC Credit Cards,False,5,The PNC cash rewards card is a great first cre...
2,2021-09-07,Jessica K,jessicak8652,False,PNC Mortgages,False,1,Stay as far away from this lender as possible....
3,2021-09-06,Doris,dorish_33,True,PNC Credit Cards,False,5,I like everything about it don't want to chang...
4,2021-09-02,Virgil,virgilw_7,True,PNC Credit Cards,False,5,Good card....0 percent interest for first year...
...,...,...,...,...,...,...,...,...
1055,2012-05-30,Kevin,GDigital,False,PNC Checking,False,4,I've been a long time customer of PNC Bank and...
1056,2012-05-30,sujatha swaminathan,sujathaswami,False,PNC Checking,False,4,I have had nothing but great experiences with ...
1057,2012-05-17,Rev Wilson,bnkrgrl,False,PNC Checking,False,3,I opened an account at Compass about 10 years ...
1058,2012-05-17,Alanna,alannajean,False,PNC Checking,False,5,I opened an account with PNC Bank after closin...


In [45]:
df.iloc[22].comment

'The card has a $39 per year fee. I’ve had it almost 7 years still a low limit - $2000. My other cards have MUCH higher limits. I have great credit.  I keep it to use for internet purchases, it’s easily monitored and keeps my other higher limit cards safe from the web.'

In [162]:
data

Unnamed: 0,date,name,user_id,verified,product,review_comment,stars,review
0,2021-08-11,WalletHub,WalletHub,False,Wells Fargo Personal Loans,False,4,\nWells Fargo Personal Loan Review\n \nWells ...
1,2020-01-04,aaron saxton,aaron_e_saxton,False,,True,0,Obviously written by a marketing company. Not ...
2,2021-07-28,Kimberly Askew,kimberlyaskew,False,,True,0,Agreed company is horrible and shouldn't be al...
3,2021-09-01,Robert,emmyandaj,True,Wells Fargo Credit Cards,False,4,One of my first cards I ever got. They gave m...
4,2021-09-01,Keynan,jordanmathewphillips,True,Wells Fargo Credit Cards,False,5,Thank you for making it easy and wanting to be...
...,...,...,...,...,...,...,...,...
1901,2012-05-29,Dennis Brenes,DBrenes67,False,Wells Fargo Credit Cards,False,4,"As a college student, I use this card a lot, e..."
1902,2012-05-17,natalie,natalia711,False,Wells Fargo Checking,False,3,My first checking account was with Wells Fargo...
1903,2012-05-17,Becky Dillman,becky_dillman_7,False,Wells Fargo Checking,False,4,I have been a long time customer of Wells Farg...
1904,2012-05-15,Trisha,Mariposa77,False,Wells Fargo Checking,False,3,I opened an account with Wells Fargo just over...
