In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape data from Yellow Pages

Yellow Pages contains business information and can be used to find bank branches by ZIP Code. We did not end up using this script in the analysis because its run time was too long.

## Get ZIP Codes

In this script, we'll use the Census Bureau ZCTA to County Relationship [File](https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt) to get a list of national ZIP Codes.

Note: The Census County Relationship file lists 46,960 ZCTAs (ZIP Code equivalents). Scraping business data from Yellow Pages for a single ZIP Code takes between 5 and 10 seconds. Thus, scraping this data for all ZIP Codes in the U.S. would take anywhere from 2 to 5 days. For this reason, we chose not to proceed with this method of data gathering for our analysis. Instead, we have scraped data for Philadelphia County for illustrative purposes.


In [2]:
# Census Bureau 2020 ZCTA-to-county relationship file (pipe-delimited text)
ZCTA_COUNTY_URL = 'https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt'

# Read the ZCTA code as text so leading zeros survive (e.g. "00601").
# Letting pandas parse it as a number and then slicing the ".0" suffix off the
# string form silently corrupts the codes if the column is ever parsed as int.
zip_county = pd.read_csv(ZCTA_COUNTY_URL, sep='|', dtype={'GEOID_ZCTA5_20': str})

# drop counties without zip codes (rows with any missing field)
zip_county = zip_county.dropna()

# ZCTA codes as 5-character strings; zfill is kept as a safety net for short codes
zcta_codes = zip_county['GEOID_ZCTA5_20'].str.zfill(5)

# create list of zips
# NOTE(review): the relationship file appears to hold one row per ZCTA-county
# pair, so this list may repeat a ZCTA that spans several counties - confirm
# before treating len(zips) as a count of unique ZIP Codes.
zips = zcta_codes.tolist()
print(f"There are {len(zips)} ZIP Codes listed nationally in the Census Bureau ZCTA to County Relationship file.\n")

# create a list of zips in Philadelphia County
in_philadelphia = zip_county['NAMELSAD_COUNTY_20'] == 'Philadelphia County'
zips_Philly = zcta_codes.loc[in_philadelphia].tolist()
print(f"We will scrape data for the {len(zips_Philly)} ZIP Codes listed in Philadelphia County.")

There are 46960 ZIP Codes listed nationally in the Census Bureau ZCTA to County Relationship file.

We will scrape data for the 49 ZIP Codes listed in Philadelphia County.


## Define function to scrape Yellow Pages

In [3]:
def _parse_locality(locality: str):
    '''
        Split a locality string such as 'Philadelphia, PA 19104' into its parts.

        Args:
            locality (str): text of a listing's 'locality' element

        Return:
            (city, state, zipcode): each element is a str, or None when that
                part is not present in the text
    '''
    city, comma, remainder = locality.partition(',')
    city = city.strip() or None
    if not comma:
        return city, None, None

    # split on any run of whitespace so 'PA  19104' still yields two tokens
    tokens = remainder.split()
    state = tokens[0] if len(tokens) > 0 else None
    zipcode = tokens[1] if len(tokens) > 1 else None
    return city, state, zipcode


def scrapeYellowPages(search_term: str, search_zip: str, search_page: int = 1, timeout: float = 30.0):
    '''
        Function scrapes business data from yellowpages.com

        Args:
            search_term (str): type of business to search in 'Find a business' bar
            search_zip (str): ZIP code to search in 'Where?' bar
            search_page (int): results page to scrape; default 1
            timeout (float): seconds to wait for the HTTP response; default 30

        Return:
            business_df: Dataframe populated with results from your search ZIP Code
                with the following fields:
                    - Name
                    - Address
                    - City
                    - State
                    - Zip
                Listings whose name contains 'ATM' and listings outside
                search_zip are left out.
            check_next_page (int): 1 when the last listing examined on this page
                is inside search_zip (so the next page may hold more), otherwise
                0. Always 0 for an empty page or when an error occurred, so a
                caller looping on this flag terminates.
            error_flag (int): 1 when the request failed or the page could not be
                parsed, otherwise 0. Rows collected before the error are still
                returned.

    '''
    # base url
    url = 'https://www.yellowpages.com/search'

    # search parameters
    params = {
        "search_terms": search_term,
        "geo_location_terms": search_zip,
        "page": search_page
    }

    columns = ['Name', 'Address', 'City', 'State', 'Zip']

    # rows are collected in a list and turned into a DataFrame once at the end
    rows = []

    # flag to determine whether next page should be checked for additional results;
    # starts at 0 so that a page without listings never asks for another page
    check_next_page = 0

    # flag for error in search
    error_flag = 0

    try:
        # request page; the timeout keeps a stalled connection from hanging the run
        page = requests.get(url, params = params, timeout = timeout)
        page.raise_for_status()

        # parse html
        soup = BeautifulSoup(page.content, 'html.parser')

        # find results on webpage
        search_results_organic = soup.find("div", class_="search-results organic")
        if search_results_organic is None:
            raise ValueError("organic search results container not found")

        # iterate through results
        for result in search_results_organic.find_all("div", class_ = "result"):
            name_tag = result.find("a", class_ = 'business-name')
            if name_tag is None:
                # listing without a name link: skip it rather than abort the page
                continue
            business_name = name_tag.text

            # skip ATMS
            if 'ATM' in business_name:
                continue

            # if street address exists, extract data
            street_tag = result.find("div", class_ = 'street-address')
            street_address = street_tag.text if street_tag is not None else None

            # if locality exists, extract data
            city = state = zipcode = None
            locality_tag = result.find("div", class_ = 'locality')
            if locality_tag is not None:
                city, state, zipcode = _parse_locality(locality_tag.text)

            # determine whether business is in correct geography
            if search_zip == zipcode:
                rows.append([business_name, street_address, city, state, zipcode])
                check_next_page = 1
            else:
                check_next_page = 0
    except Exception:
        # Best effort: report the failure through error_flag instead of raising.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        check_next_page = 0
        error_flag = 1

    business_df = pd.DataFrame(rows, columns = columns)

    return business_df, check_next_page, error_flag

## Scrape data for each ZIP Code

In [4]:
# columns returned by scrapeYellowPages
RESULT_COLUMNS = ['Name', 'Address', 'City', 'State', 'Zip']

# hard stop on pagination so a single ZIP Code can never loop indefinitely
MAX_PAGES_PER_ZIP = 50

# per-page results; concatenated once after the loop instead of on every page
page_frames = []

# list to hold zips whose scrape hit an error (each zip recorded once)
errored_zips = []

# iterate through all zips
for z in zips_Philly:
    for search_page in range(1, MAX_PAGES_PER_ZIP + 1):
        # capture results
        df, check_next_page, error_flag = scrapeYellowPages('banks', z, search_page)

        # pause between requests
        time.sleep(1)

        if not df.empty:
            page_frames.append(df)

        # track errored zips and stop paging: retrying further pages after a
        # failure would otherwise repeat forever
        if error_flag:
            errored_zips.append(z)
            break

        # stop when the scraper says so, or when the page produced no rows
        # (an empty page means there is nothing further to find for this zip)
        if check_next_page != 1 or df.empty:
            break

# df to hold results
if page_frames:
    results_df = pd.concat(page_frames, ignore_index = True)
else:
    results_df = pd.DataFrame(columns = RESULT_COLUMNS)

print(f'{len(results_df)} banks found in {len(zips_Philly)} zip codes.')



KeyboardInterrupt



## Export Results

In [None]:
# Write to the parent directory's data/ folder through a relative path rather
# than os.chdir('..'): `os` is not imported in the notebook's import cell, and
# changing the working directory would move one level further up on every
# re-run of this cell.
results_df.to_csv("../data/Banks_Philadelphia.csv", index = False)