In [3]:
import configparser
import csv
import io
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from census import Census

### FDIC API 

In [3]:
# function to scrape fdic api and save it to a csv file 
def fdic_api(url, params, file_path, data_points: int, timeout: float = 120):
    """
    This function retrieves data from the FDIC API in batches, saving it to a single CSV file.
    It uses pagination by adjusting the 'offset' and 'limit' query parameters for each batch.
    The header row is written once (taken from the first batch); the header row of every
    later batch is dropped.

    Parameters:
    url - base url of the FDIC api
    params - query parameters for the API (e.g. fields that need to be included in the dataset being retrieved, format (csv, txt, json etc.)).
             The caller's dict is left untouched; 'offset' and 'limit' are set on a per-request copy.
    file_path - file path where retrieved file gets saved
    data_points - total number of data points available for retrieval (found in the metadata)
    timeout - seconds to wait for each API request before requests gives up (keeps a stalled connection from hanging the notebook)

    Behaviour on failure:
    - a non-200 response prints the status code and body and stops the retrieval; rows from
      earlier batches stay in the file and the success message is NOT printed
    - an exception raised while parsing or writing a batch is reported and re-raised
    """
    # maximum number of data points the FDIC API returns per request
    fdic_limit = 10000

    # number of batches needed to retrieve all data (ceiling division)
    total_data_pts = data_points
    num_of_batches = (total_data_pts + fdic_limit - 1) // fdic_limit

    all_batches_ok = True

    with open(file_path, 'w', newline = '', encoding = 'utf-8') as file:
        writer = csv.writer(file)
        header_written = False

        for batch in range(num_of_batches):
            # build the query for this batch on a copy so the caller's params are not mutated
            batch_params = dict(params)
            batch_params['offset'] = batch * fdic_limit
            batch_params['limit'] = fdic_limit # ensures the api only fetches 10000 data points at a time

            # make the api request
            response = requests.get(url, params = batch_params, timeout = timeout)
            if response.status_code != 200:
                print(f'Error: {response.status_code}')
                print(f'{response.text}')
                all_batches_ok = False
                break

            try:
                # feed the csv parser a file-like object with untranslated newlines:
                # str.splitlines() would drop newlines inside quoted fields and also
                # split rows on characters such as form feed or U+2028
                reader = csv.reader(io.StringIO(response.text, newline = ''))
                if header_written:
                    next(reader, None) # skips header row for all other iterations; tolerates an empty batch
                writer.writerows(reader)
                header_written = True
            except Exception as e:
                print(f'An error occurred while reading batch {batch + 1}: {e}')
                raise

    if all_batches_ok:
        print(f'All data retrieved successfully. {total_data_pts} data points saved to {file_path}')
    else:
        print(f'Retrieval stopped at batch {batch + 1} of {num_of_batches}; {file_path} is incomplete.')
                
    # above function works if api has a limit (needs pagination), otherwise the function below works: 
    # response = requests.get(url, params)
    # if response.status_code == 200:
    #     try: 
    #         response_data = response.text.splitlines()
    #         reader = csv.reader(response_data)
    #         with open(file_path, 'w', newline = '', encoding = 'utf-8') as file: 
    #             writer = csv.writer(file)
    #             writer.writerows(reader)
    #     except Exception as e:
    #         print(f'An error occurred while writing to the file: {e}')
    #         raise
    # else:
    #     print(f'Error: {response.status_code}')
    #     print(f'{response.text}')

In [4]:
# fdic api has a limit of 10000 data points per request
# institutions file - FDIC API

# base_url_institutions = 'https://banks.data.fdic.gov/api/institutions?'
# insti_params = {
#     'fields': 'ACTIVE,ADDRESS,ADDRESS2,ASSET,BKCLASS,CBSA,CBSA_DIV,CBSA_DIV_FLG,CBSA_DIV_NO,CBSA_METRO,CBSA_METRO_FLG,CBSA_METRO_NAME,CBSA_MICRO_FLG,CBSA_NO,CITY,CLCODE,COUNTY,ENDEFYMD,ESTYMD,FED,FED_RSSD,INACTIVE,LATITUDE,LONGITUDE,NAME,NETINC,OFFDOM,OFFICES,OFFOA,STCNTY,STNAME,STNUM,UNINUM,WEBADDR,ZIP',
#     'format': 'csv'
# }
# insti_file_path = 'institutions_data.csv'
# insti_data_pts = 27825

# fdic_api(base_url_institutions, insti_params, insti_file_path, insti_data_pts)

In [5]:
# locations file - FDIC API 

# base_url_locations = 'https://banks.data.fdic.gov/api/locations?'
# loc_params = {
#     'fields': 'ADDRESS,BKCLASS,CBSA,CBSA_DIV,CBSA_DIV_FLG,CBSA_DIV_NO,CBSA_METRO,CBSA_METRO_FLG,CBSA_METRO_NAME,CBSA_MICRO_FLG,CBSA_NO,CITY,COUNTY,ESTYMD,MAINOFF,NAME,OFFNAME,OFFNUM,SERVTYPE,STALP,STCNTY,STNAME,UNINUM,ZIP',
#     'format': 'csv',
#     'limit': 10000,
#     'offset': 0
# }
# loc_file_path = 'locations_data.csv'
# loc_data_pts = 78908

# fdic_api(base_url_locations, loc_params, loc_file_path, loc_data_pts)

In [6]:
# failures (list of bank failures up to date) - FDIC API 

# base_url_failures = 'https://banks.data.fdic.gov/api/failures?'
# fail_params = {
#     'fields': 'NAME,CITYST,FAILDATE,FAILYR,CHCLASS1,RESDATE,RESTYPE,QBFDEP,QBFASSET,COST,PSTALP',
#     'format': 'csv', 
#     'limit': 10000,
#     'offset': 0
# }
# fail_file_path = 'failures_data.csv'
# fail_data_pts = 4111

# fdic_api(base_url_failures, fail_params, fail_file_path, fail_data_pts)

In [7]:
# demographics (summary of demographic information) - FDIC API 
# demographics filtered using CALLYM from Jan 2015 - Jan 2025

# base_url_demographics = 'https://banks.data.fdic.gov/api/demographics?'
# demo_params = {
#     'filters': 'CALLYM:["201501" TO "202501"]',
#     'fields': 'ACTEVT,BRANCH,CALLYM,CALLYMD,CBSANAME,CERT,CLCODE,CMSA,CNTRYALP,CNTRYNUM,CNTYNUM,CSA,DIVISION,FDICAREA,METRO,MNRTYCDE,OFFDMULT,OFFTOT,OFFSTATE,WEBADDR',
#     'format': 'csv',
#     'limit': 10000,
#     'offset': 0
# }
# demo_file_path = 'demographics_data.csv'
# demo_data_pts = 190714

# fdic_api(base_url_demographics, demo_params, demo_file_path, demo_data_pts)

In [8]:
# FDIC Summary of Deposits 

# base_url_sod = 'https://banks.data.fdic.gov/api/sod?'
# sod_params = {
#     'filters': 'YEAR:[2024 TO 2025]',
#     'fields': 'ADDRESSBR,ADDRESS,ASSET,BKCLASS,CERT,CITY,CITYBR,CNTRYNA,CNTRYNAB,CNTYNAMB,CNTYNUMB,DEPSUM,DEPSUMBR,NAMEBR,NAMEFULL,SIMS_LATITUDE,SIMS_LONGITUDE,STALP,STALPBR,STALPHCR,STCNTY,STCNTYBR,STNAME,STNAMEBR,STNUMBR,UNINUMBR,ZIP,ZIPBR',
#     'format': 'csv',
#     'limit': 10000, 
#     'offset': 0
# }

# sod_file_path = 'sod_data.csv'
# sod_data_pts = 76742

# fdic_api(base_url_sod, sod_params, sod_file_path, sod_data_pts)

### US Census Bureau API

In [10]:
# load the Census API key from a local config.ini (section [USCensus], key api_key)
# so the key is not hardcoded in the notebook
# note: ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini shows up as a KeyError on the 'USCensus' lookup below
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['USCensus']['api_key']
# Census client built with the API key (used as c.acs5.get(...) in the commented-out ACS cell below)
c = Census(api_key)

In [11]:
# ACS 5 year estimate
# Data Profiles contain broad social, economic, housing, and demographic information. 
# The data are presented as estimates and percentages. 
# Data Profiles are available down to the census tract level.

# acs5_dp_response = c.acs5.get(
#     fields = acs5_dp_variables,
#     geo = {"for": "tract:*"} # census tract level
# )

### Census Tract tigerWEB Shapefiles 

In [13]:
# TIGERweb ArcGIS REST endpoint: layer 4 of the Tracts_Blocks MapServer
# NOTE(review): presumably the census tract layer - confirm; not referenced by any other cell shown here
ct_shp_url = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer/4'

### Geocoding addresses
- collecting data (geocoding addresses) from the US Census Bureau's Geocoding services web API 

In [15]:
# create a function to batch geocode 
def geocode(url, params, input_file, output_file):
    """
    Upload one address file to a batch geocoding endpoint and save the reply to disk.

    Parameters:
    url - endpoint the address file is POSTed to
    params - query parameters sent along with the request
    input_file - path of the address file to upload (sent as the 'addressFile' form field)
    output_file - path the raw response body is written to

    Nothing is returned. A non-200 response is reported with print() and no output file
    is written; an error raised while saving the response is reported with print() and
    not re-raised.
    """
    with open(input_file, 'rb') as address_fh:
        reply = requests.post(url = url, params = params, files = {'addressFile': address_fh})

        # guard clause: report the failed request and stop
        if reply.status_code != 200:
            print(f'Error: {reply.status_code}, {reply.text}')
            return

        try:
            # store the response bytes exactly as received
            with open(output_file, 'wb') as sink:
                sink.write(reply.content)
            print(f'Geocoded results saved to {output_file}')
        except Exception as e:
            print(f'An error ocurred while geocoding: {e}')

In [16]:
# batch endpoint of the Census Geocoder; passed as `url` to geocode()
batch_geocode_url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'

# query parameters sent with every batch geocoding request
# NOTE(review): benchmark / vintage presumably select the address dataset and the geography
# vintage used for matching - confirm the accepted values against the Census Geocoder API docs
geocode_params = {
    'returntype': 'geographies',
    'benchmark': 'Public_AR_Current',
    'vintage': 'Current_Current'
}

In [17]:
# loop over all nine address CSV files that need to be geocoded 
# geocoding also filtered the locations data - there was NO MATCH for US owned bank branches outside of the US 
# all geocoded addresses must be concatenated in one csv file 

# for i in range(1, 10):
#     geocode_input_file = f'addresses_batch_{i}.csv'
#     geocode_output_file = f'addresses_geocoded_{i}.csv'
#     geocode(batch_geocode_url, geocode_params, geocode_input_file, geocode_output_file)

Geocoded results saved to addresses_geocoded_1.csv
Geocoded results saved to addresses_geocoded_2.csv
Geocoded results saved to addresses_geocoded_3.csv
Geocoded results saved to addresses_geocoded_4.csv
Geocoded results saved to addresses_geocoded_5.csv
Geocoded results saved to addresses_geocoded_6.csv
Geocoded results saved to addresses_geocoded_7.csv
Geocoded results saved to addresses_geocoded_8.csv
Geocoded results saved to addresses_geocoded_9.csv


### TIGERline Shapefiles for Census Tracts
- Use Beautiful Soup to parse through the HTML and download all the TIGER/line shapefiles
- Can also use cartographic boundary shapefiles if boundary precision is not a major requirement of the analysis. This link contains the 2022 cartographic boundary for Census Tracts in the US: https://catalog.data.gov/dataset/2022-cartographic-boundary-file-shp-current-census-tract-for-united-states-1-500000
- Otherwise, it's better to use the TIGER/line shapefile for more detailed geographic information. This link contains the 2024 TIGER/line shapefiles for Census Tracts provided by the US Census Bureau: https://www2.census.gov/geo/tiger/TIGER2024/TRACT/

In [None]:
tiger_shp_url = 'https://www2.census.gov/geo/tiger/TIGER2024/TRACT/'

# request the directory listing; the timeout keeps a stalled connection from hanging the notebook
response = requests.get(tiger_shp_url, timeout = 60)
# fail fast on an HTTP error instead of parsing an error page as if it were the listing
response.raise_for_status()

# parse the webpage using beautifulsoup so the shapefile links can be extracted
bsoup = BeautifulSoup(response.text, 'html.parser')

# create a folder to save the downloaded files
os.makedirs('shapefiles', exist_ok = True)

# loop through all the anchor tags with href 
def find_shp(url): 
    """
    Walk the table rows of the parsed directory listing and, for every link that ends
    in '.zip', build its download URL and its local path under the 'shapefiles' folder.

    Parameters:
    url - base URL that each .zip href is appended to

    NOTE(review): the rows come from the module-level `bsoup`, not from a page fetched from `url`.
    NOTE(review): in the code visible here shp_url and shp_path are computed but never used
    or returned - the download step appears to be missing; confirm.
    """
    for i, tr_tag in enumerate(bsoup.find_all('tr')):
        # skip the first three <tr> rows - presumably header / parent-directory rows of the listing; TODO confirm
        if i < 3:
            continue

        for anchor in tr_tag.find_all('a', href = True):
            shp_name = anchor['href']
            if shp_name.endswith('.zip'):
                # remote location of the archive and where it would be stored locally
                shp_url = url + shp_name
                shp_path = os.path.join('shapefiles', shp_name)
                
            


        