In [1]:
import os
import requests
from bs4 import BeautifulSoup
import zipfile
import xport, csv
import pandas as pd
from pandas.api.types import infer_dtype
from datetime import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.relativedelta import relativedelta

Define the URLs of the pages to scrape for download links, and the local folder that receives the downloads.

In [2]:
# Chicago Fed pages that are scraped for .zip download links, one page per period.
url1976_2000 = 'https://www.chicagofed.org/banking/financial-institution-reports/commercial-bank-data-complete-1976-2000'
url2001_2010 = 'https://www.chicagofed.org/banking/financial-institution-reports/commercial-bank-data-complete-2001-2010'
url2011_2021 = 'https://www.chicagofed.org/banking/financial-institution-reports/commercial-bank-structure-data'
# Site root that the scraped links are resolved against when downloading.
root = 'https://www.chicagofed.org/'
# Folder that receives every download. Set the BANKING_DOWNLOAD_FOLDER environment
# variable to run the notebook on another machine; the original path stays the default.
# os.path.join(..., '') guarantees a trailing separator because the other cells build
# paths with plain string concatenation (download_folder + name).
download_folder = os.path.join(
    os.environ.get('BANKING_DOWNLOAD_FOLDER', 'C:/Users/kwang648/Downloads/banking/'),
    '',
)

In [3]:
# Requested reporting window (inclusive), written as YYYYMM and parsed to datetimes.
start, end = (datetime.strptime(bound, '%Y%m') for bound in ('201803', '202203'))
# Quarter-end dates covering the window; the upper bound is pushed to the next
# quarter start so the quarter containing `end` is included.
quarter_ends = pd.date_range(start, end + pd.offsets.QuarterBegin(1), freq='Q')
# Two-digit-year labels (yymm) used to pick the matching zip files.
quarters = quarter_ends.strftime('%y%m').tolist()

In [4]:
# Display the generated yymm quarter labels as a quick sanity check.
quarters

['1803',
 '1806',
 '1809',
 '1812',
 '1903',
 '1906',
 '1909',
 '1912',
 '2003',
 '2006',
 '2009',
 '2012',
 '2103',
 '2106',
 '2109',
 '2112',
 '2203']

In [5]:
def extract_links(url_list):
    """Collect the .zip download links found on each page in ``url_list``.

    Parameters
    ----------
    url_list : list of str
        Page URLs to fetch and scan for ``<a>`` tags.

    Returns
    -------
    list of str
        Lower-cased hrefs, each cut off right after the first ``.zip``, in
        page order. Empty when ``url_list`` is empty or no link matches.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    """
    print(url_list)
    zip_files = []
    for url in url_list:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        r = requests.get(url, timeout=60)
        # Fail loudly instead of silently parsing an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            # Anchors without an href yield None, which cannot be searched.
            if not href:
                continue
            link = href.lower()
            if '.zip' not in link:
                continue
            # Drop anything after the extension (query strings, suffixes).
            zip_files.append(link[:link.index('.zip') + len('.zip')])
    return zip_files

In [6]:
def download_zip(zip_files):
    """Download the archives in ``zip_files`` that belong to a requested quarter.

    A link is downloaded when the first run of four digits in it is one of the
    yymm labels in the global ``quarters``. Files are saved into the global
    ``download_folder`` under the link's base name.

    Parameters
    ----------
    zip_files : list of str
        Links as returned by ``extract_links`` (relative to ``root`` or absolute).

    Raises
    ------
    requests.HTTPError
        If a download answers with an HTTP error status; nothing is written
        for that file.
    """
    from urllib.parse import urljoin

    # Create the download folder if it does not exist yet.
    os.makedirs(download_folder, exist_ok=True)

    for zip_file in zip_files:
        quarter_match = re.search("[0-9]{4}", zip_file)
        # Links without a four-digit run cannot be matched to a quarter.
        if quarter_match is None or quarter_match.group(0) not in quarters:
            continue
        # urljoin copes with a leading "/" in the link and with absolute URLs.
        full_url = urljoin(root, zip_file)
        r = requests.get(full_url, timeout=300)
        # Do not save an HTML error page under a .zip name.
        r.raise_for_status()
        dl_path = os.path.join(download_folder, os.path.basename(zip_file))
        with open(dl_path, 'wb') as z_file:
            z_file.write(r.content)

In [7]:
#this method is used to extract the links after 202106
def use_selenium(start,end):
    """Download quarterly Call Report bulk files from the FFIEC CDR site via Chrome.

    The quarters run from the later of ``start`` and 2021-09 through ``end``.
    Each quarter is picked in the page's date drop-down ("Call Reports --
    Single Period", TSV format) and downloaded into the global
    ``download_folder``.

    Parameters
    ----------
    start, end : datetime
        First and last month of the requested window.
    """
    first_quarter = max(pd.to_datetime(start), pd.to_datetime('202109',format='%Y%m'))
    quarters = (pd.date_range(first_quarter, pd.to_datetime(end) + pd.offsets.QuarterBegin(1), freq='Q').strftime('%m/%d/%Y').tolist())
    if not quarters:
        # Nothing to fetch from this site; do not start a browser.
        return

    # The folder is polled below, so it has to exist before Chrome writes to it.
    os.makedirs(download_folder, exist_ok=True)

    # Create Driver Instance
    options = webdriver.ChromeOptions()
    # NOTE(review): normpath gives Chrome a native-separator path on Windows;
    # forward slashes are reported to be ignored there - confirm on the target machine.
    prefs = {"download.default_directory":os.path.normpath(download_folder)}
    options.add_experimental_option("prefs",prefs)
    driver = webdriver.Chrome(service=Service(executable_path='C:/Users/kwang648/Downloads/python_code/chromedriver_win32/chromedriver.exe'),options=options)
    try:
        url = 'https://cdr.ffiec.gov/public/PWS/DownloadBulkData.aspx'
        driver.get(url)
        #select "call report"
        products = Select(driver.find_element(By.ID,'ListBox1'))
        products.select_by_visible_text('Call Reports -- Single Period')
        #select TSV for the format
        driver.find_element(By.ID,'TSVRadioButton').click()
        for period in quarters:
            #drop down to select dates
            dropdown = Select(driver.find_element(By.XPATH,'//*[@id="DatesDropDownList"]'))
            dropdown.select_by_visible_text(period)
            # Snapshot the folder so the new file can be recognised afterwards.
            known_files = set(os.listdir(download_folder))
            #click download button
            download_button = driver.find_element(By.ID,"Download_0")
            download_button.click()
            # Wait for the file to be complete instead of sleeping a fixed time,
            # so no partial download is renamed or unzipped later.
            if not _wait_for_new_download(download_folder, known_files):
                print('Timed out waiting for the download of ' + period)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()


def _wait_for_new_download(folder, known_files, timeout=120, poll=1):
    """Wait until ``folder`` holds a new file and no new partial download.

    Returns True once a file not in ``known_files`` exists and none of the new
    files carries a ``.crdownload`` or ``.tmp`` suffix; False after ``timeout``
    seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        new_files = set(os.listdir(folder)) - known_files
        partial = [name for name in new_files if name.endswith(('.crdownload', '.tmp'))]
        if new_files and not partial:
            return True
        time.sleep(poll)
    return False

In [8]:
def rename_filename():
    """Rename downloaded archives whose name carries an MMDDYYYY date to ``call<yymm>.zip``.

    Works on the global ``download_folder`` and makes it the current working
    directory. Only ``.zip`` files are touched, so partial downloads and
    folders are left alone; names whose eight digits are not a valid date are
    skipped.
    """
    os.chdir(download_folder)
    for item in os.listdir(download_folder):
        date_match = re.search('[0-9]{8}', item)
        if date_match is None or not item.lower().endswith('.zip'):
            continue
        try:
            period = datetime.strptime(date_match.group(0), '%m%d%Y')
        except ValueError:
            # Eight digits that are not a date: not one of the dated downloads.
            continue
        new_name = "call"+period.strftime('%y%m')+".zip"
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when the target already exists.
        os.replace(item,new_name)

In [9]:
def unzip():
    """Extract every ``.zip`` in the download folder into its own sub-folder.

    The sub-folder is named after the first eight characters of the archive
    name. Each archive is deleted after extraction. Works on the global
    ``download_folder`` and makes it the current working directory.
    """
    os.chdir(download_folder)
    for item in os.listdir(download_folder):
        if not item.endswith('.zip'):
            continue
        target_dir = download_folder+item[:8]
        os.makedirs(target_dir, exist_ok=True)
        archive_path = os.path.abspath(item)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(archive_path) as zip_ref:
            zip_ref.extractall(target_dir)
        # Delete the archive only after it was extracted and closed.
        os.remove(archive_path)

In [10]:
# Last month covered by each Chicago Fed page, paired with that page's URL.
era_cutoffs = [datetime.strptime(month, '%Y%m') for month in ('200012', '201012', '202106')]
era_urls = [url1976_2000, url2001_2010, url2011_2021]

# Index of the era holding `start` / `end`; an index of len(era_cutoffs) means
# "after the last Chicago Fed page", which is served through Selenium.
first_era = sum(start > cutoff for cutoff in era_cutoffs)
last_era = max(first_era, sum(end > cutoff for cutoff in era_cutoffs))

# Slicing clamps at the end of the list, so this picks every page from the
# first era up to the last one that has a page.
selected_urls = era_urls[first_era:last_era + 1]
if selected_urls:
    download_zip(extract_links(selected_urls))
if last_era == len(era_cutoffs):
    use_selenium(start,end)
rename_filename()
unzip()

['https://www.chicagofed.org/banking/financial-institution-reports/commercial-bank-structure-data']


Load the variable definitions.

In [11]:
# Read the variable-definition table and discard the free-text columns.
item_code = (
    pd.read_csv('C:/Users/kwang648/Downloads/call_report_item_new.csv')
    .drop(columns=['Notes','Description','citation'])
)
# 99991231 marks a variable that is still in use; swap it for 2262-04-11 so the
# value can be parsed as a date below.
item_code['end'] = item_code['end'].mask(item_code['end'] == 99991231, 22620411)
# Parse both period bounds from YYYYMMDD to datetimes.
for bound in ('begin', 'end'):
    item_code[bound] = pd.to_datetime(item_code[bound], format='%Y%m%d')
# Keep only the rows that define a named variable.
item_code = item_code.dropna(subset=['var_name'])

Extract the item codes used in the variable definitions.

In [12]:
# Unique alphanumeric tokens from the item_code column, in first-seen order.
var_list = []
seen_codes = set()  # constant-time membership test instead of scanning var_list
# Missing item codes are dropped first: re.findall raises on NaN.
for code in item_code['item_code'].dropna():
    # The local is named `tokens` so the builtin `list` is not shadowed for the
    # rest of the notebook.
    tokens = re.findall("[a-zA-Z0-9]+",code)
    for token in tokens:
        if token not in seen_codes:
            seen_codes.add(token)
            var_list.append(token)
# Identifier and descriptive columns that are kept in addition to the item codes.
var_list.extend(['Date','Entity','ENTITY','DT','ID_RSSD','dt','id_rssd','IDRSSD', 'FDIC Certificate Number', 'OCC Charter Number', 'OTS Docket Number', 'Primary ABA Routing Number', 'Financial Institution Name', 'Financial Institution Address', 'Financial Institution City', 'Financial Institution State', 'Financial Institution Zip Code', 'Financial Institution Filing Type', 'Last Date/Time Submission Updated On'])

Convert each quarter's extracted files (tab-separated text or XPT) to CSV, merging the text files into one CSV per quarter.

In [13]:
os.chdir(download_folder)
folder_list = os.listdir(download_folder)
# Quarters from 2021-09 on are handled as tab-separated text files; earlier
# quarters are read with the xport reader.
text_format_start = datetime.strptime('202109', '%Y%m')
for item in folder_list:
    # Only extracted quarter folders are processed. CSVs written by an earlier
    # run and any other stray entry are skipped instead of crashing os.chdir.
    quarter_match = re.search('[0-9]{4}',item)
    if quarter_match is None or not os.path.isdir(download_folder+item):
        continue
    try:
        period = datetime.strptime(quarter_match.group(0), '%y%m')
    except ValueError:
        # Four digits that are not a valid yymm label.
        continue
    if period >= text_format_start:
        os.chdir(download_folder+item)
        merged = pd.DataFrame()
        for text in os.listdir(download_folder+item):
            if text == 'Readme.txt':
                continue
            if merged.empty:
                # The first file is read in full and becomes the merge base.
                merged = pd.read_csv(text, delimiter = "\t",on_bad_lines='skip')
                merged['IDRSSD'] = pd.to_numeric(merged['IDRSSD'], errors='coerce')
            else:
                # nrows=0 reads the header only; the columns are all that is needed here.
                curr_list =  pd.read_csv(text, delimiter = "\t",nrows=0,on_bad_lines='skip').columns.tolist()
                load_list = [element for element in var_list if element in curr_list]
                # Skip files without the merge key or with no wanted column besides it.
                if 'IDRSSD' not in load_list or len(load_list)<=1:
                    continue
                new = pd.read_csv(text, delimiter = "\t",usecols=load_list,on_bad_lines='skip')
                new['IDRSSD'] = pd.to_numeric(new['IDRSSD'], errors='coerce')
                # NOTE(review): the default inner join keeps only IDs present in
                # every merged file - confirm this is intended.
                merged = merged.merge(new,on='IDRSSD',suffixes=('', '_remove'))
                merged = merged.loc[:,~merged.columns.str.contains('Unnamed')]
                merged = merged.dropna(subset=['IDRSSD'])
        # Drop the duplicate columns that the merges tagged with "_remove".
        merged = merged.drop([i for i in merged.columns if 'remove' in i],axis=1)
        # Last day of the quarter's final month. A scalar is broadcast to every
        # row, so it cannot misalign with the index left by merge/dropna.
        merged['Reporting Period End Date'] = period+relativedelta(day=31)
        merged.to_csv(download_folder+item+'.csv')
    else:
        os.chdir(download_folder+item)
        wanted = set(var_list)
        # NOTE(review): every file in the folder writes to the same CSV, so the
        # last one wins - presumably there is one XPT file per folder; confirm.
        for xpt in os.listdir(download_folder+item):
            with open(xpt, 'rb') as f:
                df = xport.to_dataframe(f)
            # Keep only the wanted columns, in the file's own column order.
            df = df[[c for c in df.columns if c in wanted]]
            df.to_csv(download_folder+item+'.csv')

In [None]:
def rename_column():
    