In [None]:
# "%04d.1f" % (1000,)

## Setup

In [None]:
from helper_fcns import *
import os
# Notebook-wide pandas display limits: at most 25 rows and up to 999 columns per rendered frame.
pd.set_option("display.max_rows", 25)
pd.set_option("display.max_columns", 999)

In [None]:
# Make sure the local `data` directory (the location the ingestion cells read their
# downloaded files from) exists before anything tries to use it.
# `exist_ok=True` keeps the cell safe to re-run and removes the check-then-create race of
# the previous `os.path.exists(...) is False` test. If `data` exists but is a regular file,
# this now raises FileExistsError instead of silently continuing.
os.makedirs('data', exist_ok=True)

# Run each data ingestion pipeline

## Census data

In [None]:
# Ingest 2020 Decennial Census tract-level data (Redistricting Data PL 94-171, table P4).
# Prerequisite: download the CSV export from one of the data.census.gov tables below and
# place it under `data/` with the exact filename passed to `census_data_ingester`.
# Dallas -> https://data.census.gov/table/DECENNIALPL2020.P4?g=050XX00US48113$1400000&y=2020&d=DEC+Redistricting+Data+(PL+94-171)
# Collin -> https://data.census.gov/table/DECENNIALPL2020.P4?g=050XX00US48085$1400000&y=2020&d=DEC+Redistricting+Data+(PL+94-171)
# Tarrant -> https://data.census.gov/table/DECENNIALPL2020.P4?g=050XX00US48439$1400000&y=2020&d=DEC+Redistricting+Data+(PL+94-171)
# NOTE(review): the filename below does not say which county's export it is — confirm.
census_df = census_data_ingester('data/DECENNIALPL2020.P4-2023-10-18T000916.csv')

# Show the ingested result as the cell output.
census_df

## FFIEC Census Data

In [None]:
# Ingest the FFIEC census flat file together with its file-definitions workbook.
# Prerequisite: download both files below (pasting a URL into a browser starts the download)
# and place them under `data/`. The flat file is served as a .zip while the code reads a
# .csv, so the archive has to be extracted first.
# 2022 Flat File -> https://www.ffiec.gov/Census/Census_Flat_Files/CensusFlatFile2022.zip
# FileDefinitions -> https://www.ffiec.gov/Census/Census_Flat_Files/FFIEC_Census_File_Definitions_26AUG22.xlsx
ffiec_data = ffiec_flat_file_extractor('data/CensusFlatFile2022.csv','data/FFIEC_Census_File_Definitions_26AUG22.xlsx')

# Show the ingested result as the cell output.
ffiec_data

## HMDA Data

In [None]:
# Ingest the 2022 HMDA snapshot files. Prerequisite: download the archives below
# (pasting a URL into a browser starts the download).
# NOTE(review): the original note says the files must sit "in the same directory as these
# functions"; the exact location and filenames are decided inside `hmda_data_ingester`
# and are not visible here — confirm.
# LAR -> https://s3.amazonaws.com/cfpb-hmda-public/prod/snapshot-data/2022/2022_public_lar_csv.zip
# TS -> https://s3.amazonaws.com/cfpb-hmda-public/prod/snapshot-data/2022/2022_public_ts_csv.zip
# Panel -> https://s3.amazonaws.com/cfpb-hmda-public/prod/snapshot-data/2022/2022_public_panel_csv.zip
# MSA/MD Description -> https://s3.amazonaws.com/cfpb-hmda-public/prod/snapshot-data/2022/2022_public_msamd_csv.zip
hmda_dict = hmda_data_ingester('a') # the argument is a placeholder: any string works currently

# Frequency of each distinct `aus_1` value in the LAR table. It is printed explicitly
# because only the last expression of a cell is displayed automatically.
print(hmda_dict['lar_df'].aus_1.value_counts())

# Preview the first 10 LAR rows.
hmda_dict['lar_df'].head(10)

In [None]:
# Show the `panel_df` entry of the HMDA ingestion result.
hmda_dict['panel_df']

## CRA Data

In [None]:
# Ingest the 2021 CRA aggregate and disclosure data. Prerequisite: download the zip files
# below (pasting a URL into a browser starts the download).
# NOTE(review): the original note says the files must sit "in the same directory as these
# functions"; the actual lookup location is decided inside `cra_data_ingester` — confirm.
# 2021 Agg Data -> https://www.ffiec.gov/cra/xls/21exp_aggr.zip
# 2021 Discl Data -> https://www.ffiec.gov/cra/xls/21exp_discl.zip
#
# The steps below run strictly in order; each one consumes the previous step's output.
cra_dict = cra_data_ingester('t') # the argument is a placeholder: any string works currently
# NOTE(review): judging by the variable names, this step maps coded fields but leaves
# FIPS codes untouched — confirm against helper_fcns.
cra_dict_no_fips = cra_mapping_function(cra_dict)
# FCC-hosted FIPS listing, fetched by URL and used as the lookup for the next step.
fcc_fips_url = 'https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt'
fcc_fips = fcc_fips_mappings_getter(fcc_fips_url)
cra_dict_mapped_fips = state_county_fips_mapper(cra_dict_no_fips, fcc_fips)
# NOTE(review): presumably rescales amounts reported in thousands — confirm.
final_cra_dict = thousands_adder(cra_dict_mapped_fips)

# Small business loans at county level (table D11 of the 2021 disclosure data)
final_cra_dict['cra2021_Discl_D11.dat']

In [None]:
# Show table D6 of the 2021 CRA disclosure data.
# NOTE(review): the original comment was left empty — describe what D6 contains.
final_cra_dict['cra2021_Discl_D6.dat']

In [None]:
# Keep only the D11 rows whose "Agency Code" is FDIC and whose "State" is TEXAS,
# selecting both conditions in a single pass, then show the slice.
d11 = final_cra_dict['cra2021_Discl_D11.dat'].loc[
    lambda table: (table["Agency Code"] == "FDIC") & (table["State"] == "TEXAS")
]
d11

In [None]:
# Row count per "Agency Code" in the filtered D11 slice (sanity check of the agency filter).
d11.groupby('Agency Code').size()

In [None]:
# Row count per "Loan Type" in the filtered D11 slice.
d11.groupby('Loan Type').size()

In [None]:
# Row count per "Action Taken Type" in the filtered D11 slice.
d11.groupby('Action Taken Type').size() 

In [None]:
# List the column labels available in the filtered D11 slice.
d11.columns

In [None]:
# list(final_cra_dict.keys())

## FDIC Institutions

In [None]:
# Ingest the FDIC institutions data. Prerequisite: download both files below (pasting a URL
# into a browser starts the download) and place them under `data/`.
# Institutions -> https://s3-us-gov-west-1.amazonaws.com/cg-2e5c99a6-e282-42bf-9844-35f5430338a5/downloads/institutions.csv
# Institution defs -> https://banks.data.fdic.gov/docs/institutions_definitions.csv
# The definitions file is read first; its result is passed to the ingester alongside the data file.
# NOTE(review): presumably a column-code -> readable-label mapping (a later scratch cell
# looks up a "FDIC's unique number" column) — confirm against helper_fcns.
replace_map_columns = changec_label_adder('data/institutions_definitions.csv')
fdic_institutions_df = fdic_institutions_ingester('data/institutions.csv', replace_map_columns)
# Show the ingested result as the cell output.
fdic_institutions_df

## FDIC Locations

In [None]:
# Ingest the FDIC locations data. Prerequisite: download both files below and place them
# under `data/`. Note the argument order: definitions file first, data file second.
# locations -> https://s3-us-gov-west-1.amazonaws.com/cg-2e5c99a6-e282-42bf-9844-35f5430338a5/downloads/locations.csv
# location defs -> https://banks.data.fdic.gov/docs/locations_definitions.csv
fdic_locations_df = fdic_locations_mapper('data/locations_definitions.csv','data/locations.csv')
# Show the ingested result as the cell output.
fdic_locations_df

## SBA

In [None]:
# Ingest the SBA FOIA 7(a) loan data covering FY2020 to present.
# Unlike the cells that read from `data/`, this ingester is given the dataset's download URL
# directly, so no manual download step is listed for it.
# NOTE(review): the "asof-230630" suffix in the filename presumably marks a 2023-06-30
# snapshot; the URL will need updating when a newer snapshot is published — confirm.
url = 'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/c71ba6cf-b4e0-4e60-98f0-48aeaf4c6460/download/foia-7afy2020-present-asof-230630.csv'
sba_data = sba_data_ingester(url)
# Preview the first rows of the result.
sba_data.head()

In [None]:
# Disabled reference cell: the `if False:` guard means this list is never evaluated.
# It is kept only as a grouped catalogue of field names.
# NOTE(review): presumably these are column labels of `sba_data` — confirm.
if False:
    [
     # Borrower details
     'Borrower name', 'Borrower street address', 'Borrower city', 'Borrower state', 'Borrower zip code', 'ProjectCounty', 'ProjectState',  'BusinessType', 'BusinessAge', 'JobsSupported', 'SBADistrictOffice','CongressionalDistrict',
     
     # Bank details
     'BankName','BankFDICNumber', 'BankNCUANumber', 'BankStreet', 'BankCity', 'BankState', 'BankZip', 
    
     # Loan characteristics
     'Program', 'Total loan amount', 'SBAGuaranteedApproval', 'ApprovalDate', 'ApprovalFiscalYear', 'FirstDisbursementDate','DeliveryMethod', 'Subprogram description', 'InitialInterestRate', 'TermInMonths', 'NaicsCode', 'NaicsDescription', 'FranchiseCode', 'FranchiseName',  'LoanStatus', 'PaidInFullDate', 'ChargeOffDate', 'GrossChargeOffAmount', 'RevolverStatus',  'SOLDSECMRTIND'
    ]

In [None]:
# fdic_institutions_df[fdic_institutions_df["FDIC's unique number"] == 90297]

In [None]:
# Disabled scratch cell: the `if False:` guard means none of this runs.
# If enabled, it would collect the distinct "FDIC's unique number" values from
# `fdic_institutions_df` and sort them in place.
# NOTE(review): the bare `fdic_num` on the last line sits inside the `if` block, so it would
# likely not be auto-displayed even when enabled — confirm, and use display()/print if needed.
if False:
    fdic_num = fdic_institutions_df["FDIC's unique number"].copy().unique()
    fdic_num.sort()
    fdic_num

# Minimal Data Analysis Subset