# Prep the datasets to use in analysis

1. Clean both HMDA and census datasets.
1. Reduce the HMDA loan-level file to a bank-tract dataset.
1. Merge the bank-tract dataset with the census data.


In [3]:
import os
import csv
import pandas as pd
import numpy as np

# Create the folder that the cleaned datasets are written to.
# exist_ok=True means re-running this cell does not fail when the folder is already there.
os.makedirs('../input_data_clean/',exist_ok=True)


## HMDA Cleaning

- Fix the variable types (e.g., numeric columns that were read in as text)
- Use more readable variable values (e.g., replace loan type code 1 with its name)

In [4]:
#todo when complete, delete this comment and uncomment the if, indent everything below
# if not os.path.exists('../input_data_clean/state_AZ-CA_clean.csv'):

#todo load the WHOLE thing when code is set
# Read the mini HMDA extract, turn the placeholder codes into missing values,
# and add 0/1 indicator columns for applicant age (above 62) and state (AZ = 1, CA = 0).
# The original Yes/No age column is dropped once its indicator exists.
hmda = (
    pd.read_csv('../input_data/state_AZ-CA_mini.csv')
    .replace(["Exempt","1111","8888","9999","NA"], np.nan)
    .assign(
        simplified_applicant_age_above_62=lambda df: df['applicant_age_above_62'].map({'Yes': 1, 'No': 0}),
        state_simplified=lambda df: df['state_code'].map({'AZ': 1, 'CA': 0}),
    )
    .drop(columns=['applicant_age_above_62'])
)


In [5]:
# Summarize the loan-to-value column as loaded, to check its dtype and values before casting it.
hmda['loan_to_value_ratio'].describe()

count     33110
unique    15598
top        80.0
freq       2489
Name: loan_to_value_ratio, dtype: object

In [6]:
# Cast the numeric HMDA fields to float.
# Each column is listed exactly once: 'prepayment_penalty_term' was previously repeated,
# and a repeated dict key is silently collapsed by Python, which hides copy/paste mistakes.
hmda = hmda.astype({
    'loan_to_value_ratio': float,
    'rate_spread': float,
    'prepayment_penalty_term': float,
    'intro_rate_period': float,
    'property_value': float,
    'total_loan_costs': float,
    'total_points_and_fees': float,
    'origination_charges': float,
    'discount_points': float,
    'lender_credits': float,
    'loan_term': float,
})

#todo ltv is float with bins for the tails? investigate
#todo use the output report to continue cleaning
#todo when done, save it:

# Save the partially cleaned mini file (index=False keeps the row index out of the CSV).
hmda.to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

# hmda.to_csv('../input_data_clean/state_AZ-CA_clean.csv',index=False)
# hmda.sample(50000).to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

#todo check that it works: load the mini file (read_csv) you just saved and look at it... ok? same index? no extra var?

Save a report on the dataset. This will be useful to refer back to later on.

In [1]:
!pip install -U ydata-profiling

Collecting ydata-profiling
  Using cached ydata_profiling-4.0.0-py2.py3-none-any.whl (344 kB)
Collecting typeguard<2.14,>=2.13.2
  Using cached typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting pydantic<1.11,>=1.8.1
  Using cached pydantic-1.10.4-cp39-cp39-macosx_10_9_x86_64.whl (2.9 MB)
Collecting visions[type_image_path]==0.7.5
  Using cached visions-0.7.5-py3-none-any.whl (102 kB)
Collecting htmlmin==0.1.12
  Using cached htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting phik<0.13,>=0.11.1
  Using cached phik-0.12.3-cp39-cp39-macosx_10_13_x86_64.whl (652 kB)
Collecting multimethod<1.10,>=1.4
  Using cached multimethod-1.9.1-py3-none-any.whl (10 kB)
Collecting tangled-up-in-unicode>=0.0.4
  Using cached tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
Collecting imagehash
  Using cached ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
Building wheels for collected packages: htmlmin
  Building wheel for htmlmin (setup.py) ... [?25ldone
[?25

In [7]:
# Write the profiling report only when its HTML file does not exist yet.
# To regenerate the report, delete the file and run this cell again.
report_path = "mini_HMDA_partial_clean.html"
if not os.path.exists(report_path):
    #!pip install -U ydata-profiling 
    from ydata_profiling import ProfileReport
    profile = ProfileReport(
        hmda,
        title="HMDA after existing cleaning steps",
        minimal=True,
    )
    profile.to_file(report_path)

# todo after more cleaning... output to outputs/HMDA_profile_report.html instead

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Census cleaning

1. Rename the variables into something usable.
1. Prepare a `census_tract` variable we can merge with the HMDA data. 

In [8]:
# census_vars.csv: column 1 is the census variable ID, column 2 is the name we use.
# It is loaded into a dict because pandas rename() takes an {old_name: new_name} mapping.
# newline='' is what the csv module asks for when it is handed an open file.
# Blank lines (which csv.reader yields as empty lists) are skipped so they cannot
# raise an IndexError on row[0].
with open('../input_data/census_vars.csv', newline='') as f:
    rename_dict = {row[0]: row[1] for row in csv.reader(f) if row}
    

In [11]:
#todo when the below is done
# if not os.path.exists('../input_data_clean/census_clean.csv'):

#todo verifying the merge steps with juan...
census = (
    pd.read_csv('../input_data/census.csv')
    # Build the tract ID by CONCATENATING the zero-padded codes
    # (state: 2 digits, county: 3 digits, tract: 6 digits) and converting the result to an integer.
    # Adding the three numeric columns, as this step did before, sums the codes and
    # produces a meaningless key that different tracts can share.
    # assumes state/county/tract are read as integer columns with no missing values - TODO confirm
    .assign(
        census_tract=lambda x: (
            x["state"].astype(str).str.zfill(2)
            + x["county"].astype(str).str.zfill(3)
            + x["tract"].astype(str).str.zfill(6)
        ).astype("int64")
    )
    .rename(columns=rename_dict)
    # .drop(columns = ["state", "county", "tract"])
    #.to_csv('../input_data_clean/census_clean_sampel.csv',index=False)
)

#todo are these necessary?   depends on above
# census_clean["census_tract"] = census_clean["census_tract"].astype(str)
# census_clean["census_tract"] = census_clean["census_tract"].str.lstrip('0')
# census_clean["census_tract"] = census_clean["census_tract"].astype(int)

In [54]:
def census_concat(row):
    """Build the integer census tract ID for one row.

    The 'state', 'county' and 'tract' values of `row` are converted to text,
    left-padded with zeros to 2, 3 and 6 characters respectively, joined in
    that order, and the joined digits are returned as an int.
    """
    field_widths = (('state', 2), ('county', 3), ('tract', 6))
    padded_parts = [str(row[field]).zfill(width) for field, width in field_widths]
    return int(''.join(padded_parts))

# Rebuild the tract ID row by row with census_concat, then spot-check it against its components.
census['census_tract'] = census.apply(census_concat, axis=1)
# random_state fixes the sample so the rows displayed are the same on every run.
census[['census_tract','state','county','tract']].sample(50, random_state=0)

Unnamed: 0,census_tract,state,county,tract
569,6013353002,6,13,353002
9569,4027011403,4,27,11403
4251,6059087805,6,59,87805
2053,6037269906,6,37,269906
5123,6067007503,6,67,7503
5064,6067006201,6,67,6201
452,6013304002,6,13,304002
8534,4013112302,4,13,112302
4652,6065042408,6,65,42408
5398,6071002105,6,71,2105


In [55]:
# Number of non-missing tract IDs; compare with nunique() in the next cell to check for duplicates.
census['census_tract'].count()

9583

In [56]:
# Number of distinct tract IDs; it equals count() only when every tract ID is unique.
# NOTE(review): an earlier note here recorded count = 9583 vs nunique = 9140, which does not match
# the recorded output (9583) - presumably it predates the zero-padded ID; confirm on a fresh run.
census['census_tract'].nunique()

9583

In [58]:
# Save the census data (now including the census_tract key); index=False keeps the row index out of the CSV.
census.to_csv('../input_data_clean/census_clean.csv',index=False)

## Reduce the loan-level HMDA dataset to a Bank-Tract level dataset

First, create variables we need before the aggregation step. 

In [None]:
#todo if not os.path.exists('../input_data_clean/bank_tract.csv'):

#todo when all good, redo this with FULL
# Start from the CLEANED mini file written by the HMDA cleaning step, not the raw extract.
# Reloading the raw file throws the cleaning away: columns such as rate_spread can still hold
# placeholder text like "Exempt" there, and rate_spread is summed in the aggregation step.
hmda = pd.read_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv')

# action_taken codes that still need a decision:
# 5 incomplete app
# 6 purchased loan
# 7 is preapproval request denied              todo treat as app and deny?
# 8 preapproval approved but not accepted      todo treat as app and approve? change def below

# approved: action_taken is 1 or 2
hmda['approved'] = hmda['action_taken'].isin([1, 2])

# application: action_taken is below 4 (i.e., 1, 2 or 3)
hmda['application'] = hmda['action_taken'] < 4

def lei_map(lei):
    """Map a lender LEI to the bank label used in the analysis.

    Parameters
    ----------
    lei : str or missing value
        The lender identifier from the HMDA 'lei' column.

    Returns
    -------
    'Bank of West' or 'BMO Harris' for those two LEIs, NaN when `lei` is
    missing, and 'All Other Banks' for every other LEI.
    """
    # A missing LEI must be detected with pd.isna: `lei == np.nan` is always False
    # because NaN never compares equal to anything, so missing values used to be
    # labelled 'All Other Banks'.
    if pd.isna(lei):
        return np.nan
    named_banks = {
        'QOT5WN9RBKQTFRVKEV31': 'Bank of West',
        '3Y4U8VZURTYWI1W2K376': 'BMO Harris',
    }
    return named_banks.get(lei, 'All Other Banks')

# Label every loan with its bank group by passing each LEI through lei_map.
hmda['which_bank'] = hmda['lei'].map(lei_map)


In [None]:
#todo Other variables to add:

# $ of loans approved 
#     Create a new var = $loan if approved, na else; groupby will ignore na's while adding
# avg LTV (of all? or only of approved? < probably the latter)
    # if all, just add to groupby below. If approved, repeat the trick from above for creating $ of loans approved 
# avg spread (of all? or only of approved? < probably the latter)
# avg rate (of all? or only of approved? < probably the latter)
# avg loan size (of all? or only of approved? < probably the latter)


Second, aggregate the loan-level data to the bank-tract level:

In [None]:
# Collapse the loan-level data to one row per (bank group, census tract):
# counts of approvals and applications, summed loan amount and rate spread, and median income.
# (Fixes a syntax error: the 'rate_spread' entry ended with '.' instead of ','.)
bank_tract = hmda.groupby(['which_bank','census_tract']).agg(
    {'approved': 'sum',
     'application': 'sum',
     'loan_amount': 'sum',
     'rate_spread': 'sum',
     'income': 'median'}
)

# now, add denial rate and approval rate

# bank_tract.to_csv('../input_data_clean/bank_tract.csv',index=False)

# count apps and denials for each prod? for each purpose? for each combo?






## Merge the bank-tract data with the census tract level info

In [None]:

#todo, awaiting better 

# Deliberate stop: this merge is not ready yet, so halt "Run All" here with a clear message
# instead of relying on an undefined name that raises an unexplained NameError.
raise NotImplementedError("Merge step is unfinished - remove this line once the inputs are final.")

#todo try on hmda mini, then hmda full...

#todo if not os.path.exists('../input_data_clean/bank_tract_clean_WITH_CENSUS.csv'):
(
    pd.read_csv('../input_data_clean/census_clean.csv')
    # NOTE(review): no visible cell writes hmda_clean.csv; presumably the bank-tract file
    # is the intended right-hand input - confirm before enabling this cell.
    .merge(pd.read_csv('../input_data_clean/hmda_clean.csv'),
           on = "census_tract",
           how = "inner",
           # pandas only accepts the lowercase spelling "1:m" (or "one_to_many");
           # "1:M" is rejected with a ValueError.
           validate = "1:m")
    # index=False matches the other saves; the merged frame's index carries no information.
    .to_csv('../input_data_clean/bank_tract_clean_WITH_CENSUS.csv', index=False)
)