# Prep the datasets to use in analysis

1. Clean both HMDA and census datasets.
1. Reduce the HMDA loan-level file to a bank-tract dataset.
1. Merge the bank-tract dataset with the census data.


In [1]:
import os
import csv
import pandas as pd
import numpy as np

# Ensure the folder for cleaned datasets exists (no error if it is already there).
os.makedirs(name='../input_data_clean/', exist_ok=True)


## HMDA Cleaning

- Clean the var types
- Replace coded values with readable labels (e.g. loan type 1 becomes "Conventional")

In [3]:
#todo when complete, delete this comment and uncomment the if, indent everything below
# if not os.path.exists('../input_data_clean/state_AZ-CA_clean.csv'):

#todo load the WHOLE thing when code is set
hmda = (
    pd.read_csv('../input_data/state_AZ-CA_mini.csv')
    # these placeholder codes/strings are treated as missing in every column
    .replace(["Exempt","1111",1111,99999,"8888","9999",999.99,"NA"], np.nan)
    # Yes/No age flag becomes a 1/0 indicator (anything else maps to NaN) ...
    .assign(simplified_applicant_age_above_62=lambda df: df['applicant_age_above_62'].map({'Yes': 1, 'No': 0}))
    # ... and the original text column is dropped
    .drop(columns=['applicant_age_above_62'])
    # 1 for AZ, 0 for CA (anything else maps to NaN)
    .assign(state_simplified=lambda df: df['state_code'].map({'AZ': 1, 'CA': 0}))
)


In [17]:
# Code -> label lookups, keyed by the column they are applied to.
outcome_labels = {
    "action_taken": {
        1:"Loan Originated",
        2:"Application approved but not accepted",
        3:"Application denied",
        4:"Application withdrawn by applicant",
        5:"File closed for incompleteness",
        6:"Purchased loan",
        7:"Preapproval request denied",
        8:"Preapproval request approved but not accepted"
    },
    "purchaser_type": {
        0:"Not applicable",
        1:"Fannie Mae",
        2:"Ginnie Mae",
        3:"Freddie Mac",
        4:"Farmer Mac",
        5:"Private securitizer",
        6:"Commercial bank savings bank or savings association",
        71:"Credit union mortgage company or finance company",
        72:"Life insurance company",
        8:"Affiliate institution",
        9:"Other Type of purchaser"
    },
    "preapproval": {
        1:"Preapproval requested",
        2:"Preapproval not requested"
    },
}
# Swap the numeric codes for their labels, one column at a time.
for field, labels in outcome_labels.items():
    hmda[field] = hmda[field].replace(labels)
# Overwrite loan_type with readable labels. The assignment target must be
# "loan_type": writing to a misspelled name would leave loan_type as raw codes
# and add a stray extra column to the dataset.
# NOTE(review): "Affiars" in the VA label looks like a typo for "Affairs"; the
# label text is left unchanged here in case other code matches on it.
hmda["loan_type"] = hmda["loan_type"].replace({
    1:"Conventional",
    2:"Federal Housing Administration FHA",
    3:"Veterans Affiars guaranteed VA",
    4:"USDA Rural Housing Service or the Farm Service Agency guaranteed RHS or FSA"
})
# Credit-score model labels shared by the applicant and co-applicant columns.
# NOTE(review): "VintageScore" is probably meant to be "VantageScore"; the
# label text is reproduced unchanged.
credit_model_labels = {
    1:"Equifax Beacon 5.0",
    2:"Experian Fair Isaac",
    3:"FICO Risk Score Classic 04",
    4:"FICO Risk Score Classic 98",
    5:"VintageScore 2.0",
    6:"VintageScore 3.0",
    7:"More than one credit scoring model",
    8:"Other credit scoring model",
    9:"Not Applicable"
}

# Code -> label lookups, keyed by the column they are applied to.
feature_labels = {
    "loan_purpose": {
        1:"Home purchase",
        2:"Home improvement",
        31:"Refinancing",
        32:"Cash-out refinancing",
        4:"Other purpose",
        5:"Not applicable",
    },
    "lien_status": {
        1:"Secured by a first lien",
        2:"Secured by a subordinate lien"
    },
    "hoepa_status": {
        1:"High-cost mortgage",
        2:"Not a high-cost mortgage",
        3:"Not applicable"
    },
    "reverse_mortgage": {
        1:"Reverse mortgage",
        2:"Not a reverse mortgage",
        1111:"Exempt"
    },
    "open-end_line_of_credit": {
        1:"Open-end line of credit",
        2:"Not an open-end line of credit",
        1111:"Exempt"
    },
    "business_or_commercial_purpose": {
        1:"Primarily for a business or commercial purpose",
        2:"Not primarily for a business or commercial purpose",
        1111:"Exempt"
    },
    "negative_amortization": {
        1:"Negative amortization",
        2:"No negative amortization",
        1111:"Exempt"
    },
    "interest_only_payment": {
        1:"Interest-only payment",
        2:"No interest-only payments",
        1111:"Exempt"
    },
    "balloon_payment": {
        1:"Balloon payment",
        2:"No balloon payment",
        1111:"Exempt"
    },
    "other_nonamortizing_features": {
        1:"Other non-fully amortizing features",
        2:"No other non-fully amortizing features",
        1111:"Exempt"
    },
    "construction_method": {
        1:"Site-built",
        2:"Manufactured Home",
        1111:"Exempt"
    },
    "occupancy_type": {
        1:"Principal residence",
        2:"Second residence",
        3:"Investment Property"
    },
    "applicant_credit_score_type": credit_model_labels,
    # the co-applicant column has one extra code
    "co-applicant_credit_score_type": {**credit_model_labels, 10:"No co-applicant"},
}
# Swap the numeric codes for their labels, one column at a time.
for field, labels in feature_labels.items():
    hmda[field] = hmda[field].replace(labels)
#did not include eth-4 or 5
eths_to_modify = ["applicant_ethnicity-1","applicant_ethnicity-2","applicant_ethnicity-3",
            "co-applicant_ethnicity-1","co-applicant_ethnicity-2","co-applicant_ethnicity-3"]
# HMDA ethnicity codes. The Hispanic/Latino sub-categories are the two-digit
# codes 11-14. Every key in this dict must be unique: Python keeps only the
# last value for a repeated key, so reusing 2/3/4 for the sub-categories would
# silently drop them. Code 5 is "No co-applicant" (co-applicant columns only).
eths_replace={
    1:"Hispanic or Latino",
    11:"Mexican",
    12:"Puerto Rican",
    13:"Cuban",
    14:"Other Hispanic or Latino",
    2:"Not Hispanic or Latino",
    3:"Info not provided by applicant in mail internet or telephone appplication",
    4:"Not Applicable",
    5:"No co-applicant"
}
# Apply the same lookup to every applicant / co-applicant ethnicity column.
for col in eths_to_modify:
    hmda[col] = hmda[col].replace(eths_replace)


# applicant_race-1 ... applicant_race-5, then the same five for the co-applicant
race_to_modify = [f"{who}_race-{slot}"
                  for who in ("applicant", "co-applicant")
                  for slot in range(1, 6)]
# Race code -> label lookup shared by all ten columns.
race_replace={
    1:"American Inidian or Alaska Native",
    2:"Asian",
    21:"Asian Indian",
    22:"Chinese",
    23:"Filipino",
    24:"Japanese",
    25:"Korean",
    26:"Vietnamese",
    27:"Other Asian",
    3:"Black or African American",
    4:"Native Hawaiian or Other Pacific Islander",
    41:"Native Hawaiian",
    42:"Guamanian",
    43:"Samoan",
    44:"Other Pacific Islander",
    5:"White",
    6:"Info not provided by applicant in mail internet or telephone appplication",
    7:"Not Applicable",
    8:"No co-applicant"
}
for race_col in race_to_modify:
    hmda[race_col] = hmda[race_col].replace(race_replace)

# <who>_<trait>_observed for each trait, applicant first and then co-applicant
obs_to_modify = [f"{who}_{trait}_observed"
                 for trait in ("ethnicity", "race", "sex")
                 for who in ("applicant", "co-applicant")]
# Code -> label lookup for how the demographic information was collected.
obs_replace={
    1:"Collected on the basis of visual observation or surname",
    2:"Not collected on the basis of visual observation or surname",
    3:"Not applicable",
    4:"No co-applicant"
}
for obs_col in obs_to_modify:
    hmda[obs_col] = hmda[obs_col].replace(obs_replace)
    
# The applicant and co-applicant sex columns use one and the same lookup.
sex_labels = {
    1:"Male",
    2:"Female",
    3:"Information not provided by applicant in mail internet or telephone application",
    4:"Not applicable",
    5:"No co-applicant",
    6:"Applicant selected both male and female"
}
for sex_col in ("applicant_sex", "co-applicant_sex"):
    hmda[sex_col] = hmda[sex_col].replace(sex_labels)
# denial_reason-1 ... denial_reason-3
dens_to_modify = [f"denial_reason-{slot}" for slot in range(1, 4)]
# Denial-reason code -> label lookup shared by the three columns.
dens_replace={
    1:"Debt-to-income ratio",
    2:"Employment history",
    3:"Credit history",
    4:"Collateral",
    5:"Insufficient cash downpayment closing costs",
    6:"Unverifiable information",
    7:"Credit application incomplete",
    8:"Mortgage insurance denied",
    9:"Other",
    10:"Not applicable",
    1111:"Exempt"
}
for reason_col in dens_to_modify:
    hmda[reason_col] = hmda[reason_col].replace(dens_replace)

## hmda['loan_to_value_ratio'].describe()
# Quick spot checks after relabelling. A notebook cell only renders its last
# expression, so the interest_rate summary is computed but not displayed; only
# the applicant_ethnicity-1 summary shows up in the output.
hmda['interest_rate'].describe()
hmda['applicant_ethnicity-1'].describe()

In [4]:
#rate_spread & income have negative values, ASK
# Cast the pricing / term fields to float. Each column is listed exactly once
# (a repeated key in a dict literal is silently overwritten by the last one).
hmda = hmda.astype({
    'loan_to_value_ratio': float,
    'interest_rate': float,
    'rate_spread': float,
    'prepayment_penalty_term': float,
    'intro_rate_period': float,
    'property_value': float,
    'total_loan_costs': float,
    'total_points_and_fees': float,
    'origination_charges': float,
    'discount_points': float,
    'lender_credits': float,
    'loan_term': float
})

#todo ltv is float but the tails look binned? investigate
#todo use the output report to continue cleaning
#todo when done, save it:

# index=False keeps the row index out of the saved file
hmda.to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

# hmda.to_csv('../input_data_clean/state_AZ-CA_clean.csv',index=False)
# hmda.sample(50000).to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

#todo check that it works: load the mini file (read_csv) you just saved and look at it... ok? same index? no extra var?

Save a report on the dataset. This will be useful to refer back to later on.

In [None]:
# The report is only built when its HTML file is missing;
# to rerun this, delete the file
report_file = "mini_HMDA_partial_clean.html"
if not os.path.exists(report_file):
    #!pip install -U ydata-profiling 
    # imported here so the package is only needed when the report is (re)built
    from ydata_profiling import ProfileReport
    profile = ProfileReport(
        hmda,
        title="HMDA after existing cleaning steps",
        minimal=True,
    )
    profile.to_file(report_file)
    
# todo after more cleaning... output to outputs/HMDA_profile_report.html instead    

## Census cleaning

1. Rename the variables into something usable.
1. Prepare a `census_tract` variable we can merge with the HMDA data. 

In [None]:
# this csv: col1 is census var ID name, col2 is the name we are using
# we are importing this into a dictionary bc pandas rename() wants that structure
# - newline='' is how the csv module documentation says to open files for csv.reader
# - rows with fewer than two fields (e.g. a blank line at the end of the file)
#   are skipped instead of raising IndexError
with open('../input_data/census_vars.csv', newline='') as f:
    rename_dict = {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}
    

In [None]:
#todo when the below is done
# if not os.path.exists('../input_data_clean/census_clean.csv'):

#todo verifying the merge steps with juan...
census = (
        # Read the three geography codes as text so that "+" below concatenates
        # them. With numeric columns "+" adds the values, and different
        # state/county/tract combinations can then land on the same number.
        pd.read_csv('../input_data/census.csv',
                    dtype = {"state": str, "county": str, "tract": str})
        # Tract id = state (2 digits) + county (3 digits) + tract (6 digits),
        # left-padded with zeros in case the file dropped leading zeros.
        # NOTE(review): assumes "tract" holds the 6-digit code without a
        # decimal point -- confirm against census.csv.
        .assign(census_tract = lambda x: x["state"].str.zfill(2)
                                         + x["county"].str.zfill(3)
                                         + x["tract"].str.zfill(6))
        .rename(columns = rename_dict)
        # .drop(columns = ["state", "county", "tract"])
        # .to_csv('../input_data_clean/census_clean.csv',index=False)
)

# census_tract is an 11-character string here. The HMDA key must be brought to
# the same type/format before merging (either pad HMDA's tract to a string, or
# convert this one to an integer as sketched below).
#todo are these necessary?   depends on above
# census_clean["census_tract"] = census_clean["census_tract"].astype(str)
# census_clean["census_tract"] = census_clean["census_tract"].str.lstrip('0')
# census_clean["census_tract"] = census_clean["census_tract"].astype(int)


In [None]:
# Number of distinct tract ids. The figures in the trailing comment were noted
# by the author on an earlier run: fewer unique values than non-missing values
# means some census_tract values repeat. Re-check them whenever the way
# census_tract is built changes.
census['census_tract'].nunique() # count = 9583, nunique = 9140

## Reduce the loan-level HMDA dataset to a Bank-Tract level dataset

First, create variables we need before the aggregation step. 

In [None]:
#todo if not os.path.exists('../input_data_clean/bank_tract.csv'):

#todo when all good, redo this with FULL
# Start again from the raw file, where action_taken still holds numeric codes.
hmda = pd.read_csv('../input_data/state_AZ-CA_mini.csv')

# action_taken codes left out of the flags below:
# 5 incomplete app
# 6 purchased loan 
# 7 is preapproval request denied              todo treat as app and deny?
# 8 preapproval approved but not accepted      todo treat as app and approve? change def below

# approved: action_taken is 1 or 2
hmda['approved'] = hmda['action_taken'].isin([1, 2])

# application: action_taken is below 4
hmda['application'] = hmda['action_taken'].lt(4)

def lei_map(lei):
    """Translate a lender LEI into the bank label used for grouping.

    Parameters
    ----------
    lei : str or missing value
        Legal Entity Identifier of the lender.

    Returns
    -------
    'Bank of West' or 'BMO Harris' for their respective LEIs, np.nan when the
    LEI is missing, and 'All Other Banks' for every other value.
    """
    # NaN never compares equal to anything (including itself), so a missing
    # LEI has to be detected with pd.isna rather than `== np.nan`.
    if pd.isna(lei):
        return np.nan
    if lei == 'QOT5WN9RBKQTFRVKEV31':
        return 'Bank of West'
    if lei == '3Y4U8VZURTYWI1W2K376':
        return 'BMO Harris'
    return 'All Other Banks'

# Label every loan with its bank group by running lei_map on each LEI.
hmda['which_bank'] = hmda['lei'].apply(lei_map)


In [None]:
#todo Other variables to add:

# $ of loans approved 
#     Create a new var = $loan if approved, na else; groupby will ignore na's while adding
# avg LTV (of all? or only of approved? < probably the latter)
    # if all, just add to groupby below. If approved, repeat the trick from above for creating $ of loans approved 
# avg spread (of all? or only of approved? < probably the latter)
# avg rate (of all? or only of approved? < probably the latter)
# avg loan size (of all? or only of approved? < probably the latter)


Second, aggregate by area:

In [None]:
# One row per (which_bank, census_tract): counts of approvals and applications,
# total loan amount, total rate spread, and median income.
bank_tract = (
    hmda
    # hmda was re-loaded from the raw file, so rate_spread has not been through
    # the float cast of the cleaning section. Presumably it can still contain
    # text such as "Exempt" (TODO confirm); coercing turns anything non-numeric
    # into NaN, which 'sum' skips. No-op when the column is already numeric.
    .assign(rate_spread = lambda d: pd.to_numeric(d['rate_spread'], errors='coerce'))
    .groupby(['which_bank','census_tract'])
    .agg(
        {'approved': 'sum', 
         'application': 'sum', 
         'loan_amount': 'sum',
         'rate_spread' : 'sum',
         'income': 'median'}
    )
)

# now, add denial rate and approval rate

# NOTE: after the groupby, which_bank and census_tract live in the index, so
# saving with index=False would drop them -- call .reset_index() first.
# bank_tract.reset_index().to_csv('../input_data_clean/bank_tract.csv',index=False)

# count apps and denials for each prod? for each purpose? for each combo?






## Merge the bank-tract data with the census tract level info

In [None]:

#todo, awaiting better 

# Deliberate stop so "Run All" halts here with a readable message: this cell is
# still a draft. Neither input file read below is written by the cells above
# (the census save is commented out, and the HMDA save uses a different file
# name). Remove this guard once the inputs exist.
# NOTE(review): the output is named bank_tract_..., but the draft merges the
# loan-level HMDA file rather than bank_tract -- confirm which one is intended.
raise NotImplementedError(
    "Merge step is not ready: census_clean.csv / hmda_clean.csv are not produced yet."
)

#todo try on hmda mini, then hmda full...

#todo if not os.path.exists('../input_data_clean/bank_tract_clean_WITH_CENSUS.csv'):
(
    pd.read_csv('../input_data_clean/census_clean.csv')
    .merge(pd.read_csv('../input_data_clean/hmda_clean.csv'),
           on = "census_tract", 
           how = "inner",
           # census (left) must have one row per tract; raises MergeError otherwise
           validate = "1:M")
    # index=False, like every other save in this notebook, keeps the meaningless
    # row index out of the file
    .to_csv('../input_data_clean/bank_tract_clean_WITH_CENSUS.csv', index=False)
)