In [1]:
import os
import csv
import pandas as pd
import numpy as np

os.makedirs('../input_data_clean/',exist_ok=True)

In [2]:
# Load clean HMDA Dataset
#todo when complete, delete this comment and uncomment the if, indent everything below
# if not os.path.exists('../input_data_clean/state_AZ-CA_clean.csv'):

hmda = pd.read_csv('../input_data/state_AZ-CA_mini.csv')

##################################################################
# SAMPLE RESTRICTION: ACTION TAKEN = 1,2,3 ONLY
##################################################################

hmda = hmda.query('action_taken <= 3') 

##################################################################
# START CLEANING
##################################################################

hmda = hmda.replace(["Exempt","1111",1111,"99999",99999,"8888",8888,"9999",9999,"999.99",999.99,"NA","nan"], np.nan)  

hmda['simplified_applicant_age_above_62'] = hmda['applicant_age_above_62'].map({'Yes': 1, 'No': 0})
hmda.drop(columns = ['applicant_age_above_62'],inplace=True)

hmda['state_simplified'] = hmda['state_code'].map({'AZ': 1, 'CA': 0})


hmda["action_taken"] = hmda["action_taken"].replace({
    1:"Loan Originated",
    2:"Application approved but not accepted",
    3:"Application denied",
    4:"Application withdrawn by applicant",
    5:"File closed for incompleteness",
    6:"Purchased loan",
    7:"Preapproval request denied",
    8:"Preapproval request approved but not accepted"
})
hmda["purchaser_type"] = hmda["purchaser_type"].replace({
    0:"Not applicable",
    1:"Fannie Mae",
    2:"Ginnie Mae",
    3:"Freddie Mac",
    4:"Farmer Mac",
    5:"Private securitizer",
    6:"Commercial bank savings bank or savings association",
    71:"Credit union mortgage company or finance company",
    72:"Life insurance company",
    8:"Affiliate institution",
    9:"Other Type of purchaser"
})
hmda["preapproval"] = hmda["preapproval"].replace({
    1:"Preapproval requested",
    2:"Preapproval not requested"
})
hmda["loant_type"] = hmda["loan_type"].replace({
    1:"Conventional",
    2:"Federal Housing Administration FHA",
    3:"Veterans Affiars guaranteed VA",
    4:"USDA Rural Housing Service or the Farm Service Agency guaranteed RHS or FSA"
})
hmda["loan_purpose"] = hmda["loan_purpose"].replace({
    1:"Home purchase",
    2:"Home improvement",
    31:"Refinancing",
    32:"Cash-out refinancing",
    4:"Other purpose",
    5:"Not applicable",
})
hmda["lien_status"] = hmda["lien_status"].replace({
    1:"Secured by a first lien",
    2:"Secured by a subordinate lien"
})
hmda["hoepa_status"] = hmda["hoepa_status"].replace({
    1:"High-cost mortgage",
    2:"Not a high-cost mortgage",
    3:"Not applicable"
})
hmda["reverse_mortgage"] = hmda["reverse_mortgage"].replace({
    1:"Reverse mortgage",
    2:"Not a reverse mortgage",
    1111:"Exempt"
})
hmda["open-end_line_of_credit"] = hmda["open-end_line_of_credit"].replace({
    1:"Open-end line of credit",
    2:"Not an open-end line of credit",
    1111:"Exempt"
})
hmda["business_or_commercial_purpose"] = hmda["business_or_commercial_purpose"].replace({
    1:"Primarily for a business or commercial purpose",
    2:"Not primarily for a business or commercial purpose",
    1111:"Exempt"
})
hmda["negative_amortization"] = hmda["negative_amortization"].replace({
    1:"Negative amortization",
    2:"No negative amortization",
    1111:"Exempt"
})
hmda["interest_only_payment"] = hmda["interest_only_payment"].replace({
    1:"Interest-only payment",
    2:"No interest-only payments",
    1111:"Exempt"
})
hmda["balloon_payment"] = hmda["balloon_payment"].replace({
    1:"Balloon payment",
    2:"No balloon payment",
    1111:"Exempt"
})
hmda["other_nonamortizing_features"] = hmda["other_nonamortizing_features"].replace({
    1:"Other non-fully amortizing features",
    2:"No other non-fully amortizing features",
    1111:"Exempt"
})
hmda["construction_method"] = hmda["construction_method"].replace({
    1:"Site-built",
    2:"Manufactured Home",
    1111:"Exempt"
})
hmda["occupancy_type"] = hmda["occupancy_type"].replace({
    1:"Principal residence",
    2:"Second residence",
    3:"Investment Property"
})
hmda["applicant_credit_score_type"] = hmda["applicant_credit_score_type"].replace({
    1:"Equifax Beacon 5.0",
    2:"Experian Fair Isaac",
    3:"FICO Risk Score Classic 04",
    4:"FICO Risk Score Classic 98",
    5:"VintageScore 2.0",
    6:"VintageScore 3.0",
    7:"More than one credit scoring model",
    8:"Other credit scoring model",
    9:"Not Applicable"
})
hmda["co-applicant_credit_score_type"] = hmda["co-applicant_credit_score_type"].replace({
    1:"Equifax Beacon 5.0",
    2:"Experian Fair Isaac",
    3:"FICO Risk Score Classic 04",
    4:"FICO Risk Score Classic 98",
    5:"VintageScore 2.0",
    6:"VintageScore 3.0",
    7:"More than one credit scoring model",
    8:"Other credit scoring model",
    9:"Not Applicable",
    10:"No co-applicant"
})
#did not include eth-4 or 5
eths_to_modify = ["applicant_ethnicity-1","applicant_ethnicity-2","applicant_ethnicity-3",
            "co-applicant_ethnicity-1","co-applicant_ethnicity-2","co-applicant_ethnicity-3"]
eths_replace={
    1:"Hispanic or Latino",
    2:"Mexican",
    3:"Puerto Rican",
    4:"Cuban",
    5:"Other Hispanic or Latino",
    2:"Not Hispanic or Latino",
    3:"Info not provided by applicant in mail internet or telephone appplication",
    4:"Not Applicable"
}
for col in eths_to_modify:
    hmda[col] = hmda[col].replace(eths_replace)


race_to_modify = ["applicant_race-1","applicant_race-2","applicant_race-3",
                  "applicant_race-4","applicant_race-5",
                  "co-applicant_race-1","co-applicant_race-2","co-applicant_race-3",
                  "co-applicant_race-4","co-applicant_race-5"]
race_replace={
    1:"American Inidian or Alaska Native",
    2:"Asian",
    21:"Asian Indian",
    22:"Chinese",
    23:"Filipino",
    24:"Japanese",
    25:"Korean",
    26:"Vietnamese",
    27:"Other Asian",
    3:"Black or African American",
    4:"Native Hawaiian or Other Pacific Islander",
    41:"Native Hawaiian",
    42:"Guamanian",
    43:"Samoan",
    44:"Other Pacific Islander",
    5:"White",
    6:"Info not provided by applicant in mail internet or telephone appplication",
    7:"Not Applicable",
    8:"No co-applicant"
}
for col in race_to_modify:
    hmda[col] = hmda[col].replace(race_replace)

obs_to_modify = ["applicant_ethnicity_observed","co-applicant_ethnicity_observed","applicant_race_observed",
            "co-applicant_race_observed","applicant_sex_observed","co-applicant_sex_observed"]
obs_replace={
    1:"Collected on the basis of visual observation or surname",
    2:"Not collected on the basis of visual observation or surname",
    3:"Not applicable",
    4:"No co-applicant"
}
for col in obs_to_modify:
    hmda[col] = hmda[col].replace(obs_replace)
    
hmda["applicant_sex"] = hmda["applicant_sex"].replace({
    1:"Male",
    2:"Female",
    3:"Information not provided by applicant in mail internet or telephone application",
    4:"Not applicable",
    5:"No co-applicant",
    6:"Applicant selected both male and female"
})
hmda["co-applicant_sex"] = hmda["co-applicant_sex"].replace({
    1:"Male",
    2:"Female",
    3:"Information not provided by applicant in mail internet or telephone application",
    4:"Not applicable",
    5:"No co-applicant",
    6:"Applicant selected both male and female"
})
dens_to_modify = ["denial_reason-1","denial_reason-2","denial_reason-3"]
dens_replace={
    1:"Debt-to-income ratio",
    2:"Employment history",
    3:"Credit history",
    4:"Collateral",
    5:"Insufficient cash downpayment closing costs",
    6:"Unverifiable information",
    7:"Credit application incomplete",
    8:"Mortgage insurance denied",
    9:"Other",
    10:"Not applicable",
    1111:"Exempt"
}
for col in dens_to_modify:
    hmda[col] = hmda[col].replace(dens_replace)

hmda['income'] = hmda['income'].where(hmda['income'] >= 0, np.nan)

tumapping = {'1': 1, '2': 2, '3': 3, '4': 4,'5-24': 5.24, '25-49': 25.49, '50-99': 50.99, '100-149': 100.149, '>149': 149.9}
hmda["total_units"] = hmda["total_units"].replace(tumapping)# create a dictionary that maps the binned values to numerical values


debt_to_incomeratio_mapping = {'<20%': 0.1, '20%-<30%': 0.25, '30%-<36%': 0.32,"36":0.36, 
                               '37' :0.37, '38': 0.38, '39':0.39, '40':0.4, '41':0.41,
                               '42' :0.42, '43':0.43, '44':0.44, '45':0.45, '46':0.46,
                               '47':0.47, '48':0.48, '49':0.49, '50%-60%': 0.55, '>60%':0.65}

hmda["debt_to_income_ratio2"] = hmda["debt_to_income_ratio"].map(debt_to_incomeratio_mapping)

hmda[["debt_to_income_ratio2","debt_to_income_ratio"]].count()

aamapping = {'<25': 25, '25-34': 30, '35-44': 40, '45-54': 50,
             '55-64': 60, '65-74': 70, '>74': 78,}
hmda["applicant_age"] = hmda["applicant_age"].replace(aamapping)
hmda["co-applicant_age"] = hmda["co-applicant_age"].replace(aamapping)
hmda["co-applicant_age"].describe()

hmda = hmda.astype({
    'loan_to_value_ratio': float,
    'rate_spread': float,
    'interest_rate': float,
    'prepayment_penalty_term': float,
    'intro_rate_period': float,
    'property_value': float,
    'total_loan_costs': float,
    'total_points_and_fees': float,
    'origination_charges': float,
    'discount_points': float,
    'lender_credits': float,
    'loan_term': float,
    'prepayment_penalty_term': float
})

#todo use the output report to continue cleaning
#todo when done, save it:

hmda.to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

# hmda.to_csv('../input_data_clean/state_AZ-CA_clean.csv',index=False)
# hmda.sample(50000).to_csv('../input_data_clean/state_AZ-CA_clean_MINI.csv',index=False)

In [3]:
hmda

Unnamed: 0,activity_year,lei,derived_msa-md,state_code,county_code,census_tract,conforming_loan_limit,derived_loan_product_type,derived_dwelling_category,derived_ethnicity,...,tract_minority_population_percent,ffiec_msa_md_median_family_income,tract_to_msa_income_percentage,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units,simplified_applicant_age_above_62,state_simplified,loant_type,debt_to_income_ratio2
1,2021,7H6GLXDRUGQFU57RNE97,46060.0,AZ,4019.0,4.019004e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,0.79,68600,109,1851.0,3048.0,29,0.0,1,Conventional,0.32
3,2021,549300YIQ7S7Z8PIHE53,,CA,6109.0,6.109005e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,11.89,69700,91,2597.0,3593.0,35,0.0,0,Conventional,0.10
4,2021,549300MGPZBLQDIL7538,40900.0,CA,6067.0,6.067002e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Joint,...,63.01,90900,58,146.0,937.0,68,0.0,0,Conventional,0.41
5,2021,549300QPD4M26VIOFU53,31084.0,CA,6037.0,6.037554e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,85.37,80000,83,142.0,491.0,48,0.0,0,Conventional,0.44
7,2021,549300AG64NHILB7ZP05,31084.0,CA,6037.0,6.037430e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,68.84,80000,97,653.0,998.0,43,0.0,0,Conventional,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49994,2021,549300VORTI31GZTJL53,31084.0,CA,6037.0,6.037599e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Joint,...,72.08,80000,100,294.0,1462.0,45,0.0,0,Conventional,0.10
49995,2021,254900VHUBSJJKOMBF27,40140.0,CA,6065.0,6.065043e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,46.06,77500,139,2081.0,2584.0,10,0.0,0,Conventional,0.36
49996,2021,549300SK2GVCQXPD4S58,38060.0,AZ,4013.0,4.013818e+09,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,54.11,79000,116,221.0,365.0,20,0.0,1,Conventional,0.38
49997,2021,549300O6Z0I6KYMESL47,31084.0,CA,6037.0,6.037910e+09,NC,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,45.44,80000,154,1504.0,1748.0,30,0.0,0,Conventional,0.39


In [39]:
counts = hmda.groupby('lei')['action_taken'].value_counts()

In [40]:
counts

lei
01KWVG908KE7RKPTNP46      7
0K2D5AK28E3O5CC06E35      3
0S8H5NJFLHEVJXVTQ413      3
1IE8VN30JCEQV1H4R804     23
207ALC1P1YM0OVDV0K75      2
                       ... 
WKN6AF1FCL7BBYGTGI83     14
WWB2V0FCW3A0EE3ZJN75      5
X05BVSK68TQ7YTOSNR22    104
YWC0TIKBQM2JV8L4IV08    226
ZXMJHJK466PBZTM5F379      2
Name: action_taken, Length: 731, dtype: int64

In [41]:
originated_counts = counts.loc[(slice(None), 'Loan Originated')]

IndexingError: Too many indexers

In [42]:
lei1_count_range = (0.5 * originated_counts['QOT5WN9RBKQTFRVKEV31'], 2 * originated_counts['QOT5WN9RBKQTFRVKEV31'])

In [43]:
filtered_counts = originated_counts[(lei1_count_range[0] <= originated_counts) & (originated_counts <= lei1_count_range[1])]

In [44]:
filtered_counts

lei
254900TTZ395IC926125    135
254900UL88QFG0E40516    101
254900VHUBSJJKOMBF27    130
254900ZFWS2106HWPH46    164
549300121SF0K2LN2804    130
5493001NJEVHTZW7FG34    161
5493001WHVQBGRSWEU75    228
5493003GQDUH26DNNH17    150
54930043BMDE130FJ617    115
5493004AS1SPBQOFDR49    143
5493005FJKWE0GR4YS94    109
5493006F1N8E3EEF3W63    200
5493008NWHQT1R22C024    135
5493008VVXQIDO1EZ460    152
549300AQ3T62GXDU7D76    315
549300DD4R4SYK5RAQ92    161
549300DD5QQUHO6PCH70    118
549300E2UX99HKDBR481    344
549300FNXYY540N23N64    390
549300GKFNPRWNS0GF29    144
549300GQDT484LGI9C04    163
549300HFXTV55C2HHM89    104
549300HIVO8XPBPNVG69    259
549300J7I82PNDVU8H22    114
549300J7XKT2BI5WX213    398
549300KIOYNU323LVJ37    345
549300LBCBNR1OT00651    391
549300NOCASXPA34X033    283
549300O6Z0I6KYMESL47    180
549300PC4MFWQBNVKG88    134
549300PIL8LFAQ04XC20    101
549300SK2GVCQXPD4S58    146
549300VORTI31GZTJL53    356
549300VZVN841I2ILS84    272
549300WTZMQSET2VY242    112
549300XQVJ1XBNFA

In [47]:
list1 = filtered_counts.index.tolist()