In [1]:
from collections import OrderedDict
import json
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import psycopg2

from lib.data_tools import *

# Raise the pandas display limits so frames of up to 500 rows / 500 columns
# are shown in full instead of being truncated in cell output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Names of the public HMDA LAR tables in Postgres: one table per year,
# 2004 through 2016 inclusive, named "lar_<year>_ffiec".
lar_tables = list("lar_{}_ffiec".format(lar_year) for lar_year in range(2004, 2017))

In [None]:
# Filters (WHERE clauses) used to build the aggregate data.
# Abbreviations: sf = single family, purch = purchase loan, conv = conventional loan,
# occ = owner occupied, first / junior = first lien / junior lien.
_sf_purch_conv_occ = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1'"
sf_purch_conv_occ_first = _sf_purch_conv_occ + " AND lien='1'"
sf_purch_conv_occ_junior = _sf_purch_conv_occ + " AND lien='2'"

# "amount" stats across every LAR table for the first-lien filter,
# written out as a pipe-delimited file without the index column.
first_lien_amt_stats = get_lar_stats("amount", lar_tables, where=sf_purch_conv_occ_first)
sf_purch_conv_occ_first_amt_df = pd.DataFrame(first_lien_amt_stats)
sf_purch_conv_occ_first_amt_df.to_csv(
    "../output/sf_purch_conv_occ_first_amt.csv", sep="|", index=False
)

# Same pull for the junior-lien filter.
junior_lien_amt_stats = get_lar_stats("amount", lar_tables, where=sf_purch_conv_occ_junior)
sf_purch_conv_occ_junior_amt_df = pd.DataFrame(junior_lien_amt_stats)
sf_purch_conv_occ_junior_amt_df.to_csv(
    "../output/sf_purch_conv_occ_junior_amt.csv", sep="|", index=False
)

pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec
pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec


In [None]:
# WHERE clauses are re-declared here so this cell does not depend on an earlier one.
# sf = single family, purch = purchase loan, conv = conventional loan,
# occ = owner occupied, first / junior = first lien / junior lien.
sf_purch_conv_occ_first = (
    "WHERE property_type='1' AND purpose='1' "
    "AND loan_type='1' AND occupancy='1' AND lien='1'"
)
sf_purch_conv_occ_junior = (
    "WHERE property_type='1' AND purpose='1' "
    "AND loan_type='1' AND occupancy='1' AND lien='2'"
)
# Extra predicate that drops rows whose income text contains "NA" or "na".
inc_na_where = " AND income NOT LIKE '%NA%' AND income not like '%na%' "

# "income" stats for the first-lien filter, pipe-delimited, no index column.
first_lien_inc_where = sf_purch_conv_occ_first + inc_na_where
sf_purch_conv_occ_first_inc_df = pd.DataFrame(
    get_lar_stats("income", lar_tables, where=first_lien_inc_where)
)
sf_purch_conv_occ_first_inc_df.to_csv(
    "../output/sf_purch_conv_occ_first_inc.csv", sep="|", index=False
)

# "income" stats for the junior-lien filter.
junior_lien_inc_where = sf_purch_conv_occ_junior + inc_na_where
sf_purch_conv_occ_junior_inc_df = pd.DataFrame(
    get_lar_stats("income", lar_tables, where=junior_lien_inc_where)
)
sf_purch_conv_occ_junior_inc_df.to_csv(
    "../output/sf_purch_conv_occ_junior_inc.csv", sep="|", index=False
)


pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec
pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec


In [None]:
# Action frequency per year: row count and percent of all rows for each action code.
# NOTE(review): the original heading also listed "cumulative percent", but the query in
# get_action_frequency only selects year, action, freq_pct and action_count.
# connect() is not defined in this notebook; presumably it comes from the
# `lib.data_tools` star import and returns a database cursor (the result is used
# via .execute / .description / .fetchall) -- TODO confirm.
cur = connect()
def get_action_frequency(table):
    """Query one LAR table for per-year, per-action counts and percentages.

    The query groups rows by (year, action) and selects:
      * freq_pct     -- COUNT(action) as a percentage of the table's total
                        row count, rounded by ROUND() with no digits argument.
      * action_count -- COUNT(action) for action codes '1' through '8';
                        the CASE has no ELSE, so any other code yields NULL.

    Parameters
    ----------
    table : str
        Table name, interpolated directly into the SQL text. It must be a
        trusted identifier; it is not escaped or parameterized.

    Returns
    -------
    pandas.DataFrame
        One row per (year, action) group, ordered by action, with column
        names taken from the cursor description.

    Uses the module-level cursor ``cur``.
    """
    query_template = """
    SELECT year, action
    ,(ROUND(COUNT(action) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,(CASE WHEN action='1' THEN COUNT(action) 
    WHEN action='2' THEN COUNT(action)  
    WHEN action='3' THEN COUNT(action) 
    WHEN action='4' THEN COUNT(action) 
    WHEN action='5' THEN COUNT(action) 
    WHEN action='6' THEN COUNT(action) 
    WHEN action='7' THEN COUNT(action)  
    WHEN action='8' THEN COUNT(action) END) AS action_count
    FROM {table}
    GROUP BY year, action
    ORDER BY action"""
    cur.execute(query_template.format(table=table))
    # The first item of each description entry is the result column's name.
    header = [column_meta[0] for column_meta in cur.description]
    result_rows = cur.fetchall()
    return pd.DataFrame(result_rows, columns=header)

# Pull the action-frequency frame for every LAR table, then stack them.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the whole accumulated frame on every iteration
# and needed a `first` flag to seed the accumulator.
action_frames = []
for table in lar_tables:
    action_df = get_action_frequency(table)
    print(action_df.head())  # quick per-table preview while the loop runs
    action_frames.append(action_df)

# Each frame keeps its own row index, as before; the index is not written out.
action_freq_df = pd.concat(action_frames)
action_freq_df.to_csv("../output/action_freq.csv", sep="|", index=False)

In [None]:
# Race/Action matrix counts: one row per action, one count column per race_1 category.
# Re-binds the module-level `cur` used by get_action_race_tab below.
# NOTE(review): connect() is not defined in this notebook (presumably supplied by the
# `lib.data_tools` star import); whether it opens a new connection each call, and
# whether the previously bound cursor needs closing, cannot be told from here -- confirm.
cur=connect()
def get_action_race_tab(table):
    """Cross-tabulate action (rows) against race_1 (columns) for one LAR table.

    Runs a tablefunc ``crosstab`` query through the module-level cursor
    ``cur``. The tablefunc extension must already be installed in the
    database (see the commented-out CREATE EXTENSION at the top of the SQL).

    The two-argument form of ``crosstab`` is used. The single-argument form
    fills the value columns left to right with whatever categories happen to
    exist for a row, so an action with no rows for some race_1 value would
    have its later counts shifted under the wrong race column. With an
    explicit category list each count lands in its own column and a missing
    combination comes back as NULL.

    Parameters
    ----------
    table : str
        Table name, interpolated directly into the SQL text. It must be a
        trusted identifier; it is not escaped or parameterized.

    Returns
    -------
    pandas.DataFrame
        Columns: action_type, native, asian, black, islander, white,
        no_info, NA (names taken from the cursor description).
    """
    # Category list, in output-column order.
    # NOTE(review): assumes race_1 holds the codes 1-7 in the same order as
    # the output columns (native .. NA) -- confirm against the table's data.
    sql = """--CREATE EXTENSION tablefunc;
    SELECT *
    FROM crosstab(
    'SELECT action, race_1, count(race_1)
    from {table} group by 1,2 order by 1,2',
    $$VALUES ('1'), ('2'), ('3'), ('4'), ('5'), ('6'), ('7')$$)
    AS final_result(
    "action_type" varchar,
    "native" bigint, "asian" bigint, "black" bigint, "islander" bigint, "white" bigint,
    "no_info" bigint,"NA" bigint)""".format(table=table)
    cur.execute(sql)
    # The first item of each description entry is the result column's name.
    colnames = [desc[0] for desc in cur.description]
    return pd.DataFrame(cur.fetchall(), columns=colnames)

# Build the action/race cross-tab for every LAR table, tag each with its
# year, then stack them. Frames are collected in a list and concatenated
# once: calling pd.concat inside the loop re-copies the whole accumulated
# frame on every iteration and needed a `first` flag to seed the accumulator.
action_race_frames = []
for table in lar_tables:
    # Table names look like "lar_<YYYY>_ffiec", so characters 4-7 are the year.
    year = table[4:8]
    action_race_df = get_action_race_tab(table)
    action_race_df["year"] = year
    print(action_race_df.head())  # quick per-table preview while the loop runs
    action_race_frames.append(action_race_df)

# Each frame keeps its own row index, as before; the index is not written out.
action_race_freq_df = pd.concat(action_race_frames)
action_race_freq_df.to_csv("../output/action_race_freq_df.csv", sep="|", index=False)
    