In [1]:
from collections import OrderedDict
import json
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import psycopg2

from lib.data_tools import *

# Let pandas render up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
def get_sql_results_df(table, sql, cur):
    """Run a table-templated query and return the full result set as a DataFrame.

    Parameters
    ----------
    table : str
        Table name substituted for the ``{table}`` placeholder in ``sql``.
        It is interpolated directly into the query text, so it must come
        from a trusted source.
    sql : str
        Query template, rendered with ``sql.format(table=table)``.
    cur : cursor
        Open database cursor used to execute the rendered query.

    Returns
    -------
    pandas.DataFrame
        Every fetched row, with columns named after the first field of each
        entry in ``cur.description``.
    """
    query = sql.format(table=table)
    cur.execute(query)
    # The column names are only available on the cursor after execute().
    header = [column[0] for column in cur.description]
    return pd.DataFrame(cur.fetchall(), columns=header)

def compile_dfs(tables, outfile, sql):
    """Run ``sql`` against each table, stack the results, and save them as CSV.

    Parameters
    ----------
    tables : iterable of str
        Table names to query. Characters 4-7 of each name are used as the
        year label, which matches names shaped like ``lar_2004_ffiec``.
    outfile : str
        Base file name (no extension); the combined result is written to
        ``../output/<outfile>.csv`` with ``|`` as the separator.
    sql : str
        Query template containing a ``{table}`` placeholder.

    Returns
    -------
    pandas.DataFrame
        The per-table results concatenated in table order. A ``year`` column
        is set from the table name (overwriting any ``year`` column the query
        itself returned).

    Raises
    ------
    ValueError
        If ``tables`` is empty; there would be nothing to concatenate.
    """
    tables = list(tables)
    if not tables:
        # Fail before opening a cursor so nothing is left to clean up.
        raise ValueError("compile_dfs requires at least one table name")

    cur = connect()
    try:
        frames = []
        for table in tables:
            data_df = get_sql_results_df(table, sql, cur)
            data_df["year"] = table[4:8]
            print(data_df.head())
            frames.append(data_df)
    finally:
        # Release the cursor even when one of the queries fails.
        cur.close()

    # Concatenate once instead of re-copying the growing frame on every pass.
    data_out_df = pd.concat(frames)
    data_out_df.to_csv("../output/" + outfile + ".csv", sep="|", index=False)
    return data_out_df
    

In [2]:
# Public HMDA LAR tables in Postgres: one table per year, 2004 through 2016.
lar_tables = ["lar_{}_ffiec".format(year) for year in range(2004, 2017)]

In [None]:
# Filters (WHERE clauses) used to build the aggregate data.
# Naming key: sf = single family, purch = purchase loan, conv = conventional loan,
# occ = owner occupied, first / junior = first or junior lien.
sf_purch_conv_occ_first = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='1'"
sf_purch_conv_occ_junior = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='2'"

# Loan-amount statistics for first-lien loans across every LAR table, saved
# pipe-delimited. get_lar_stats comes from lib.data_tools; its exact return
# shape is not visible in this notebook.
sf_purch_conv_occ_first_amt_df = pd.DataFrame(
    data=get_lar_stats("amount", lar_tables, where=sf_purch_conv_occ_first)
)
sf_purch_conv_occ_first_amt_df.to_csv(
    "../output/sf_purch_conv_occ_first_amt.csv", sep="|", index=False
)

# Same statistics for junior-lien loans.
sf_purch_conv_occ_junior_amt_df = pd.DataFrame(
    data=get_lar_stats("amount", lar_tables, where=sf_purch_conv_occ_junior)
)
sf_purch_conv_occ_junior_amt_df.to_csv(
    "../output/sf_purch_conv_occ_junior_amt.csv", sep="|", index=False
)

pulling metrics for: lar_2004_ffiec


In [4]:
# The lien filters are defined again here so this cell can run without the
# loan-amount cell having been executed first.
sf_purch_conv_occ_first = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='1'"
sf_purch_conv_occ_junior = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='2'"
# Extra condition that drops rows whose income contains "NA" or "na".
inc_na_where = " AND income NOT LIKE '%NA%' AND income not like '%na%' "

# Income statistics for first-lien loans, saved pipe-delimited.
sf_purch_conv_occ_first_inc_df = pd.DataFrame(
    data=get_lar_stats("income", lar_tables, where=sf_purch_conv_occ_first + inc_na_where)
)
sf_purch_conv_occ_first_inc_df.to_csv(
    "../output/sf_purch_conv_occ_first_inc.csv", sep="|", index=False
)

# Income statistics for junior-lien loans.
sf_purch_conv_occ_junior_inc_df = pd.DataFrame(
    data=get_lar_stats("income", lar_tables, where=sf_purch_conv_occ_junior + inc_na_where)
)
sf_purch_conv_occ_junior_inc_df.to_csv(
    "../output/sf_purch_conv_occ_junior_inc.csv", sep="|", index=False
)


pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec
pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec


In [None]:
# Action frequency per year: row count and rounded percent of all rows in the
# table. (No cumulative percent is computed by this query.)
cur = connect()

def get_action_frequency(table):
    """Return per-(year, action) counts and percentages for one LAR table.

    Parameters
    ----------
    table : str
        Table name substituted for ``{table}`` in the query.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``action``, ``freq_pct`` (COUNT(action) as a rounded
        percentage of the table's total row count) and ``action_count``
        (COUNT(action) for action codes '1'-'8', NULL for any other code),
        ordered by ``action``.

    Notes
    -----
    Executes on the module-level cursor ``cur`` opened in this cell. That
    cursor is closed once the loop below finishes, so re-run the cell before
    calling this function again.
    """
    sql = """
    SELECT year, action
    ,(ROUND(COUNT(action) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,(CASE WHEN action='1' THEN COUNT(action) 
    WHEN action='2' THEN COUNT(action)  
    WHEN action='3' THEN COUNT(action) 
    WHEN action='4' THEN COUNT(action) 
    WHEN action='5' THEN COUNT(action) 
    WHEN action='6' THEN COUNT(action) 
    WHEN action='7' THEN COUNT(action)  
    WHEN action='8' THEN COUNT(action) END) AS action_count
    FROM {table}
    GROUP BY year, action
    ORDER BY action"""
    # Reuse the shared helper instead of repeating the execute/fetch logic.
    return get_sql_results_df(table, sql, cur)

action_frames = []
try:
    for table in lar_tables:
        action_df = get_action_frequency(table)
        print(action_df.head())
        action_frames.append(action_df)
finally:
    # Release the cursor even if one of the yearly queries fails.
    cur.close()

# Concatenate once rather than re-copying the growing frame on each pass.
action_freq_df = pd.concat(action_frames)
action_freq_df.to_csv("../output/action_freq.csv", sep="|", index=False)

In [None]:
# Race/Action matrix counts: one row per action code, one column per race_1
# category, built per year with the tablefunc crosstab() function.
cur = connect()

def get_action_race_tab(table):
    """Return the action-by-race count matrix for one LAR table.

    Parameters
    ----------
    table : str
        Table name substituted for ``{table}`` in the query.

    Returns
    -------
    pandas.DataFrame
        Columns ``action_type``, ``native``, ``asian``, ``black``,
        ``islander``, ``white``, ``no_info`` and ``NA``.

    Notes
    -----
    Executes on the module-level cursor ``cur`` opened in this cell. That
    cursor is closed once the loop below finishes, so re-run the cell before
    calling this function again. The query needs the ``tablefunc`` extension
    (see the commented-out CREATE EXTENSION line in the SQL).

    NOTE(review): single-argument crosstab() fills the output columns
    positionally, so an action row that lacks one of the race_1 categories
    would have its counts shifted left. Confirm every action has all seven
    race codes in every year, or switch to the two-argument form.
    """
    sql = """--CREATE EXTENSION tablefunc;
   SELECT * 
    FROM crosstab( 
    'SELECT action, race_1, count(race_1)
    from {table} group by 1,2 order by 1,2')  
    AS final_result(
    "action_type" varchar, 
    "native" bigint, "asian" bigint, "black" bigint, "islander" bigint, "white" bigint, 
    "no_info" bigint,"NA" bigint)"""
    # Reuse the shared helper instead of repeating the execute/fetch logic.
    return get_sql_results_df(table, sql, cur)

action_race_frames = []
try:
    for table in lar_tables:
        year = table[4:8]  # "lar_2004_ffiec" -> "2004"
        action_race_df = get_action_race_tab(table)
        action_race_df["year"] = year
        print(action_race_df.head())
        action_race_frames.append(action_race_df)
finally:
    # Release the cursor even if one of the yearly queries fails.
    cur.close()

# Concatenate once rather than re-copying the growing frame on each pass.
action_race_freq_df = pd.concat(action_race_frames)
action_race_freq_df.to_csv("../output/action_race_freq_df.csv", sep="|", index=False)
    

In [6]:
# Race/Action matrix counts, one crosstab per year, compiled with compile_dfs.
# Idea: could group by MSA and filter lenders for the highest delta from the
# race pattern in their MSA.
# The query relies on the tablefunc extension (see the commented-out
# CREATE EXTENSION line inside the SQL).
sql = """
        --CREATE EXTENSION tablefunc;
        SELECT * 
        FROM crosstab( 
        'SELECT action, race_1, COUNT(race_1)
        FROM {table} GROUP BY 1,2 ORDER BY 1,2')  

    AS final_result(
    "action" varchar, 
    "native" bigint, "asian" bigint, "black" bigint, "islander" bigint, "white" bigint, "no_info" bigint
    ,"NA" bigint)"""

race_action_crosstab_df = compile_dfs(
    tables=lar_tables,
    outfile="race_action_crosstab",
    sql=sql,
)

  action  native   asian    black  islander     white  no_info      NA  year
0      1  140056  615750  1124835     91750  10934368  1957773  175044  2004
1      2   26445  104673   245225     16263   1452744   506430   35101  2004
2      3   86952  158821   768290     38793   3404770  1546070   36248  2004
3      4   39364   98713   310730     25007   1969340  1148152   27660  2004
4      5   10482   28601    90611      5250    571880   348372   10053  2004
  action  native   asian    black  islander     white  no_info     NA  year
0      1  134501  632885  1294412     99835  11530293  1855763  90607  2005
1      2   24169  108135   255840     17915   1484225   473689   3359  2005
2      3   86641  195822   921086     45584   3918036  1525795   3934  2005
3      4   58794  117950   418745     26779   2384891  1280282   7897  2005
4      5    9840   31847    99027      5910    630546   416815    873  2005
  action  native   asian    black  islander     white  no_info     NA  year
0     

In [None]:
# Applicant sex per year: row count and rounded percent of all rows in the
# table. (No cumulative percent is computed by this query.)
sql = """    SELECT year, app_sex
    ,(ROUND(COUNT(app_sex) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(app_sex)
    FROM {table}
    GROUP BY year, app_sex
    ORDER BY app_sex"""

app_sex_count_df = compile_dfs(
    tables=lar_tables,
    outfile="app_sex_count",
    sql=sql,
)


   year app_sex freq_pct     count
0  2004       1       59  19856343
1  2004       2       27   9015409
2  2004       3        6   2120023
3  2004       4        8   2638699
   year app_sex freq_pct     count
0  2005       1       57  20951405
1  2005       2       28  10180812
2  2005       3        7   2434585
3  2005       4        8   2890434
   year app_sex freq_pct     count
0  2006       1       54  18368256
1  2006       2       28   9522450
2  2006       3        7   2449426
3  2006       4       11   3815228
   year app_sex freq_pct     count
0  2007       1       56  15069150
1  2007       2       28   7499350
2  2007       3        7   1826881
3  2007       4        9   2306711
   year app_sex freq_pct     count
0  2008       1       59  10340093
1  2008       2       27   4760731
2  2008       3        6   1119714
3  2008       4        7   1310702
   year app_sex freq_pct     count
0  2009       1       59  11595453
1  2009       2       24   4728230
2  2009       3     

In [None]:
# Co-applicant sex per year: row count and rounded percent of all rows in
# the table.
sql = """  SELECT year, co_app_sex
    ,(ROUND(COUNT(co_app_sex) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(co_app_sex)
    FROM {table}
    GROUP BY year, co_app_sex
    ORDER BY co_app_sex"""

co_app_sex_count_df = compile_dfs(
    tables=lar_tables,
    outfile="co_app_sex_count",
    sql=sql,
)


   year co_app_sex freq_pct     count
0  2004          1        7   2433713
1  2004          2       32  10731439
2  2004          3        3   1135033
3  2004          4        8   2764622
4  2004          5       49  16565667
   year co_app_sex freq_pct     count
0  2005          1        7   2717919
1  2005          2       29  10435613
2  2005          3        3   1183541
3  2005          4        8   2815036
4  2005          5       53  19305127


In [None]:
# Applicant race (race_1) per year: row count and rounded percent of all rows
# in the table.
sql = """SELECT year, race_1
    ,(ROUND(COUNT(race_1) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(race_1)
    FROM {table}
    GROUP BY year, race_1
    ORDER BY race_1"""

race_freq_df = compile_dfs(
    tables=lar_tables,
    outfile="race_freq",
    sql=sql,
)

In [None]:
# Applicant ethnicity (app_eth) per year, restricted to rows with race_1='5'.
# NOTE(review): freq_pct divides by the count of ALL rows in the table, not
# only the race_1='5' rows, so the percentages will not sum to 100 within the
# filtered group - confirm that this denominator is intended.
sql = """SELECT year, app_eth
    ,(ROUND(COUNT(app_eth) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(app_eth)
 
    FROM {table}
    WHERE race_1='5'
    GROUP BY year, app_eth
    ORDER BY app_eth"""

eth_freq_df = compile_dfs(
    tables=lar_tables,
    outfile="eth_freq",
    sql=sql,
)

In [None]:
# Purchaser type per year for rows with loan_type='1': row count and rounded
# percent.
# NOTE(review): freq_pct divides by the count of ALL rows in the table, not
# only the loan_type='1' rows - confirm that this denominator is intended.
sql = """ SELECT year, purchaser
    ,(ROUND(COUNT(purchaser) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(purchaser)
    FROM {table}
    WHERE loan_type='1'
    GROUP BY year, purchaser
    ORDER BY purchaser"""

purchaser_freq_df = compile_dfs(
    tables=lar_tables,
    outfile="purch_freq_conv",
    sql=sql,
)

In [None]:
# Purchaser type per year for rows with loan_type other than '1': percent only.
# NOTE(review): an earlier version of this comment described an action=1
# filter and a count column; the query below applies no action filter and
# returns no count. freq_pct is also unrounded here and divides by the count
# of ALL rows in the table - confirm which behavior is intended.
sql = """
    SELECT year, purchaser
    ,(COUNT(purchaser) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    FROM {table}
    WHERE loan_type!='1'
    GROUP BY year, purchaser
    ORDER BY purchaser"""

purchaser_not_conv_df = compile_dfs(
    tables=lar_tables,
    outfile="purch_not_conv",
    sql=sql,
)

In [None]:
# Per institution key (CONCAT(agency, rid)) and year, for lien=1, loan_type=1,
# property_type=1 and rate_spread >= 15: row count and percent of all rows in
# the table.
#
# The numeric cast is wrapped in CASE because PostgreSQL does not guarantee the
# evaluation order of AND operands: with the cast and the NOT LIKE guards in
# one AND chain, the cast could run first on an 'NA' value and abort the query.
# CASE evaluates its WHEN condition before the THEN expression, so only values
# that passed both guards are ever cast.
sql = """
    SELECT year, CONCAT(agency, rid)
    ,(COUNT(CONCAT(agency, rid)) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    ,COUNT(CONCAT(agency, rid))

    FROM {table}
    WHERE loan_type='1' AND lien = '1' AND property_type = '1' AND
    CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '% %'
         THEN CAST(rate_spread AS FLOAT) >= 15
         ELSE FALSE END
    GROUP BY year, CONCAT(agency, rid)"""

rate_spread_conv_df = compile_dfs(lar_tables, "rate_spread_conv_1st", sql)

In [None]:
# Per institution key (CONCAT(agency, rid)) and year, for lien=1, loan_type=1,
# property_type=2 and rate_spread >= 15: row count and percent of all rows in
# the table.
#
# The numeric cast is wrapped in CASE because PostgreSQL does not guarantee the
# evaluation order of AND operands: with the cast and the NOT LIKE guard in one
# AND chain, the cast could run first on an 'NA' value and abort the query.
# CASE evaluates its WHEN condition before the THEN expression, so only values
# that passed the guard are ever cast.
# NOTE(review): unlike the property_type=1 query, this one does not exclude
# rate_spread values containing a space; an all-blank value would still fail
# the cast - confirm whether that filter should be added here as well.
sql = """ SELECT year, CONCAT(agency, rid)
    ,(COUNT(CONCAT(agency, rid)) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    ,COUNT(CONCAT(agency, rid))

    FROM {table}
    WHERE loan_type='1' AND lien = '1' AND property_type = '2' AND
    CASE WHEN rate_spread NOT LIKE '%NA%'
         THEN CAST(rate_spread AS FLOAT) >= 15
         ELSE FALSE END
    GROUP BY year, CONCAT(agency, rid)"""
rate_spread_manu_df = compile_dfs(lar_tables, "rate_spread_manu_1st", sql)

In [None]:
# First denial reason (denial_1) per year for rows with loan_type='1' and
# action='3'. freq_pct is the share of rows matching that same filter, because
# the denominator subquery repeats the WHERE conditions.
sql = """
    SELECT year, denial_1
    ,(COUNT(denial_1)) *100.0/ (SELECT COUNT(*) FROM {table} WHERE action='3' AND loan_type='1') AS freq_pct
    FROM {table}
    WHERE loan_type='1' AND action='3'
    GROUP BY year, denial_1
"""

denial_freq_df = compile_dfs(
    tables=lar_tables,
    outfile="denial_freq",
    sql=sql,
)