In [10]:
from collections import OrderedDict
import json
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import psycopg2

from lib.data_tools import *
from lib.data_tools import get_sql_results_df, compile_dfs

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

def get_sql_results_df(table, sql, cur, table2=None):
    if not table2:
        sql=sql.format(table=table)
    else:
        table2 = table[:4] + str(int(table[4:8])-1) + "_ffiec"
        sql=sql.format(table=table, table2=table2)
    cur.execute(sql)
    colnames = [desc[0] for desc in cur.description]
    data_df = pd.DataFrame(cur.fetchall(), columns=colnames)
    return data_df

def compile_dfs(tables, outfile, sql, table2=None):
    cur=connect()
    first = True
    for table in tables:
        year = table[4:8]
        data_df = get_sql_results_df(table, sql, cur, table2)
        data_df["year"] = year
        print(data_df.head())
        if first:
            first = False
            data_out_df = data_df.copy()
        else:
            data_out_df = pd.concat([data_out_df, data_df])
    data_out_df.to_csv("../output/"+outfile+".csv", sep="|", index=False)
    cur.close()
    return data_out_df

In [12]:
#list of public HMDA data tables in PG
lar_tables = ["lar_2004_ffiec", "lar_2005_ffiec", "lar_2006_ffiec", "lar_2007_ffiec", "lar_2008_ffiec", "lar_2009_ffiec"
         ,"lar_2010_ffiec", "lar_2011_ffiec", "lar_2012_ffiec", "lar_2013_ffiec", "lar_2014_ffiec", "lar_2015_ffiec",
         "lar_2016_ffiec"]

In [4]:
#Set up filters (where clauses) to build aggregate data 
#sf = single family, purch = purchase loan, conv = conventional loan 
#occ = owner occupied, first = first lien
sf_purch_conv_occ_first = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='1'"
#sf = single family, purch = purchase loan, conv = conventional loan, 
#occ = owner occupied, junior = junior lien
sf_purch_conv_occ_junior = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='2'"

sf_purch_conv_occ_first_amt_df = pd.DataFrame(get_lar_stats("amount", lar_tables, 
                                                            where=sf_purch_conv_occ_first))
sf_purch_conv_occ_first_amt_df.to_csv("../output/sf_purch_conv_occ_first_amt.csv", sep="|", index=False)


sf_purch_conv_occ_junior_amt_df=pd.DataFrame(get_lar_stats("amount", lar_tables, 
                                                           where=sf_purch_conv_occ_junior))

sf_purch_conv_occ_junior_amt_df.to_csv("../output/sf_purch_conv_occ_junior_amt.csv", sep="|", index=False)

pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec
pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec


In [5]:
sf_purch_conv_occ_first = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='1'"
sf_purch_conv_occ_junior = "WHERE property_type='1' AND purpose='1' AND loan_type='1' AND occupancy='1' AND lien='2'"
inc_na_where = " AND income NOT LIKE '%NA%' AND income not like '%na%' "
sf_purch_conv_occ_first_inc_df = pd.DataFrame(get_lar_stats("income", lar_tables, 
                                                    where=sf_purch_conv_occ_first+inc_na_where))

sf_purch_conv_occ_first_inc_df.to_csv("../output/sf_purch_conv_occ_first_inc.csv", sep="|", index=False)

sf_purch_conv_occ_junior_inc_df=pd.DataFrame(get_lar_stats("income", lar_tables, 
                                                   where=sf_purch_conv_occ_junior + inc_na_where))
sf_purch_conv_occ_junior_inc_df.to_csv("../output/sf_purch_conv_occ_junior_inc.csv", sep="|", index=False)


pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec
pulling metrics for: lar_2004_ffiec
pulling metrics for: lar_2005_ffiec
pulling metrics for: lar_2006_ffiec
pulling metrics for: lar_2007_ffiec
pulling metrics for: lar_2008_ffiec
pulling metrics for: lar_2009_ffiec
pulling metrics for: lar_2010_ffiec
pulling metrics for: lar_2011_ffiec
pulling metrics for: lar_2012_ffiec
pulling metrics for: lar_2013_ffiec
pulling metrics for: lar_2014_ffiec
pulling metrics for: lar_2015_ffiec
pulling metrics for: lar_2016_ffiec


In [6]:
#action frequency, percent, cumulative percent
cur = connect()
def get_action_frequency(table):
    sql = """
    SELECT year, action
    ,(ROUND(COUNT(action) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,(CASE WHEN action='1' THEN COUNT(action) 
    WHEN action='2' THEN COUNT(action)  
    WHEN action='3' THEN COUNT(action) 
    WHEN action='4' THEN COUNT(action) 
    WHEN action='5' THEN COUNT(action) 
    WHEN action='6' THEN COUNT(action) 
    WHEN action='7' THEN COUNT(action)  
    WHEN action='8' THEN COUNT(action) END) AS action_count
    FROM {table}
    GROUP BY year, action
    ORDER BY action""".format(table=table)
    cur.execute(sql)
    colnames = [desc[0] for desc in cur.description]
    data_df = pd.DataFrame(cur.fetchall(), columns=colnames)
    return data_df

first = True
for table in lar_tables:
    action_df = get_action_frequency(table)
    print(action_df.head())
    if first:
        first = False
        action_freq_df = action_df.copy()
    else:
        action_freq_df = pd.concat([action_freq_df, action_df])
action_freq_df.to_csv("../output/action_freq.csv", sep="|", index=False)

   year action freq_pct  action_count
0  2004      1       45      15039576
1  2004      2        7       2386881
2  2004      3       18       6039944
3  2004      4       11       3618966
4  2004      5        3       1065249
   year action freq_pct  action_count
0  2005      1       43      15638296
1  2005      2        6       2367332
2  2005      3       18       6696898
3  2005      4       12       4295338
4  2005      5        3       1194858
   year action freq_pct  action_count
0  2006      1       41      14011262
1  2006      2        7       2411194
2  2006      3       19       6576564
3  2006      4       10       3564168
4  2006      5        3        944686
   year action freq_pct  action_count
0  2007      1       39      10480107
1  2007      2        7       1922006
2  2007      3       22       5962923
3  2007      4        9       2335744
4  2007      5        3        746999
   year action freq_pct  action_count
0  2008      1       41       7234207
1  2008     

In [7]:
#Race/Action matrix counts
cur=connect()
def get_action_race_tab(table):

    sql = """--CREATE EXTENSION tablefunc;
   SELECT * 
    FROM crosstab( 
    'SELECT action, race_1, count(race_1)
    from {table} group by 1,2 order by 1,2')  
    AS final_result(
    "action_type" varchar, 
    "native" bigint, "asian" bigint, "black" bigint, "islander" bigint, "white" bigint, 
    "no_info" bigint,"NA" bigint)""".format(table=table)
    cur.execute(sql)
    colnames = [desc[0] for desc in cur.description]
    data_df = pd.DataFrame(cur.fetchall(), columns=colnames)
    return data_df

first = True
for table in lar_tables:
    year = table[4:8]
    action_race_df = get_action_race_tab(table)
    action_race_df["year"] = year
    print(action_race_df.head())
    if first:
        first = False
        action_race_freq_df = action_race_df.copy()
    else:
        action_race_freq_df = pd.concat([action_race_freq_df, action_race_df])
action_race_freq_df.to_csv("../output/action_race_freq_df.csv", sep="|", index=False)
    

  action_type  native   asian    black  islander     white  no_info      NA  \
0           1  140056  615750  1124835     91750  10934368  1957773  175044   
1           2   26445  104673   245225     16263   1452744   506430   35101   
2           3   86952  158821   768290     38793   3404770  1546070   36248   
3           4   39364   98713   310730     25007   1969340  1148152   27660   
4           5   10482   28601    90611      5250    571880   348372   10053   

   year  
0  2004  
1  2004  
2  2004  
3  2004  
4  2004  
  action_type  native   asian    black  islander     white  no_info     NA  \
0           1  134501  632885  1294412     99835  11530293  1855763  90607   
1           2   24169  108135   255840     17915   1484225   473689   3359   
2           3   86641  195822   921086     45584   3918036  1525795   3934   
3           4   58794  117950   418745     26779   2384891  1280282   7897   
4           5    9840   31847    99027      5910    630546   416815    873 

In [8]:
#Race/Action matrix counts
#Could group by MSA and filter lenders for highest delta from race pattern in MSA

    
sql = """
        --CREATE EXTENSION tablefunc;
        SELECT * 
        FROM crosstab( 
        'SELECT action, race_1, COUNT(race_1)
        FROM {table} GROUP BY 1,2 ORDER BY 1,2')  

    AS final_result(
    "action" varchar, 
    "native" bigint, "asian" bigint, "black" bigint, "islander" bigint, "white" bigint, "no_info" bigint
    ,"NA" bigint)"""

race_action_crosstab_df = compile_dfs(lar_tables, "race_action_crosstab", sql)

  action  native   asian    black  islander     white  no_info      NA  year
0      1  140056  615750  1124835     91750  10934368  1957773  175044  2004
1      2   26445  104673   245225     16263   1452744   506430   35101  2004
2      3   86952  158821   768290     38793   3404770  1546070   36248  2004
3      4   39364   98713   310730     25007   1969340  1148152   27660  2004
4      5   10482   28601    90611      5250    571880   348372   10053  2004
  action  native   asian    black  islander     white  no_info     NA  year
0      1  134501  632885  1294412     99835  11530293  1855763  90607  2005
1      2   24169  108135   255840     17915   1484225   473689   3359  2005
2      3   86641  195822   921086     45584   3918036  1525795   3934  2005
3      4   58794  117950   418745     26779   2384891  1280282   7897  2005
4      5    9840   31847    99027      5910    630546   416815    873  2005
  action  native   asian    black  islander     white  no_info     NA  year
0     

In [9]:
#applicant sex frequency, percent, cumulative percent

sql="""    SELECT year, app_sex
    ,(ROUND(COUNT(app_sex) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(app_sex)
    FROM {table}
    GROUP BY year, app_sex
    ORDER BY app_sex"""

app_sex_count_df = compile_dfs(lar_tables, "app_sex_count", sql)


   year app_sex freq_pct     count
0  2004       1       59  19856343
1  2004       2       27   9015409
2  2004       3        6   2120023
3  2004       4        8   2638699
   year app_sex freq_pct     count
0  2005       1       57  20951405
1  2005       2       28  10180812
2  2005       3        7   2434585
3  2005       4        8   2890434
   year app_sex freq_pct     count
0  2006       1       54  18368256
1  2006       2       28   9522450
2  2006       3        7   2449426
3  2006       4       11   3815228
   year app_sex freq_pct     count
0  2007       1       56  15069150
1  2007       2       28   7499350
2  2007       3        7   1826881
3  2007       4        9   2306711
   year app_sex freq_pct     count
0  2008       1       59  10340093
1  2008       2       27   4760731
2  2008       3        6   1119714
3  2008       4        7   1310702
   year app_sex freq_pct     count
0  2009       1       59  11595453
1  2009       2       24   4728230
2  2009       3     

In [10]:
sql="""  SELECT year, co_app_sex
    ,(ROUND(COUNT(co_app_sex) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(co_app_sex)
    FROM {table}
    GROUP BY year, co_app_sex
    ORDER BY co_app_sex"""
co_app_sex_count_df = compile_dfs(lar_tables, "co_app_sex_count", sql)


   year co_app_sex freq_pct     count
0  2004          1        7   2433713
1  2004          2       32  10731439
2  2004          3        3   1135033
3  2004          4        8   2764622
4  2004          5       49  16565667
   year co_app_sex freq_pct     count
0  2005          1        7   2717919
1  2005          2       29  10435613
2  2005          3        3   1183541
3  2005          4        8   2815036
4  2005          5       53  19305127
   year co_app_sex freq_pct     count
0  2006          1        7   2460981
1  2006          2       25   8552549
2  2006          3        4   1270232
3  2006          4       10   3421646
4  2006          5       54  18449952
   year co_app_sex freq_pct     count
0  2007          1        8   2091662
1  2007          2       28   7433801
2  2007          3        3    933409
3  2007          4        7   1810464
4  2007          5       54  14432756
   year co_app_sex freq_pct    count
0  2008          1        8  1487998
1  2008       

In [11]:
sql = """SELECT year, race_1
    ,(ROUND(COUNT(race_1) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(race_1)
    FROM {table}
    GROUP BY year, race_1
    ORDER BY race_1"""
race_freq_df = compile_dfs(lar_tables, "race_freq", sql)

   year race_1 freq_pct     count
0  2004      1        1    328084
1  2004      2        3   1133243
2  2004      3        8   2764040
3  2004      4        1    196331
4  2004      5       61  20351133
   year race_1 freq_pct     count
0  2005      1        1    343006
1  2005      2        3   1243237
2  2005      3        9   3263277
3  2005      4        1    218775
4  2005      5       62  22461104
   year race_1 freq_pct     count
0  2006      1        1    277430
1  2006      2        3   1100355
2  2006      3       10   3296935
3  2006      4        1    211957
4  2006      5       59  20238822
   year race_1 freq_pct     count
0  2007      1        1    229957
1  2007      2        3    896763
2  2007      3        9   2471941
3  2007      4        1    145498
4  2007      5       64  16964653
   year race_1 freq_pct     count
0  2008      1        1    142367
1  2008      2        4    633542
2  2008      3        8   1410143
3  2008      4        0     84438
4  2008      5

In [12]:
sql = """SELECT year, app_eth
    ,(ROUND(COUNT(app_eth) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(app_eth)
 
    FROM {table}
    WHERE race_1='5'
    GROUP BY year, app_eth
    ORDER BY app_eth"""
eth_freq_df = compile_dfs(lar_tables, "eth_freq", sql)

   year app_eth freq_pct     count
0  2004       1        6   2149601
1  2004       2       47  15871035
2  2004       3        3    905144
3  2004       4        4   1425353
   year app_eth freq_pct     count
0  2005       1        9   3256402
1  2005       2       51  18466941
2  2005       3        2    667167
3  2005       4        0     70594
   year app_eth freq_pct     count
0  2006       1       10   3422984
1  2006       2       48  16370968
2  2006       3        1    406375
3  2006       4        0     38495
   year app_eth freq_pct     count
0  2007       1        9   2493161
1  2007       2       53  14177951
2  2007       3        1    289297
3  2007       4        0      4244
   year app_eth freq_pct     count
0  2008       1        8   1364997
1  2008       2       59  10364588
2  2008       3        1    168244
3  2008       4        0      1492
   year app_eth freq_pct     count
0  2009       1        6   1100150
1  2009       2       62  12120047
2  2009       3     

In [13]:
sql=""" SELECT year, purchaser
    ,(ROUND(COUNT(purchaser) *100.0/ (SELECT COUNT(*) FROM {table}) )) AS freq_pct
    ,COUNT(purchaser)
    FROM {table}
    WHERE loan_type='1'
    GROUP BY year, purchaser
    ORDER BY purchaser"""
purchaser_freq_df = compile_dfs(lar_tables, "purch_freq_conv", sql)

   year purchaser freq_pct     count
0  2004         0       56  18920361
1  2004         1        9   3084973
2  2004         3        6   1870443
3  2004         4        0       227
4  2004         5        2    699346
   year purchaser freq_pct     count
0  2005         0       58  21054927
1  2005         1        7   2490477
2  2005         3        4   1602956
3  2005         4        0       487
4  2005         5        5   1843428
   year purchaser freq_pct     count
0  2006         0       59  20011670
1  2006         1        6   2211935
2  2006         3        4   1358151
3  2006         4        0       269
4  2006         5        5   1740088
   year purchaser freq_pct     count
0  2007         0       60  15947076
1  2007         1        9   2403593
2  2007         3        6   1530757
3  2007         4        0       227
4  2007         5        2    655258
   year purchaser freq_pct    count
0  2008         0       51  8859581
1  2008         1       10  1835136
2  2

In [14]:
#purchaser type for loan_type>1, aciton=1: count, percent
sql = """
    SELECT year, purchaser
    ,(COUNT(purchaser) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    FROM {table}
    WHERE loan_type!='1'
    GROUP BY year, purchaser
    ORDER BY purchaser"""
purchaser_not_conv_df = compile_dfs(lar_tables, "purch_not_conv", sql)

   year purchaser                freq_pct
0  2004         0      1.7896238988484075
1  2004         1  0.01650288961136854628
2  2004         2      2.4720466324679218
3  2004         3  0.00415694408589067166
4  2004         4  0.00119237094309167334
   year purchaser                freq_pct
0  2005         0      1.3133497010031150
1  2005         1  0.01021196450548253301
2  2005         2      1.3299390003125854
3  2005         3  0.00241652987626379575
4  2005         4  0.00070767844276510704
   year purchaser                freq_pct
0  2006         0      1.1793112413395731
1  2006         1  0.00780258208374908067
2  2006         2      1.2723683779061324
3  2006         3  0.00341966824533543198
4  2006         4  0.00093689540968094027
   year purchaser                freq_pct
0  2007         0      2.0459370748928586
1  2007         1  0.01393523773343302090
2  2007         2      1.7640677741654100
3  2007         3  0.00520558464108355255
4  2007         4  0.0005430286136

In [15]:
#inst name for lien=1, loantype=1, propertytype=1, spread>=15: frequency, percent
sql = """
    SELECT year, CONCAT(agency, rid)
    ,(COUNT(CONCAT(agency, rid)) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    ,COUNT(CONCAT(agency, rid))

    FROM {table}
    WHERE loan_type='1' AND lien = '1' AND property_type = '1' AND 
    CAST(rate_spread AS FLOAT) >= 15 AND rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '% %'
    GROUP BY year, CONCAT(agency, rid)"""

rate_spread_conv_df = compile_dfs(lar_tables, "rate_spread_conv_1st", sql)

   year       concat                    freq_pct  count
0  2004  30000011813  0.000011893974494680033353      4
1  2004  50000008323  0.000005946987247340016677      2
2  2004  71738100008      0.00013975420031249039     47
3  2004  30000015728  0.000002973493623670008338      1
4  2004  30000005702  0.000002973493623670008338      1
   year       concat                    freq_pct  count
0  2005  20001993734  0.000013714698503199748878      5
1  2005  756-2148606  0.000002742939700639949776      1
2  2005  20000139843  0.000002742939700639949776      1
3  2005  20000659855  0.000005485879401279899551      2
4  2005  10000017927  0.000010971758802559799103      4
   year       concat                    freq_pct  count
0  2006  77271100003  0.000008783394465758815015      3
1  2006  701-0550261  0.000017566788931517630029      6
2  2006  50000006160  0.000008783394465758815015      3
3  2006  77479600006  0.000002927798155252938338      1
4  2006  20001445943  0.000002927798155252938338

In [16]:
#inst name for lien=1, loantype=1, propertytype=2, spread>=15: frequency, percent
sql = """ SELECT year, CONCAT(agency, rid)
    ,(COUNT(CONCAT(agency, rid)) *100.0/ (SELECT COUNT(*) FROM {table}) ) AS freq_pct
    ,COUNT(CONCAT(agency, rid))

    FROM {table}
    WHERE loan_type='1' AND lien = '1' AND property_type = '2' AND CAST(rate_spread AS FLOAT) >= 15 
    AND rate_spread NOT LIKE '%NA%'AND rate_spread NOT LIKE '% %'
    GROUP BY year, CONCAT(agency, rid)"""
rate_spread_manu_df = compile_dfs(lar_tables, "rate_spread_manu_1st", sql)

   year       concat                    freq_pct  count
0  2004  10000023892  0.000002973493623670008338      1
1  2004  30000020100  0.000002973493623670008338      1
2  2004  30000000057      0.00010109878320478028     34
3  2004  30000010320  0.000002973493623670008338      1
4  2004  30000013881  0.000002973493623670008338      1
   year       concat                    freq_pct  count
0  2005  20001993734  0.000005485879401279899551      2
1  2005  30000008801  0.000002742939700639949776      1
2  2005  10000007899  0.000002742939700639949776      1
3  2005  71928200004  0.000002742939700639949776      1
4  2005  10000017927  0.000008228819101919849327      3
   year       concat                    freq_pct  count
0  2006  30000022094  0.000005855596310505876676      2
1  2006  30000016114  0.000002927798155252938338      1
2  2006  30000000384  0.000002927798155252938338      1
3  2006  338-3589255  0.000002927798155252938338      1
4  2006  20000397755  0.000002927798155252938338

In [17]:
sql = """
    SELECT year, denial_1
    ,(COUNT(denial_1)) *100.0/ (SELECT COUNT(*) FROM {table} WHERE action='3' AND loan_type='1') AS freq_pct
    FROM {table}
    WHERE loan_type='1' AND action='3'
    GROUP BY year, denial_1
    ORDER BY denial_1
"""
denial_freq_df = compile_dfs(lar_tables, "denial_freq", sql)

   year denial_1                freq_pct
0  2004              20.2742955769659419
1  2004        1     10.3221666973456415
2  2004        2  0.82150404851784792639
3  2004        3     23.7880286610304764
4  2004        4      9.0782293423931950
   year denial_1                freq_pct
0  2005              28.4504426610811403
1  2005        1      8.6761571903503538
2  2005        2  0.80318326290536673283
3  2005        3     19.0951173325864316
4  2005        4      8.7890753437168918
   year denial_1                freq_pct
0  2006              36.6788375934671176
1  2006        1      9.4056996570874119
2  2006        2  0.81936399507552399909
3  2006        3     18.0697731046754366
4  2006        4      9.7391290088331783
   year denial_1                freq_pct
0  2007              35.8031753848714899
1  2007        1     10.9454538518361812
2  2007        2  0.78092757583849790007
3  2007        3     17.9463347471587658
4  2007        4     12.3084139375674110
   year denial_1

In [18]:
#denial1 if loantype=2, action=3: freq, percent, cumulative percent
sql = """
SELECT year, denial_1
    ,COUNT(denial_1) *100.0/ 
    (SELECT COUNT(*) FROM {table} WHERE action='3' AND loan_type='1') AS freq_pct
    FROM {table}
    WHERE loan_type='2' AND action='3'
    GROUP BY year, denial_1
    ORDER by denial_1
"""
denial_loan_type_2_freq_df = compile_dfs(lar_tables, "denial_loan_type_2", sql)

   year denial_1                freq_pct
0  2004               1.0886856755593739
1  2004        1  1.00392182734063921737
2  2004        2  0.11455114393711082223
3  2004        3      2.1608783853686403
4  2004        4  0.31081743636339101373
   year denial_1                freq_pct
0  2005           0.87902509579664273187
1  2005        1  0.69590319728635736858
2  2005        2  0.08701288900284028862
3  2005        3      1.5546369473419587
4  2005        4  0.27833129170408820638
   year denial_1                freq_pct
0  2006               1.0321597838521870
1  2006        1  0.75872640318903987449
2  2006        2  0.07351864429820681479
3  2006        3      1.4937129314347431
4  2006        4  0.32086315186572926647
   year denial_1                freq_pct
0  2007               2.3340545257447699
1  2007        1      1.4639256357163670
2  2007        2  0.13379293731223633120
3  2007        3      2.5950931927532907
4  2007        4  0.78701433838467893443
   year denial_1

In [19]:
#denial1 if loanpurpose=3, aciton=3: freq, percent, cumulative percent

sql = """  SELECT year, denial_1
    ,COUNT(denial_1) *100.0/ 
    (SELECT COUNT(*) FROM {table} WHERE action='3' AND loan_type='1') AS freq_pct
    ,COUNT(*)
    FROM {table}
    WHERE purpose='3' AND action='3'
    GROUP BY year, denial_1
    ORDER BY denial_1
    """
denial_purpose_3_df = compile_dfs(lar_tables, "denial_purpose_3", sql)

   year denial_1             freq_pct   count
0  2004           34.0812643407586664  681916
1  2004        1  17.9314022569374162  358781
2  2004        2   1.1441120362165536   22892
3  2004        3  42.2879641832758329  846120
4  2004        4  18.5710294559370429  371579
   year denial_1             freq_pct    count
0  2005           55.9461389717285578  1119400
1  2005        1  15.6092926366904515   312319
2  2005        2   1.1273691770459899    22557
3  2005        3  33.9287293969122169   678864
4  2005        4  18.0437043600904214   361028
   year denial_1             freq_pct    count
0  2006           69.5358429629762906  1391310
1  2006        1  17.4325650110227988   348800
2  2006        2   1.0774404716388460    21558
3  2006        3  31.3207417036633876   626682
4  2006        4  20.9793023275572968   419765
   year denial_1             freq_pct    count
0  2007           71.4957570596140746  1430525
1  2007        1  19.3089647265441289   386344
2  2007        2   

In [20]:
# frequency by state
sql = """
SELECT year, state, count(*)
FROM {table}
GROUP BY year, state
ORDER BY count(*) DESC"""
state_freq = compile_dfs(lar_tables, "state_freq", sql)

   year state    count
0  2004    06  5348035
1  2004    12  2411564
2  2004    48  2096810
3  2004    17  1384136
4  2004    36  1313759
   year state    count
0  2005    06  5511038
1  2005    12  3007029
2  2005    48  2145151
3  2005    17  1467571
4  2005    36  1340588
   year state    count
0  2006    06  4919273
1  2006    12  2912471
2  2006    48  2077073
3  2006    17  1386588
4  2006    36  1298124
   year state    count
0  2007    06  3433324
1  2007    12  2009944
2  2007    48  1727705
3  2007    17  1123171
4  2007    36  1011325
   year state    count
0  2008    06  1888396
1  2008    48  1212874
2  2008    12   969544
3  2008    17   767739
4  2008    42   717445
   year state    count
0  2009    06  2198859
1  2009    48  1249002
2  2009    17   852154
3  2009    12   809914
4  2009    42   770928
   year state    count
0  2010    06  2030460
1  2010    48  1095372
2  2010    17   736745
3  2010    12   683767
4  2010    42   676945
   year state    count
0  2011    

In [21]:
#institutions with >10 single family, site-built, first lien, owner occupied, w/ rate spread >6.5 (hoepa)
sql = """
   SELECT year, CONCAT(agency, rid) AS ARID, COUNT(*)
   FROM {table}
   WHERE occupancy = '1' AND property_type='1' AND lien='1' AND CAST(rate_spread AS FLOAT) >6.5 
   AND rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '% %'
   
   GROUP BY year, CONCAT(agency,rid)
   HAVING COUNT(*) > 10
   ORDER BY COUNT(*) DESC
"""
hoepa_first_df = compile_dfs(lar_tables, "hoepa_first_occ_single_fam", sql)

   year         arid  count
0  2004  77756600001   3370
1  2004  152-2113031   2227
2  2004  20001644643   2163
3  2004  775-2921540   2018
4  2004  71958100003   1651
   year         arid  count
0  2005  775-2921540  14665
1  2005  20001644643  14325
2  2005  77604800006  13096
3  2005  77431100008   9671
4  2005  77756600001   8369
   year         arid  count
0  2006  20001644643  45145
1  2006  30000025653  34533
2  2006  70458600405  29703
3  2006  77604800006  25130
4  2006  20003197134  23347
   year         arid  count
0  2007  20001644643  22293
1  2007  10000023160  10814
2  2007  40510528989   9742
3  2007  20001881185   8941
4  2007  20002752527   8097
   year         arid  count
0  2008  20001881185   6611
1  2008  20002751986   5244
2  2008  10000001741   4012
3  2008  20002817118   3678
4  2008  20003197956   3478
   year         arid  count
0  2009  20002817118   2170
1  2009  10000001741   2094
2  2009  20002751986   1431
3  2009  20002751810   1116
4  2009  20001881185

In [22]:
#institutions with >10 single family, site-built, first (junior?) lien, owner occupied, w/ rate spread >8.5 (hoepa)
sql = """  SELECT year, CONCAT(agency, rid) AS ARID, COUNT(*)
   FROM {table}
   WHERE occupancy = '1' AND property_type='1' AND lien='2' 
   AND CAST(rate_spread AS FLOAT) >8.5 AND rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '% %'
   
   GROUP BY year, CONCAT(agency,rid)
   HAVING COUNT(*) > 10
   ORDER BY COUNT(*) DESC"""

hoepa_junior_df = compile_dfs(lar_tables, "hoepa_junior_occ_single_fam", sql)

   year         arid  count
0  2004  751-0003820   6027
1  2004  20002751986   4616
2  2004  736-1239445   4085
3  2004  723-2425397   2556
4  2004  77177000002   1960
   year         arid  count
0  2005  20001881185   8715
1  2005  20001993734   6201
2  2005  775-2921540   4803
3  2005  20002751986   4032
4  2005  77699300007   1697
   year         arid  count
0  2006  20001881185   8311
1  2006  20001993734   6020
2  2006  77699300007   4476
3  2006  20002751986   4064
4  2006  77604800006   3649
   year         arid  count
0  2007  20001881185   9308
1  2007  20003197956   6835
2  2007  20002751986   4649
3  2007  120-2096530   2114
4  2007  40510528989   1981
   year         arid  count
0  2008  20002751986   3490
1  2008  20001881185    947
2  2008  20000902270    826
3  2008  20000860473    710
4  2008  20003197956    568
   year         arid  count
0  2009  40000013964    498
1  2009  20002751986    207
2  2009  795-4390985    162
3  2009  20000902270    133
4  2009  10000001741

In [28]:
#agency, rid, name, prior year app count, count state=00 for institutions w >=500 apps with state=00 
#in the last year submission
sql = """
DROP TABLE IF EXISTS state_counts;
CREATE TEMP TABLE state_counts(
	arid TEXT
	,state_rows INTEGER)
ON COMMIT DROP;

INSERT INTO state_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table2}
WHERE state='00'
GROUP BY CONCAT(agency, rid)
HAVING count(*) >=500
ORDER BY count(*);

SELECT arid, state_rows, count(l2.*) AS app_count
FROM state_counts l2
LEFT JOIN {table} l1
ON l2.arid = CONCAT(l1.agency, l1.rid)
GROUP BY arid, state_rows"""

state_00_df = compile_dfs(lar_tables[1:], "state_00", sql, table2=True)

Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []
Empty DataFrame
Columns: [arid, state_rows, app_count, year]
Index: []


In [17]:
#institutions with >=250 applications with action<=3 with all applications having a single race
sql = """
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS race_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=250;

CREATE TEMP TABLE race_counts(
	arid TEXT
	,race_1 VARCHAR
	,race_rows INTEGER)
ON COMMIT DROP;

INSERT INTO race_counts
SELECT
	CONCAT(agency, rid)
	,race_1
	,COUNT(race_1)
FROM 
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid), race_1;

SELECT race_counts.arid, race_1, race_rows, lender_rows
FROM race_counts
INNER JOIN lar_counts ON race_counts.arid = lar_counts.arid 
WHERE (CASE WHEN race_rows=lender_rows THEN 1 ELSE 0 END) = 1
GROUP BY race_counts.arid, race_1, race_rows, lender_rows
ORDER BY race_counts.arid DESC"""

single_race_df = compile_dfs(lar_tables, "single_race", sql, table2=None)

          arid race_1  race_rows  lender_rows  year
0  795-4537645      7        286          286  2004
1  77428000003      6        692          692  2004
2  77307200009      6       2345         2345  2004
3  77086400004      6      10403        10403  2004
4  71841100007      6        272          272  2004
          arid race_1  race_rows  lender_rows  year
0  795-4607423      6       2657         2657  2005
1  795-4537645      7        312          312  2005
2  77428000003      6        512          512  2005
3  765-1113581      6        307          307  2005
4  50000008486      5        491          491  2005
          arid race_1  race_rows  lender_rows  year
0  795-4537645      7        302          302  2006
1  77756300009      6       4074         4074  2006
2  765-1113581      6        345          345  2006
3  752-1815063      6        965          965  2006
4  71557900001      6      60908        60908  2006
          arid race_1  race_rows  lender_rows  year
0  777563000

In [25]:
#institutions with >=250 applications with action<=3 with all applications having a single ethnicity

sql="""
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS eth_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=250;

CREATE TEMP TABLE eth_counts(
	arid TEXT
	,app_eth VARCHAR
	,eth_rows INTEGER)
ON COMMIT DROP;
INSERT INTO eth_counts
SELECT
	CONCAT(agency, rid)
	,app_eth
	,COUNT(app_eth)
FROM 
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid), app_eth;

SELECT eth_counts.arid, eth_rows, app_eth, lender_rows
FROM eth_counts
INNER JOIN lar_counts ON eth_counts.arid = lar_counts.arid 
WHERE (CASE WHEN eth_rows=lender_rows THEN 1 ELSE 0 END) = 1
ORDER BY eth_counts.arid DESC"""


single_eth_df =  compile_dfs(lar_tables, "single_ethnicity", sql, table2=None)

          arid  eth_rows app_eth  lender_rows  year
0  795-4537645       286       4          286  2004
1  77428000003       692       3          692  2004
2  77086400004     10403       3        10403  2004
3  766-0559119       365       1          365  2004
4  758-2509828      2354       2         2354  2004
          arid  eth_rows app_eth  lender_rows  year
0  79900000375       447       2          447  2005
1  795-4607423      2657       3         2657  2005
2  795-4537645       312       4          312  2005
3  77428000003       512       3          512  2005
4  77339400008       370       2          370  2005
          arid  eth_rows app_eth  lender_rows  year
0  79900000375       277       2          277  2006
1  795-4537645       302       4          302  2006
2  77248700009       645       1          645  2006
3  766-0188856       573       1          573  2006
4  765-1113581       345       3          345  2006
          arid  eth_rows app_eth  lender_rows  year
0  799000003

In [35]:
#institutions with >=50 applications with action<=3 with all applications having a single sex
sql="""
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS sex_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=50;

CREATE TEMP TABLE sex_counts(
	arid TEXT
	,app_sex VARCHAR
	,sex_rows INTEGER)
ON COMMIT DROP;
INSERT INTO sex_counts
SELECT
	CONCAT(agency, rid)
	,app_sex
	,COUNT(app_sex)
FROM 
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid), app_sex;

SELECT sex_counts.arid, sex_rows, app_sex, lender_rows
FROM sex_counts
INNER JOIN lar_counts ON sex_counts.arid = lar_counts.arid 
WHERE (CASE WHEN sex_rows=lender_rows THEN 1 ELSE 0 END) = 1
GROUP BY sex_counts.arid, app_sex, sex_rows, lender_rows
ORDER BY sex_counts.arid DESC
"""
#print(sql.format(table=lar_tables[-1:]))
single_sex_df = compile_dfs(lar_tables, "single_sex", sql, table2=None)

          arid  sex_rows app_sex  lender_rows  year
0  795-4537645       286       4          286  2004
1  77428000003       692       3          692  2004
2  71723400002       166       4          166  2004
3  71589900007       824       3          824  2004
4  70495100001        66       4           66  2004
          arid  sex_rows app_sex  lender_rows  year
0  795-4537645       312       4          312  2005
1  77428000003       512       3          512  2005
2  71723400002       145       4          145  2005
3  70495100001        97       4           97  2005
4  313-3399559        58       4           58  2005
          arid  sex_rows app_sex  lender_rows  year
0  795-4537645       302       4          302  2006
1  71723400002       131       4          131  2006
2  70495100001       166       4          166  2006
3  313-3399559        51       4           51  2006
4  30000016022       366       4          366  2006
          arid  sex_rows app_sex  lender_rows  year
0  717234000

In [36]:
#institutions with >=50 applications with action<=3 with all applications having a single co-app sex or no co-app
sql="""
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS sex_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=50;

CREATE TEMP TABLE sex_counts(
	arid TEXT
	,co_app_sex VARCHAR
	,co_sex_rows INTEGER)
ON COMMIT DROP;
INSERT INTO sex_counts
SELECT
	CONCAT(agency, rid)
	,co_app_sex
	,COUNT(co_app_sex)
FROM 
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid), co_app_sex;

SELECT sex_counts.arid, co_sex_rows, co_app_sex, lender_rows
FROM sex_counts
INNER JOIN lar_counts ON sex_counts.arid = lar_counts.arid 
WHERE (CASE WHEN co_sex_rows=lender_rows THEN 1 ELSE 0 END) = 1
ORDER BY sex_counts.arid DESC"""
single_cosex_df = compile_dfs(lar_tables, "single_cosex", sql, table2=None)

          arid  co_sex_rows co_app_sex  lender_rows  year
0  795-4830942          550          5          550  2004
1  795-4537645          286          4          286  2004
2  77428000003          692          5          692  2004
3  71723400002          166          4          166  2004
4  70495100001           66          4           66  2004
          arid  co_sex_rows co_app_sex  lender_rows  year
0  795-4537645          312          4          312  2005
1  77428000003          512          5          512  2005
2  77307200009         1269          5         1269  2005
3  71723400002          145          5          145  2005
4  70495100001           97          4           97  2005
          arid  co_sex_rows co_app_sex  lender_rows  year
0  795-4659941         1201          5         1201  2006
1  795-4537645          302          4          302  2006
2  794-3306143          149          5          149  2006
3  775-2958981         2706          5         2706  2006
4  734-1180767

In [39]:
#institutions with >=50 applications with action<=3 with >50% applications to same sex couples
sql="""
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS sex_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action IN ('1', '2', '3')
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=50;

DROP TABLE IF EXISTS sex_count;
CREATE TEMP TABLE sex_counts(
	arid TEXT
	,same_sex_count INTEGER)
ON COMMIT DROP;
INSERT INTO sex_counts
SELECT 
	CONCAT(agency, rid)
	,(CASE WHEN app_sex = co_app_sex THEN COUNT(app_sex) ELSE 0 END)
FROM 
	{table}
WHERE action IN ('1', '2', '3') AND app_sex IN ('1', '2') AND co_app_sex IN ('1', '2')
GROUP BY CONCAT(agency,rid), app_sex, co_app_sex;

SELECT lar_counts.arid, lender_rows, same_sex_count, (sex_counts.same_sex_count *1.0 / lender_rows)*100.0 AS same_sex_pct
FROM lar_counts 
INNER JOIN sex_counts
ON lar_counts.arid = sex_counts.arid
WHERE (sex_counts.same_sex_count *1.0 / lender_rows)*100.0 >50
"""
same_sex_df = compile_dfs(lar_tables, "same_sex_50_pct", sql, table2=None)

          arid  lender_rows  same_sex_count              same_sex_pct  year
0  10000014254          252             152  60.317460317460317460000  2004
          arid  lender_rows  same_sex_count              same_sex_pct  year
0  10000014254          177             100  56.497175141242937853000  2005
Empty DataFrame
Columns: [arid, lender_rows, same_sex_count, same_sex_pct, year]
Index: []
          arid  lender_rows  same_sex_count              same_sex_pct  year
0  10000014254          182              96  52.747252747252747253000  2007
          arid  lender_rows  same_sex_count              same_sex_pct  year
0  10000014254          212             112  52.830188679245283019000  2008
          arid  lender_rows  same_sex_count              same_sex_pct  year
0  10000014254          212             115  54.245283018867924528000  2009
Empty DataFrame
Columns: [arid, lender_rows, same_sex_count, same_sex_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, same_sex_coun

In [44]:
#institions with >=100 originations, where >=75% are home purchase or refi 
#and >75% are junior liens or unsecured
sql="""
DROP TABLE IF EXISTS lar_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=100;

DROP TABLE IF EXISTS type_counts;
CREATE TEMP TABLE type_counts(
	arid TEXT
	,purch_refi_count INTEGER)
ON COMMIT DROP;
INSERT INTO type_counts
SELECT 
	DISTINCT CONCAT(agency, rid)
	,(CASE WHEN purpose IN ('1', '3') THEN COUNT(purpose) ELSE 0 END)
FROM 
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency,rid), purpose, lien;

DROP TABLE IF EXISTS junior_counts;
CREATE TEMP TABLE junior_counts(
	arid TEXT
	,junior_count INTEGER)
ON COMMIT DROP;
INSERT INTO junior_counts
SELECT 
	DISTINCT CONCAT(agency, rid)
	,(CASE WHEN lien !='1' THEN COUNT(lien) ELSE 0 END)
FROM 
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency,rid), purpose, lien;

SELECT lar_counts.arid, lender_rows, (purch_refi_count*1.0/lender_rows)*100 AS purch_pct, (junior_count*1.0/lender_rows)*100 AS junior_pct
FROM lar_counts 
INNER JOIN type_counts
ON lar_counts.arid = type_counts.arid
INNER JOIN junior_counts
ON lar_counts.arid = junior_counts.arid
WHERE (purch_refi_count*1.0/lender_rows)*10 >75 AND (junior_count*1.0/lender_rows)*100>75
ORDER BY (purch_refi_count*1.0/lender_rows)*100, (junior_count*1.0/lender_rows)*100 DESC"""
junior_or_purch_df = compile_dfs(lar_tables, "junior_or_purch", sql, table2=None)

Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, purch_pct, junior_pct, year]
In

In [47]:
#institutions with >=25 loans and >=50% of loans are over 10 million
sql="""
DROP TABLE IF EXISTS type_counts;
DROP TABLE IF EXISTS lar_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=25;

DROP TABLE IF EXISTS type_count;
CREATE TEMP TABLE type_counts(
	arid TEXT
	,amount_count INTEGER)
ON COMMIT DROP;
INSERT INTO type_counts
SELECT 
	DISTINCT CONCAT(agency, rid)
	,(CASE WHEN CAST(amount AS INTEGER)>10000 THEN COUNT(amount) ELSE 0 END)
FROM 
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency,rid), amount;

SELECT lar_counts.arid, lender_rows, amount_count, (amount_count*1.0/lender_rows)*100 AS larg_pct
FROM lar_counts 
INNER JOIN type_counts
ON lar_counts.arid = type_counts.arid
WHERE (amount_count*1.0/lender_rows)*100 >50
ORDER BY (amount_count*1.0/lender_rows)*100 DESC"""

ten_mil_df = compile_dfs(lar_tables, "ten_mil", sql, table2=None)

Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_pct, year]
Index: []
Empty DataFrame
Columns: [arid, lender_rows, amount_count, larg_p

In [50]:
#institutions with >=50 home purchase, first-lien owner-occupied loans 
#where income is NA in >25% of loans
sql="""
DROP TABLE IF EXISTS lar_counts;
DROP TABLE IF EXISTS type_counts;
CREATE TEMP TABLE lar_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lar_counts
SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action = '1' AND purpose='1' AND lien='1' AND occupancy='1'
GROUP BY CONCAT(agency, rid)
HAVING COUNT(*) >=50;

DROP TABLE IF EXISTS type_count;
CREATE TEMP TABLE type_counts(
	arid TEXT
	,na_income_count INTEGER)
ON COMMIT DROP;
INSERT INTO type_counts
SELECT 
	DISTINCT CONCAT(agency, rid)
	,(CASE WHEN income LIKE '%NA%' THEN COUNT(income) ELSE 0 END)
FROM 
	{table}
WHERE action = '1' AND purpose='1' AND lien='1' AND occupancy='1'
GROUP BY CONCAT(agency,rid), income;

SELECT lar_counts.arid, lender_rows, na_income_count, (na_income_count*1.0/lender_rows)*100 AS na_pct
FROM lar_counts 
INNER JOIN type_counts
ON lar_counts.arid = type_counts.arid
WHERE (na_income_count*1.0/lender_rows)*100 >50
ORDER BY (na_income_count*1.0/lender_rows)*100 DESC"""
na_income_df = compile_dfs(lar_tables, "na_income", sql, table2=None)

          arid  lender_rows  na_income_count                    na_pct  year
0  774-2832600           93               93  100.00000000000000000000  2004
1  765-0367752          257              257  100.00000000000000000000  2004
2  77343000001         3397             3395   99.94112452163673829800  2004
3  71665100001         1532             1412   92.16710182767624020900  2004
4  77935900009         1199              971   80.98415346121768140100  2004
          arid  lender_rows  na_income_count                    na_pct  year
0  765-0367752          243              243  100.00000000000000000000  2005
1  77428000003          409              409  100.00000000000000000000  2005
2  77951500006          671              656   97.76453055141579731700  2005
3  71665100001         1491             1396   93.62843729040912139500  2005
4  71045600000         2486             2253   90.62751407884151247000  2005
          arid  lender_rows  na_income_count                    na_pct  year

In [55]:
#institutions with >=50 home purchase, first-lien owner-occupied loans 
#where income >=1 million in >50% of loans
sql = """
DROP TABLE IF EXISTS income_mil;
CREATE TEMP TABLE income_mil(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO income_mil
SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE CAST(income AS INTEGER) >=1000 AND income NOT LIKE '%NA%' AND income NOT LIKE '%na%' 
AND action='1'
GROUP BY CONCAT(agency, rid);

SELECT 
	agency, rid, income_mil.lender_rows, 
    count(*),(income_mil.lender_rows*1.0 / (SELECT COUNT(*) FROM {table}))*100
FROM {table} lar
LEFT JOIN income_mil ON CONCAT(lar.agency, lar.rid) = income_mil.arid
WHERE purpose='1' AND lien='1' AND occupancy='1' 
GROUP BY agency, rid, lender_rows
HAVING income_mil.lender_rows / (SELECT COUNT(*) FROM {table} ) > .5;"""
income_1_mil_df = compile_dfs(lar_tables, "income_1_mil", sql, table2=None)


Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?column?, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, count, ?colum

In [58]:
#institutions with >=50 home purchase, first-lien owner-occupied loans where income < 10k in >50% of loans
sql="""--#institutions with >=50 home purchase, first-lien owner-occupied loans where income < 10k in >50% of loans
DROP TABLE IF EXISTS lender_counts;
CREATE TEMP TABLE lender_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lender_counts
SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
FROM
	{table}
WHERE action = '1'
GROUP BY CONCAT(agency, rid);

SELECT DISTINCT
	agency, rid, lender_rows, (count(*)*1.0 / lender_rows)*100 as pct_low_inc
FROM {table} lar
LEFT JOIN lender_counts ON lender_counts.arid = CONCAT(lar.agency, lar.rid)
WHERE purpose = '1' AND lien = '1' AND occupancy = '1' AND 
CAST(income AS INTEGER) <10 AND income NOT LIKE '%NA%' AND income NOT LIKE '%na%'
GROUP BY agency, rid, lender_rows
HAVING COUNT(*) >= 50 AND (count(*)*1.0 / lender_rows)*100 > 50;"""

income_10k_df = compile_dfs(lar_tables, "income_10k", sql, table2=None)

Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
  agency         rid  lender_rows              pct_low_inc  year
0      7  7162800002         1272  94.88993710691823899400  2005
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_inc, year]
Index: []
Empty DataFrame
Columns: [agency, rid, lender_rows, pct_low_

In [65]:
#institutions with >=100 FHA first-lien home purchase loans and none that are higher-priced
#
sql = """SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*) AS loan_count
	,(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT)>1.5 THEN COUNT(*) ELSE 0 END) AS high_price_count
FROM
	{table}
WHERE action = '1' AND loan_type = '2' AND lien = '1' AND purpose = '1' 
GROUP BY CONCAT(agency, rid), rate_spread
HAVING COUNT(*) >=100 AND 
(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT)>1.5 THEN COUNT(*) ELSE 0 END) <1;"""
fha_high_priced_df = compile_dfs(lar_tables, "fha_high_priced", sql, table2=None)

        concat  loan_count  high_price_count  year
0  77196900005         209                 0  2004
1  77692500004         104                 0  2004
2  77479600006         310                 0  2004
3  184-1106939         976                 0  2004
4  71474600000        1707                 0  2004
        concat  loan_count  high_price_count  year
0  77257500009         590                 0  2005
1  30000032489         197                 0  2005
2  10000024444         258                 0  2005
3  20003340967        1722                 0  2005
4  77184500000         980                 0  2005
        concat  loan_count  high_price_count  year
0  73028209994         344                 0  2006
1  77868600006         139                 0  2006
2  77840400007         146                 0  2006
3  77638200000         220                 0  2006
4  77184500000         523                 0  2006
        concat  loan_count  high_price_count  year
0  77765600003         168     

In [68]:
#institutions with >=100 manufactured loans and none that are higher-priced
sql = """SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
	,(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%'
    AND CAST(rate_spread AS FLOAT) >1.5 THEN COUNT(*) ELSE 0 END) AS first_lien_high
	,(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%'
    AND CAST(rate_spread AS FLOAT) >3.5 THEN COUNT(*) ELSE 0 END) AS junior_lien_high
FROM
	{table}
WHERE action = '1' AND property_type='2'
GROUP BY CONCAT(agency, rid), rate_spread
HAVING COUNT(*) >=100 AND 
((CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%'
AND CAST(rate_spread AS FLOAT) >1.5 THEN COUNT(*) ELSE 0 END) <1 
OR 
(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
AND CAST(rate_spread AS FLOAT) >3.5 THEN COUNT(*) ELSE 0 END)<1 );"""
manufactured_high_priced_df = compile_dfs(lar_tables, "manu_high_priced", sql, table2=None)

        concat  count  first_lien_high  junior_lien_high  year
0  72007800007    170                0                 0  2004
1  77952900006   1015                0                 0  2004
2  20001072246    527                0                 0  2004
3  10000007745    647                0                 0  2004
4  40000007101    200                0                 0  2004
        concat  count  first_lien_high  junior_lien_high  year
0  723-2425397    102                0                 0  2005
1  73833009998    117                0                 0  2005
2  10000015108    125                0                 0  2005
3  40000004410    409                0                 0  2005
4  77499100008   3006                0                 0  2005
        concat  count  first_lien_high  junior_lien_high  year
0  184-1106939    107                0                 0  2006
1  71556900003    772                0                 0  2006
2  20000542649    173                0                 

In [71]:
#institutions with at least 100 loans and >50% are higher priced
sql = """--institutions with at least 100 loans and >50% are higher priced
DROP TABLE IF EXISTS lender_counts;
CREATE TEMP TABLE lender_counts(
	arid TEXT
	,lender_rows INTEGER)
ON COMMIT DROP;

INSERT INTO lender_counts
SELECT
	DISTINCT CONCAT(agency, rid)
	,COUNT(*)
FROM {table}
WHERE action = '1'
GROUP BY agency, rid;

SELECT
	DISTINCT CONCAT(agency, rid)
	,lender_rows
	,(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >1.5 AND lien = '1' THEN COUNT(*) ELSE 0 END) AS first_lien_high
	,(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >3.5 AND lien = '2' THEN COUNT(*) ELSE 0 END) AS junior_lien_high
	,(((CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >1.5 THEN COUNT(*) ELSE 0 END) + 
	(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >3.5 AND lien = '2' THEN COUNT(*) ELSE 0 END))*1.0 
    / lender_rows) *100 AS high_priced_pct
FROM
	{table}
LEFT JOIN lender_counts ON lender_counts.arid = CONCAT(agency, rid)
WHERE action = '1' AND rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' AND lender_rows >=100
GROUP BY CONCAT(agency, rid), rate_spread, lien, lender_rows
HAVING ((((CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >1.5 THEN COUNT(*) ELSE 0 END) + 
	(CASE WHEN rate_spread NOT LIKE '%NA%' AND rate_spread NOT LIKE '%na%' 
    AND CAST(rate_spread AS FLOAT) >3.5 AND lien = '2' 
    THEN COUNT(*) ELSE 0 END))*1.0 / lender_rows) *100) > 50
"""
half_high_priced = compile_dfs(lar_tables, "half_high_priced", sql, table2=None)

        concat  lender_rows  first_lien_high  junior_lien_high  \
0  50000006912          219                0                96   
1  50000022637          103              103                 0   
2  50000068239          257                0               133   
3  77059200001          229              220                 0   
4  77987800001          482              480                 0   

            high_priced_pct  year  
0   87.67123287671232876700  2004  
1  100.00000000000000000000  2004  
2      103.5019455252918300  2004  
3   96.06986899563318777300  2004  
4   99.58506224066390041500  2004  
        concat  lender_rows  first_lien_high  junior_lien_high  \
0  50000019269          161                0               119   
1  50000061387          152                0                42   
2  70828200015          357              357                 0   
3  77059200001          115              115                 0   

            high_priced_pct  year  
0      147.826086956

KeyboardInterrupt: 

In [None]:
#Amount for all products together
amount_all_df = get_lar_stats("amount", lar_tables)
amount_all_df.to_csv("../output/amounts_all.csv", sep="|", index=False)

In [None]:
#amount for multifamily
multi_where = """WHERE property_type = '3' """
amount_multi_df = get_lar_stats("amount", lar_tables, where=multi_where)
amount_multi_df.to_csv("../output/amount_multi.csv", sep="|", index=False)

In [None]:
#amount for manufactured
manu_where = """WHERE property_type = '2' """
amount_manu_df = get_lar_stats("amount", lar_tables, where=manu_where)
amount_manu.to_csv("../output/amount_manufactured.csv", sep="|", index=False)

In [None]:
#amount for all single family
single_where = """WHERE property_type = '1' """
amount_single_df = get_lar_stats("amount", lar_tables, where=single_where)
amount_single_df.to_csv("../output/amount_single.csv", sep="|", index=False)

In [None]:
#income for all products together
income_all_df = get_lar_stats("income", lar_tables)
income_all_df.to_csv("../output/income_all.csv", sep="|", index=False)


In [None]:
#income for multifamily
multi_where = """WHERE property_type = '3' """
income_multi_df = get_lar_stats("income", lar_tables, where=multi_where)
income_multi_df.to_csv("../output/income_multi.csv", sep="|", index=False)

In [None]:
#income for manufactured
manu_where = """WHERE property_type = '2' """
income_manu_df = get_lar_stats("income", lar_tables, where=manu_where)
income_manu_df 

In [None]:
#income for all single family
single_where = """WHERE property_type = '1' """
income_single_df = get_lar_stats("income", lar_tables, where=single_where)
income_single_df.to_csv("../output/income_single_fam.csv", sep="|", index=False)

In [None]:
#sort HMDA data by census tract to determine the geographic dispersion of lending among the various census tracts in the institution's assessment area, including low- and moderate-income tracts.
#https://philadelphiafed.org/-/media/community-development/data-dashboard/pdfs/hmda-documentation.pdf?la=en

In [None]:
#ethnicity if race=5: frequency, percent, cumulative percent
#income for action=1: percentiles, mean, stdev, skew, kurtosis
#income for action=3: percentiles, mean, stdev, skew, kurtosis
#income for action=1, loan_type=1: percentiles, mean, stdev, skew, kurtosis
#income for action=1, loan_type=2: percentiles, mean, stdev, skew, kurtosis