In [55]:
##############
#K. David Roell CFPB 7/12/16
#expands data selection for quality edits to include context for developing statistical approaches
#will attemtp to segment mortgage market participants by loan activity by:
#volume/count of lending, income of applicants, securitization, property type and other factors
##############

import json
import os
import pandas as pd
import psycopg2
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
get_ipython().magic(u'matplotlib inline')
matplotlib.style.use('ggplot')
#from macro_sql import Q076 
with open('quality_sql.json') as f: #FIXME change to quality_sql.json and refactor the rest of code
    edit_sql = json.load(f)
    
#parameter format for local use #consider changing hmdamaster to roellk if db changes cause a fail
params = {
'dbname':'hmdamaster',
'user':'roellk',
'password':'',
'host':'localhost',

}


try:
    conn = psycopg2.connect(**params)
    cur = conn.cursor()
    print("i'm connected")

except psycopg2.Error as e: #if database connection results in an error print the following
    print("I am unable to connect to the database: ", e)




i'm connected


In [59]:
def desc_stats_geo(table='hmdalar2014', field= 'amount', where_max=None, where_min=None, geo_level=None):
"""generates SQL statement to get descriptive statistics for a LAR field and group them by a geography if one is specified.
min and max can also be specified to focus analysis on a specific numeric range"""
    #FIXME change default values to none and implement value error
    #establish geo level for grouping
    if geo_level == 'MSA':
        geo_sql = 'msa AS msa,'
        geo_group = ',msa'
    elif geo_level == 'state':
        geo_sql = 'state AS state,'
        geo_group = ',state'
    elif geo_level == 'county':
        geo_sql = 'CONCAT(state,county) AS county,'
        geo_group = ',CONCAT(state,county)'
    elif geo_level == 'tract':
        geo_sql = 'CONCAT(state,county,tract) AS tract,'
        geo_group = ',CONCAT(state,county,tract)'
    else:
        geo_sql = ''
        geo_group = ''
        
    base_sql = """SELECT year, {geo_level} COUNT({field}) AS count_loans, SUM({field}::INT) AS sum_amt, AVG({field}::INT) AS avg_{field}, 
    STDDEV({field}::INT) AS std_{field}, MIN({field}::INT) AS min_{field}, 
    MAX({field}::INT) AS max_{field}""".format(field=field, geo_level=geo_sql)
    table_sql = "\nFROM {table}\n".format(table=table) 
    where_sql = "WHERE property_type = '1' AND loan_purpose = '1' AND agency != '7' AND {field} NOT ILIKE '%NA%' ".format(field=field)
    if where_max:
        where_sql = where_sql + " AND {field}::INT < ".format(field=field) + str(where_max)
    if where_min:
        where_sql = where_sql + " AND {field}::INT > ".format(field=field) + str(where_min)
    group_sql = """ GROUP BY year {geo_level}""".format(geo_level=geo_group)
    
    return_sql = base_sql + table_sql + where_sql + group_sql
    print(return_sql)
    return return_sql 

In [67]:

#get descriptive statistics for the field named
#FIXME add an optional set of limits to look at sub distributions min/max= 10/600? set by cumulative probability
def get_desc_stats(geo_level=None, field='amount', year=2014, where_max=None, where_min=None):
    """"""
    #FIXME change default values to none and implement value error
    first = True
    while year > 2003: #loop over all years until schema change in LAR data
        table = 'hmdalar' + str(year) #set table name for query
        
        print(table)
        #cur.execute(desc_stats(table=table, field=field))        
        cur.execute(desc_stats_geo(table=table, field=field, where_max=where_max, where_min=where_min, geo_level=geo_level))        
        data_df = pd.DataFrame(cur.fetchall()) #convert query results to dataframe
        col_names = [desc[0] for desc in cur.description] #pull column names from cursor
        
        if len(data_df.columns) == len(col_names): #verify data was returned before naming columns
            print('setting column names')
            data_df.columns = col_names
        if first == True and len(data_df.columns) > 0: 
            dist_df = data_df.copy() #establish initial data frame
            first = False
        elif first == False and len(data_df.columns) > 0:
            print('merging dataframes')
            dist_df=dist_df.merge(data_df, how='outer') #merge subsequent data into initial data frame
        else:
            print('no data returned from query')
        
        #set upper and lower bounds on distribution
        dist_df['5_pct'] = dist_df['avg_{field}'.format(field=field)] - 2*dist_df['std_{field}'.format(field=field)]
        dist_df['95_pct'] = dist_df['avg_{field}'.format(field=field)] + 2*dist_df['std_{field}'.format(field=field)]
        
        print(dist_df.tail()) #print data to check results during execution
        
        path = 'dist_csvs/'
        if not os.path.exists(path):
            os.makedirs(path)
        year -= 1 #decrement year to select new table

    dist_df.to_csv(path + '{field}_desc_stats_{geo_level}.csv'.format(field=field, geo_level=geo_level),index=False)



In [70]:
get_desc_stats(geo_level='state', field='income')

hmdalar2014
SELECT year, state, COUNT(income) AS count_loans, SUM(income::INT) AS sum_amt, AVG(income::INT) AS avg_income, 
    STDDEV(income::INT) AS std_income, MIN(income::INT) AS min_income, 
    MAX(income::INT) AS max_income
FROM hmdalar2014
WHERE property_type = '1' AND loan_purpose = '1' AND agency != '7' AND income NOT ILIKE '%NA%'  GROUP BY year ,state
setting column names
    year state  count_loans  sum_amt            avg_income        std_income  \
48  2014    72        14314   970759   67.8188486796143636  128.684951021791   
49  2014    21        37289  3150960   84.5010592936254660  109.510473103616   
50  2014    53        68765  8254622  120.0410383189122373  150.972822266404   
51  2014    42        91582  9404526  102.6896770107663078  144.753674505933   
52  2014    15         8825  1569020  177.7926345609065156  284.448646456045   

    min_income  max_income                  5_pct                95_pct  
48           1        8673  -189.5510533639676364  325.1887

In [None]:
get_desc_stats(geo_level='msa', field='amount')

hmdalar2014
SELECT year,  COUNT(amount) AS count_loans, SUM(amount::INT) AS sum_amt, AVG(amount::INT) AS avg_amount, 
    STDDEV(amount::INT) AS std_amount, MIN(amount::INT) AS min_amount, 
    MAX(amount::INT) AS max_amount
FROM hmdalar2014
WHERE property_type = '1' AND loan_purpose = '1' AND agency != '7' AND amount NOT ILIKE '%NA%'  GROUP BY year 
setting column names
   year  count_loans    sum_amt            avg_amount        std_amount  \
0  2014      3438718  790062749  229.7550276004022429  300.103449094670   

   min_amount  max_amount                  5_pct                95_pct  
0           1       99999  -370.4518705889377571  829.9619257897422429  
hmdalar2013
SELECT year,  COUNT(amount) AS count_loans, SUM(amount::INT) AS sum_amt, AVG(amount::INT) AS avg_amount, 
    STDDEV(amount::INT) AS std_amount, MIN(amount::INT) AS min_amount, 
    MAX(amount::INT) AS max_amount
FROM hmdalar2013
WHERE property_type = '1' AND loan_purpose = '1' AND agency != '7' AND amount NOT ILIKE