In [135]:
import json
import requests
import pandas as pd
import io


def connect_ss(UID=None, PWD=None, DSN="wdcsqlaw02"):
    """Connect to SQL Server through ODBC and return the open connection.

    Parameters
    ----------
    UID, PWD : str or None
        Login name and password. When either is None the matching "uid" /
        "pwd" entry of the module-level ``creds`` dict is used instead.
    DSN : str
        Name of the ODBC data source to connect to.

    Returns
    -------
    The connection object returned by ``pyodbc.connect``.
    """
    # The notebook's import cell does not import pyodbc, so import it here;
    # this keeps the function usable after Restart & Run All.
    import pyodbc

    # Explicit arguments win; fall back to the stored credentials otherwise.
    if UID is None:
        UID = creds["uid"]
    if PWD is None:
        PWD = creds["pwd"]

    conn = pyodbc.connect(DSN=DSN, UID=UID, PWD=PWD) #works on putty and EoD
    print("Connected SQL Server")
    return conn

# Read the stored login (connect_ss looks up its "uid" and "pwd" entries)
# from a JSON file kept one directory above the notebook.
with open("../creds.json", 'r') as f:
    creds_text = f.read()
creds = json.loads(creds_text)
    
#https://raw.githubusercontent.com/cfpb/hmda-platform/master/model/jvm/src/main/resources/tract_to_cbsa_2015.txt
#https://raw.githubusercontent.com/cfpb/hmda-platform/master/census/src/main/resources/state.csv

In [143]:
# Download the pipe-delimited state lookup table from the cfpb/hmda-platform repository.
state_data_url = 'https://raw.githubusercontent.com/cfpb/hmda-platform/master/census/src/main/resources/state.csv'
state_response = requests.get(state_data_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
state_response.raise_for_status()
state_data = state_response.content
# dtype=object reads every column as-is rather than letting pandas infer numeric types.
state_codes = pd.read_csv(io.StringIO(state_data.decode('utf-8')),sep="|", dtype=object)
state_codes.head()

Unnamed: 0,STATE,STUSAB,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778


In [153]:
# Download the pipe-delimited tract-to-CBSA file from the cfpb/hmda-platform repository.
cbsa_url = 'https://raw.githubusercontent.com/cfpb/hmda-platform/master/model/jvm/src/main/resources/tract_to_cbsa_2015.txt'
cbsa_response = requests.get(cbsa_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
cbsa_response.raise_for_status()
cbsa_data = cbsa_response.content
# The file is read with header=None, so cbsa_cols supplies the names for all
# eleven columns and use_cols picks the five that are kept.
use_cols = ['state','countyFips', 'geoIdMsa', 'metDivFp', 'tracts']
cbsa_cols = ['name', 'metDivName', 'state', 'countyFips', 'county', 'tracts','geoIdMsa', 'metDivFp', 'smallCounty', 
             'stateCode', 'tractDecimal']
cbsa_df = pd.read_csv(io.StringIO(cbsa_data.decode('utf-8')),sep="|", usecols=use_cols, 
                      header=None, names=cbsa_cols,dtype=object)
cbsa_df.head()

Unnamed: 0,state,countyFips,tracts,geoIdMsa,metDivFp
0,6,6037,535501,31080,31084.0
1,41,41005,22208,38900,
2,4,4013,610900,38060,
3,42,42017,103400,37980,33874.0
4,37,37183,54403,39580,


In [9]:
# Read the pipe-delimited HMDA LAR test file; dtype=object keeps every column as read.
lar_file = "data/hmda_data_state_01.txt"
lar_data = pd.read_csv(lar_file, sep="|", dtype=object)
lar_data.head()

Unnamed: 0,MSA_MD,State_Code,County_Code,Census_Tract_Number,Area_Population,Minority_Population_pct,FFIEC_Median_Family_Income,Tract_to_MSA_MD_Income,Num_of_Owner_occupied_units,Num_of_1_to_4_Family_units,fips
0,33860,1,1,201.0,1912,16.27,60400,132.09,598,751,1001020100
1,33860,1,1,203.0,3373,24.76,60400,91.74,957,1349,1001020300
2,33860,1,1,208.02,10435,17.12,60400,118.13,3113,3865,1001020802
3,33860,1,1,205.0,10766,21.62,60400,137.89,2406,3295,1001020500
4,33860,1,1,207.0,2891,20.65,60400,72.34,710,1246,1001020700


In [104]:
#Census SF1 data fields for population.
#SF1 2010 variable list: https://api.census.gov/data/2010/sf1/variables.html
#ACS 1 variable list: https://api.census.gov/data/2015/acs/acs1/variables.html
#ACS 5 variable list: https://api.census.gov/data/2015/acs/acs5/variables.html
#P009001 is total by geography (tract)
#P009002 Hispanic or Latino population
#P009005 Not Hispanic or Latino: Population of one race: White alone
#P009006 Not Hispanic or Latino: Population of one race: Black or African American alone
#P009007 Not Hispanic or Latino: Population of one race: American Indian and Alaska Native alone
#P009008 Not Hispanic or Latino: Population of one race: Asian alone
#P009009 Not Hispanic or Latino: Population of one race: Native Hawaiian and Other Pacific Islander alone
#P009010 Not Hispanic or Latino: Population of one race: Some Other Race alone
#P009011 Not Hispanic or Latino: Population of one race: Two or More Races:
#NOTE(review): the IDs in the comments above are spelled P009001 while the request
#below uses P0090001 -- confirm which spelling the API expects.
sf2010_data_fields = ['P0090001','P0090005']


#example of a request for all of the fields listed above:
#sf2010_url = 'https://api.census.gov/data/2010/sf1?get=P0090001,P0090002,P0090005,P0090006,P0090007,P0090008,P0090009,P0090010,P0090011&for=tract:*&in=state:02'

#build SF1 2010 URL: comma-separated field list, every tract in state 01
sf2010_url = ('https://api.census.gov/data/2010/sf1?get=' + ",".join(sf2010_data_fields)
              + "&for=tract:*&in=state:01")
print(sf2010_url)
sf2010_data = requests.get(sf2010_url, timeout=60)
#stop here on an HTTP error instead of trying to parse an error body as JSON
sf2010_data.raise_for_status()
#row 0 is dropped and replaced by the names below (presumably it is the API's header row)
sf2010_df = pd.DataFrame(json.loads(sf2010_data.content)).drop(index=0)
sf2010_pop_cols = ["total", "white_only", "state", "county", "tract"]

sf2010_df.columns = sf2010_pop_cols
#concatenate the state, county and tract code strings into one identifier
sf2010_df['fips'] = sf2010_df['state'] + sf2010_df['county'] + sf2010_df['tract']
def min_pct(row):
    """Return the minority share of the total population as a percentage.

    Reads row['total'] and row['min_pop']. When the total is greater than 0
    the result is min_pop / total * 100 rounded to two decimals; otherwise
    it is 0. The value is also stored on the row under 'min_pct'.
    """
    total_pop = row['total']
    # Guard against dividing by zero for tracts with no population.
    share = round((row['min_pop'] / total_pop * 100),2) if total_pop > 0 else 0
    row['min_pct'] = share
    return row['min_pct']

#convert the count columns (all of sf2010_pop_cols except the last three) to integers
sf2010_df = sf2010_df.astype({col: int for col in sf2010_pop_cols[:-3]})
#minority population = total minus the white_only count, computed column-wise
sf2010_df['min_pop'] = sf2010_df['total'] - sf2010_df['white_only']
#min_pct handles the zero-population case, so it is still applied row by row
sf2010_df['min_pct'] = sf2010_df.apply(min_pct, axis=1)
sf2010_df.head(10)

https://api.census.gov/data/2010/sf1?get=P0090001,P0090005&for=tract:*&in=state:01


Unnamed: 0,total,white_only,state,county,tract,fips,min_pop,min_pct
1,1912,1601,1,1,20100,1001020100,311,16.27
2,2170,844,1,1,20200,1001020200,1326,61.11
3,3373,2538,1,1,20300,1001020300,835,24.76
4,4386,4030,1,1,20400,1001020400,356,8.12
5,10766,8438,1,1,20500,1001020500,2328,21.62
6,3668,2672,1,1,20600,1001020600,996,27.15
7,2891,2294,1,1,20700,1001020700,597,20.65
8,3081,2660,1,1,20801,1001020801,421,13.66
9,10435,8649,1,1,20802,1001020802,1786,17.12
10,5675,4786,1,1,20900,1001020900,889,15.67


In [101]:
# Show up to the first 20 LAR rows.
# NOTE(review): presumably for comparing by eye with the SF1-derived table -- confirm.
lar_data.head(20)

Unnamed: 0,MSA_MD,State_Code,County_Code,Census_Tract_Number,Area_Population,Minority_Population_pct,FFIEC_Median_Family_Income,Tract_to_MSA_MD_Income,Num_of_Owner_occupied_units,Num_of_1_to_4_Family_units,fips
0,33860,1,1,201.0,1912,16.27,60400,132.09,598,751,1001020100
1,33860,1,1,203.0,3373,24.76,60400,91.74,957,1349,1001020300
2,33860,1,1,208.02,10435,17.12,60400,118.13,3113,3865,1001020802
3,33860,1,1,205.0,10766,21.62,60400,137.89,2406,3295,1001020500
4,33860,1,1,207.0,2891,20.65,60400,72.34,710,1246,1001020700
5,33860,1,1,210.0,2894,23.7,60400,93.88,931,1281,1001021000
6,33860,1,1,209.0,5675,15.67,60400,98.6,1676,2210,1001020900
7,33860,1,1,202.0,2170,61.11,60400,84.6,439,816,1001020200
8,33860,1,1,211.0,3320,56.81,60400,89.8,1016,1482,1001021100
9,33860,1,1,204.0,4386,8.12,60400,115.73,1549,1906,1001020400


In [125]:
#ACS5 data fields
#B19013_001E median family income (MFI)
#B25032_002E number of owner occupied units
#B25024_001E: Estimate Total (not used)
#B25024_002E: Estimate Total 1, detached
#B25024_003E: Estimate Total 1, attached
#B25024_004E: Estimate Total 2
#B25024_005E: Estimate Total 3 or 4
#B25024_010E Estimate Total Renter occupied Mobile home
#B25024_011E: Estimate Total Renter occupied Boat, RV, van, etc.
#B25035_001E: Median Housing Age
#NOTE(review): verify the labels above against the ACS5 variable list
#(in particular the B19013 and B25024_010E/011E descriptions) -- TODO confirm
acs5_fields = ['B19013_001E','B25032_002E','B25024_002E','B25024_003E','B25024_004E','B25024_005E','B25024_010E',
               'B25024_011E','B25035_001E']
#example of the finished request:
#acs5_url = "https://api.census.gov/data/2010/acs5?get=B19013_001E,B25032_002E,B25024_002E,B25024_003E,B25024_004E,B25024_005E,B25024_010E,B25024_011E,B25035_001E&for=tract:*&in=state:01"
#build ACS5 URL: comma-separated field list, every tract in state 01
acs5_url = "https://api.census.gov/data/2010/acs5?get=" + ",".join(acs5_fields) + "&for=tract:*&in=state:01"
print(acs5_url)
acs5_tract = requests.get(acs5_url, timeout=60)
#stop here on an HTTP error instead of trying to parse an error body as JSON
acs5_tract.raise_for_status()
#row 0 is dropped and replaced by the names below (presumably it is the API's header row)
acs5_df = pd.DataFrame(json.loads(acs5_tract.content)).drop(index=0)
#structure-type counts that are summed into total_1_to_4 and then dropped;
#"detached" comes first to match the order of B25024_002E / B25024_003E in acs5_fields
acs5_unit_cols = ["detached", "attached", "2_units", "3_or_4_units", "mobile", "boat_van"]
acs5_int_cols = ["mfi", "onwer_occ"] + acs5_unit_cols + ["median_yr_built"]
acs5_cols = acs5_int_cols + ["state", "county", "tract"]
acs5_df.columns = acs5_cols
#fill missing values on the frame itself; calling fillna(inplace=True) on a selected
#column is chained assignment and does not update the frame under pandas Copy-on-Write
acs5_df = acs5_df.fillna({'mfi': 0, 'median_yr_built': 0})
acs5_df = acs5_df.astype({col: int for col in acs5_int_cols})
acs5_df['total_1_to_4'] = acs5_df[acs5_unit_cols].sum(axis=1)
acs5_df = acs5_df.drop(columns=acs5_unit_cols)
acs5_df.head()

https://api.census.gov/data/2010/acs5?get=B19013_001E,B25032_002E,B25024_002E,B25024_003E,B25024_004E,B25024_005E,B25024_010E,B25024_011E,B25035_001E&for=tract:*&in=state:01


Unnamed: 0,mfi,onwer_occ,median_yr_built,state,county,tract,total_1_to_4
1,70222,598,1976,1,1,20100,751
2,41091,439,1976,1,1,20200,816
3,44031,957,1976,1,1,20300,1349
4,56627,1549,1969,1,1,20400,1906
5,68317,2406,1997,1,1,20500,3295


In [119]:
# Displays the whole LAR frame.
# NOTE(review): consider .head() if this file grows beyond a handful of rows.
lar_data

Unnamed: 0,MSA_MD,State_Code,County_Code,Census_Tract_Number,Area_Population,Minority_Population_pct,FFIEC_Median_Family_Income,Tract_to_MSA_MD_Income,Num_of_Owner_occupied_units,Num_of_1_to_4_Family_units,fips
0,33860,01,001,0201.00,00001912,016.27,00060400,132.09,00000598,00000751,01001020100
1,33860,01,001,0203.00,00003373,024.76,00060400,091.74,00000957,00001349,01001020300
2,33860,01,001,0208.02,00010435,017.12,00060400,118.13,00003113,00003865,01001020802
3,33860,01,001,0205.00,00010766,021.62,00060400,137.89,00002406,00003295,01001020500
4,33860,01,001,0207.00,00002891,020.65,00060400,072.34,00000710,00001246,01001020700
5,33860,01,001,0210.00,00002894,023.70,00060400,093.88,00000931,00001281,01001021000
6,33860,01,001,0209.00,00005675,015.67,00060400,098.60,00001676,00002210,01001020900
7,33860,01,001,0202.00,00002170,061.11,00060400,084.60,00000439,00000816,01001020200
8,33860,01,001,0211.00,00003320,056.81,00060400,089.80,00001016,00001482,01001021100
9,33860,01,001,0204.00,00004386,008.12,00060400,115.73,00001549,00001906,01001020400
