## hud_api_2 notebook

In [1]:
import numpy as np
import pandas as pd
import os
import requests
from env import hud_token

- This notebook works out the data wrangling for the HUD .csv file that provides Fair Market Rent (FMR) data for aggregated areas. In the hud_api_test notebook, I explored using the HUD FMR Small Area data. We may ultimately use that data as well, but for the scope of the MVP we have decided to use the data aggregated by Metropolitan Statistical Area (MSA), because it better matches the market rental data we can retrieve from publicly available sources.

- The data in this .csv is provided by county, but counties belonging to the same MSA have matching values for all relevant years, indicating that the values are indeed calculated in aggregate for the entire MSA region.

In [2]:
# load the full national FMR file (decoded as latin1) into a DataFrame
df = pd.read_csv(
    'FMR_All_1983_2023_rev.csv',
    encoding='latin1',
)
# preview the first five rows
df.head(n=5)


Unnamed: 0,fips2010,fips2000,areaname23,name,msa23,fmr23_0,fmr23_1,fmr23_2,fmr23_3,fmr23_4,...,pop2010,fmr_area,census_region,pmsaname,cntyname,pop2017,pop2000,id_agis3,id_agis2,id
0,100199999,100199999.0,"Montgomery, AL MSA",Autauga County,METRO33860M33860,716,817,977,1241,1595,...,54571,5240.0,3.0,"Montgomery, AL MSA",Autauga County,55035,43671.0,MSA5240,MSA5240,100000001.0
1,100399999,100399999.0,"Daphne-Fairhope-Foley, AL MSA",Baldwin County,METRO19300M19300,924,928,1206,1534,1971,...,182265,5160.0,3.0,"Mobile, AL MSA",Baldwin County,203360,140415.0,MSA5160,MSA5160,100000003.0
2,100599999,100599999.0,"Barbour County, AL",Barbour County,NCNTY01005N01005,558,562,740,941,994,...,27457,10000005.0,3.0,"Barbour County, AL",Barbour County,26200,29038.0,CNTY01005,CNTY01005,100000005.0
3,100799999,100799999.0,"Birmingham-Hoover, AL HUD Metro FMR Area",Bibb County,METRO13820M13820,866,942,1075,1376,1494,...,22915,10000007.0,3.0,"Bibb County, AL",Bibb County,22580,20826.0,CNTY01007,CNTY01007,100000007.0
4,100999999,100999999.0,"Birmingham-Hoover, AL HUD Metro FMR Area",Blount County,METRO13820M13820,866,942,1075,1376,1494,...,57322,1000.0,3.0,"Birmingham, AL MSA",Blount County,57665,51024.0,MSA1000,MSA1000,100000009.0


In [3]:
# keep only the rows whose msa23 code matches our MSA
df.loc[df['msa23'] == 'METRO41700M41700']

Unnamed: 0,fips2010,fips2000,areaname23,name,msa23,fmr23_0,fmr23_1,fmr23_2,fmr23_3,fmr23_4,...,pop2010,fmr_area,census_region,pmsaname,cntyname,pop2017,pop2000,id_agis3,id_agis2,id
3829,4801999999,4802000000.0,"San Antonio-New Braunfels, TX HUD Metro FMR Area",Bandera County,METRO41700M41700,918,1057,1282,1631,1987,...,20485,480000019.0,3.0,"Bandera County, TX",Bandera County,21315,17645.0,CNTY48019,CNTY48019,4800000000.0
3834,4802999999,4803000000.0,"San Antonio-New Braunfels, TX HUD Metro FMR Area",Bexar County,METRO41700M41700,918,1057,1282,1631,1987,...,1714773,7240.0,3.0,"San Antonio, TX MSA",Bexar County,1892005,1392931.0,MSA7240,MSA7240,4800000000.0
3865,4809199999,4809200000.0,"San Antonio-New Braunfels, TX HUD Metro FMR Area",Comal County,METRO41700M41700,918,1057,1282,1631,1987,...,108472,7240.0,3.0,"San Antonio, TX MSA",Comal County,129100,78021.0,MSA7240,MSA7240,4800000000.0
3913,4818799999,4818800000.0,"San Antonio-New Braunfels, TX HUD Metro FMR Area",Guadalupe County,METRO41700M41700,918,1057,1282,1631,1987,...,131533,7240.0,3.0,"San Antonio, TX MSA",Guadalupe County,150890,89023.0,MSA7240,MSA7240,4800000000.0
4066,4849399999,4849400000.0,"San Antonio-New Braunfels, TX HUD Metro FMR Area",Wilson County,METRO41700M41700,918,1057,1282,1631,1987,...,42918,7240.0,3.0,"San Antonio, TX MSA",Wilson County,47205,32408.0,MSA7240,MSA7240,4800000000.0


In [4]:
# this is, in fact, all the counties in the San Antonio-New Braunfels MSA

In [5]:
# going to test a function to groupby and aggregate the county names to a list
def string_to_list(x):
    '''
    Collect every item of the iterable `x` into a new list.

    Intended as a groupby aggregation helper (e.g. gathering county names),
    but it works on any iterable; a plain string yields its characters.
    '''
    return [item for item in x]

In [6]:
# msa23 code of the area we are working with; it is the same code used in the
# filter cell above, which returned the San Antonio-New Braunfels counties
entity_id = 'METRO41700M41700'

In [7]:
# Now grab the columns of interest
def get_hud_macro_data(entity_id, csv_path='FMR_All_1983_2023_rev.csv'):
    '''
    Build a yearly FMR time series (2017-2023) for one HUD area.

    Reads the national FMR .csv, keeps the rows whose `msa23` code equals
    `entity_id`, selects the `fmrYY_2` columns (presumably the 2-bedroom
    FMR -- confirm against the HUD data dictionary), and reshapes them so
    that each year is a row.

    Parameters
    ----------
    entity_id : str
        Value to match against the `msa23` column, e.g. 'METRO41700M41700'.
    csv_path : str, optional
        Path of the FMR .csv to read. Defaults to the file used throughout
        this notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by `date` (January 1 of each year, ordered 2023 down to 2017)
        with a column `fmr`. Rows are collapsed per distinct 2023 value, so if
        the matched rows disagree on the 2023 value, one extra integer-named
        column (1, 2, ...) appears per additional distinct value.

    Raises
    ------
    ValueError
        If no row of the file has `msa23 == entity_id`.
    '''
    # source column -> year label; insertion order fixes the row order of the result
    year_by_column = {
        'fmr23_2': '2023', 'fmr22_2': '2022', 'fmr21_2': '2021', 'fmr20_2': '2020',
        'fmr19_2': '2019', 'fmr18_2': '2018', 'fmr17_2': '2017',
    }

    national = pd.read_csv(csv_path, encoding='latin1')
    area = national[national['msa23'] == entity_id].reset_index(drop=True)
    if area.empty:
        # without this guard the function would quietly return a frame with no 'fmr' column
        raise ValueError(f'no rows found with msa23 == {entity_id!r}')

    yearly = area[list(year_by_column)].rename(columns=year_by_column)
    # collapse the per-county rows to one row per distinct 2023 value,
    # then transpose so the year labels become the index
    yearly = yearly.groupby('2023').first().reset_index().T
    yearly = yearly.rename(columns={0: 'fmr'})
    # the index labels are bare years ('2023'), so parse them with '%Y';
    # '%Y-%m' does not match them and is rejected by strict format parsing
    yearly['date'] = pd.to_datetime(yearly.index, format='%Y')
    return yearly.set_index('date')

In [8]:
# build the yearly FMR frame for the area selected by entity_id.
# NOTE: this rebinds `df`, replacing the national table loaded from the CSV earlier.
df = get_hud_macro_data(entity_id)
df

Unnamed: 0_level_0,fmr
date,Unnamed: 1_level_1
2023-01-01,1282
2022-01-01,1165
2021-01-01,1114
2020-01-01,1051
2019-01-01,1050
2018-01-01,1001
2017-01-01,964


In [9]:
# expand the yearly values to month-end rows, carrying each value forward,
# then move every timestamp three months earlier
# (presumably to line the values up with a fiscal year that starts in October -- confirm)
df_shifted = (
    df
    .resample('M')
    .ffill()
    .shift(periods=-3, freq='M')
)
df_shifted

Unnamed: 0_level_0,fmr
date,Unnamed: 1_level_1
2016-10-31,964
2016-11-30,964
2016-12-31,964
2017-01-31,964
2017-02-28,964
...,...
2022-06-30,1165
2022-07-31,1165
2022-08-31,1165
2022-09-30,1165


In [10]:
# inspect the index type, row count and dtype of the shifted monthly frame
df_shifted.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 73 entries, 2016-10-31 to 2022-10-31
Freq: M
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   fmr     73 non-null     int64
dtypes: int64(1)
memory usage: 1.1 KB


In [11]:
# relabel the index as 'YYYY-MM' strings.
# NOTE: this mutates df_shifted in place and replaces its DatetimeIndex with a plain
# Index of strings, so re-running this cell on the converted frame is expected to fail
# (a string Index has no strftime) and datetime-only operations no longer apply.
df_shifted.index = df_shifted.index.strftime('%Y-%m')
df_shifted

Unnamed: 0_level_0,fmr
date,Unnamed: 1_level_1
2016-10,964
2016-11,964
2016-12,964
2017-01,964
2017-02,964
...,...
2022-06,1165
2022-07,1165
2022-08,1165
2022-09,1165


In [14]:
# re-inspect after the strftime conversion; the index is no longer a DatetimeIndex
df_shifted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73 entries, 2016-10 to 2022-10
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   fmr     73 non-null     int64
dtypes: int64(1)
memory usage: 1.1+ KB


In [12]:
# scratch example: five month-end dates starting 2023-02-28, paired with the values 0-4
index = pd.date_range(start='2023-02-28', periods=5, freq='M')
series = pd.Series(data=range(5), index=index)

In [13]:
# df.groupby('msa').agg({'entity_id':'first', 'counties':string_to_list, '2023':'first', 
#                       '2022':'first','2021':'first', '2020':'first', '2019':'first', 
#                        '2018':'first', '2017':'first'}).T