In [1]:
import pandas as pd
import numpy as np

import csv
from scipy import stats
import requests
from datetime import datetime
import os
import geopandas

from env import hud_token

In [5]:
### Wrangle Dataset 1 ###

# HUD FMR for SA/NB, TX Metro Area
def get_hud_metro_data():
    '''
    Arguments: none
    Actions: reads 'FMR_All_1983_2023_rev.csv' from the working directory using latin1 encoding
    Returns: a DataFrame of all info from the csv
    '''
    return pd.read_csv('FMR_All_1983_2023_rev.csv', encoding='latin1')

def clean_hud_metro_data(entity_id = 'METRO41700M41700'):
    '''
    Arguments: entity_id, the value matched against the `msa23` column
    Actions:
        Keep the rows whose msa23 equals entity_id
        Keep the two-bedroom FMR columns for 2017 through 2023 and rename them to their year
        Take the first matching row and transpose it into a single 'fmr' column
        Convert the year labels to datetimes
        Resample to monthly with forward fill, then shift the index back 3 months
        Convert the index to 'YYYY-MM' strings
    Returns: a one-column ('fmr') DataFrame indexed by 'YYYY-MM' strings
    '''
    # source column -> year label, newest first (same order as the csv columns are selected)
    year_labels = {
        'fmr23_2': '2023',
        'fmr22_2': '2022',
        'fmr21_2': '2021',
        'fmr20_2': '2020',
        'fmr19_2': '2019',
        'fmr18_2': '2018',
        'fmr17_2': '2017',
    }

    raw = get_hud_metro_data()

    # rows for the requested entity, renumbered from 0 so the first row is labelled 0
    entity_rows = raw[raw.msa23 == entity_id].reset_index(drop=True)

    # only the columns of interest, under their year names
    by_year = entity_rows[list(year_labels)].rename(columns=year_labels)

    # first row only, flipped so years become the index and the value column is 'fmr'
    fmr = by_year.iloc[[0]].T.rename(columns={0: 'fmr'})

    # year strings -> datetimes
    fmr.index = pd.to_datetime(fmr.index)

    # monthly rows, forward filled, then moved 3 months earlier so each
    # value starts in October (federal fiscal year)
    monthly = fmr.resample('M').ffill()
    monthly = monthly.shift(periods=-3, freq='M')

    # string form of the index
    monthly.index = monthly.index.strftime('%Y-%m')

    return monthly


def get_mmr_data():
    '''
    Arguments: none
    Returns: the contents of 'HUDpro_amr_unmerged.csv' as a DataFrame
    '''
    return pd.read_csv('HUDpro_amr_unmerged.csv')

def clean_mmr_data():
    '''
    Arguments: none
    Actions:
        Gets the mmr data
        Keeps the rows whose City is 'SAN ANTONIO, TX' and transposes them so dates become the index
        Drops the 'City' and 'Beds' rows and the columns labelled 246 and 248
        Renames the remaining column (247) to 'mmr' and names the index 'date'
        Converts the index to datetime and the 'mmr' strings (e.g. '$1,234') to integers
    Returns: a one-column ('mmr') DataFrame with a datetime index
    '''
    # get data
    df = get_mmr_data()

    # only the San Antonio rows, transposed so each original row becomes a column
    df = df.loc[df['City'] == 'SAN ANTONIO, TX'].T

    # NOTE(review): 246/247/248 are hard-coded row labels from the csv;
    # presumably 247 is the bedroom size of interest - confirm against the file
    df = (df.drop(index=['City', 'Beds'], columns=[246, 248])
            .rename_axis('date')
            .rename({247: 'mmr'}, axis=1))

    # changing index to date
    df.index = pd.to_datetime(df.index)

    # strip the currency formatting; regex=False so '$' is always treated as a
    # literal character and not as the end-of-string anchor
    df['mmr'] = (df['mmr'].str.replace('$', '', regex=False)
                          .str.replace(',', '', regex=False)
                          .astype(int))

    return df

def wrangle_metro_data():
    '''
    Arguments: none
    Actions:
        Gets the cleaned hud (fmr) and market (mmr) data sets
        Changes the fmr index to datetime
        Outer-merges both data sets on their indexes
        Fills missing fmr values with 1286
        Drops the rows that still contain nulls
        Adds the difference and the percent difference (in terms of fmr) between mmr and fmr
        Splits the rows by position into train / validate / test
    Returns: (df, train, validate, test) where test is the last 12 rows,
        validate the 24 rows before that and train everything earlier
    Modules:
        import pandas as pd
    Notes: csv's required for hud and sanantonio market rent data must be in the same folder as this function for it to work 
    '''
    # get data
    hud = clean_hud_metro_data()
    mmr = clean_mmr_data()

    # converting the hud index datetime
    hud.index = pd.to_datetime(hud.index)

    # creating merged df
    df = pd.merge(left=hud, right=mmr, how='outer', right_index=True, left_index=True)

    # filling in missing fmr values; assigned back instead of using
    # inplace=True on the column, which does not update df under Copy-on-Write
    # NOTE(review): 1286 is hard-coded; presumably the latest published fmr - confirm
    df['fmr'] = df['fmr'].fillna(1286)

    # dropping the null values that are not necessary
    df = df.dropna()

    # creating the difference 
    df['diff'] = df['mmr'] - df['fmr']

    # creating the percent difference in terms of fmr
    df['percent_diff'] = (df['mmr'] - df['fmr']) / df['fmr']

    # positional split: last 12 rows (test), previous 24 rows (validate), the rest (train)
    test = df[-12:]
    validate = df[-36:-12]
    train = df[:-36]

    # return the merged df and the train, validate and test splits
    return df, train, validate, test

#### Wrangle Dataset 2 ####
def get_hud_zipcode_data():
    '''
    Arguments: none
    Actions:
        If 'hud_zipcode_data.csv' exists in the working directory, reads it
        Otherwise requests the FMR data for entity METRO41700M41700 from the
        HUD API for every year from 2018 through the current year, stacks the
        'basicdata' records with a 'year' column and caches them to the csv
    Returns: a DataFrame of the zip code level HUD data
    Raises: requests.HTTPError when the API answers with an error status
    Modules:
        import os
        import pandas as pd
        import requests
        from env import hud_token
        from datetime import datetime
    '''
    # a variable to hold the expected or future file name
    filename = '''hud_zipcode_data.csv'''

    # use the cached copy when it is present
    if os.path.isfile(filename):
        return pd.read_csv(filename, index_col=0)

    # set header for the api request
    headers = {'Authorization': f'Bearer {hud_token}'}

    # every year from 2018 through the current one, as strings
    years = [str(year) for year in range(2018, datetime.now().year + 1)]

    # one frame per year, combined once after the loop
    yearly_frames = []

    for year in years:

        # insert entity id and year into the url
        url = f'https://www.huduser.gov/hudapi/public/fmr/data/METRO41700M41700?year={year}'

        # bounded wait so a stalled connection cannot hang the notebook
        response = requests.get(url, headers=headers, timeout=30)

        # fail loudly on an error status instead of on a missing key below
        response.raise_for_status()

        # take out the data as a dict
        data = response.json()

        # the per-zip-code records for this year
        dum = pd.DataFrame(data['data']['basicdata'])

        # add a year column
        dum['year'] = data['data']['year']

        yearly_frames.append(dum)

    # combine all years
    df = pd.concat(yearly_frames)

    # cache the data
    df.to_csv(filename)

    # exit and return the final df
    return df

def clean_hud_zipcode_data():
    '''
    Arguments: None
    Actions: 
        Set DateTimeIndex built from the 'year' column
        For each zip code: resample to monthly with forward fill, shift 3 months
        for the federal fiscal calendar, then extend month by month up to the
        current month and forward fill the added rows
        Normalise the index to the first day of each month
        Remove unneeded columns and rename 'Two-Bedroom' to 'two_bed_fmr'
        Change zip_code to a string
    Returns: clean hud zipcode dataframe indexed by 'date'
    Modules:
        from datetime import datetime
        import pandas as pd
    '''
    # get data
    hud_zip = get_hud_zipcode_data()

    # convert to year format
    hud_zip['date'] = pd.to_datetime(hud_zip['year'], format='%Y')

    # set the index
    hud_zip = hud_zip.set_index('date')

    # one finished frame per zip code, combined once after the loop
    monthly_frames = []

    # for each unique zip code
    for zip_code in hud_zip.zip_code.unique().tolist():

        # isolate the zipcode
        df_zip = hud_zip[hud_zip.zip_code == zip_code]

        # resample and shift 3 months
        df_zip = df_zip.resample('M').ffill().shift(periods=-3, freq='M')

        # months still missing after the shift: start at the month AFTER the
        # last existing one so that month is not added a second time
        first_missing = pd.Period(df_zip.index[-1], freq='M') + 1
        missing_dates = pd.period_range(start=first_missing, end=datetime.now(), freq='M')

        # use timestamps so the combined index stays a single datetime type
        missing_dates = missing_dates.to_timestamp()

        # append the missing months and forward fill their values
        df_zip = pd.concat([df_zip, pd.DataFrame(index=missing_dates)]).ffill()

        monthly_frames.append(df_zip)

    # combine every zip code
    df = pd.concat(monthly_frames)

    # collapse every date to the first day of its month
    df.index = pd.to_datetime(df.index.strftime('%Y-%m'), format='%Y-%m')

    # set index name 
    df.index.name = 'date'

    # drop unnecessary columns and rename column
    df = df.drop(['Efficiency', 'One-Bedroom', 'Three-Bedroom',
       'Four-Bedroom', 'year'], axis=1).rename({'Two-Bedroom':'two_bed_fmr'}, axis=1)

    # change zipcode dtype (via int to drop any decimal part left by the fill)
    df['zip_code'] = df['zip_code'].astype(int).astype(str)

    return df

# 
def clean_rapid_zipcode_data(filename = 'rapid_zipcode_data.csv'):
    '''
    Argument: the filename for the rapid_api csv data as a string literal
    Actions:
        Reads the csv (first column as index) and parses each value of the 'rentalData' row
        Flattens every entry of history -> date -> 'detailed' into one row, tagged with its date and zipcode
        Indexes by date (sorted), keeps the rows with bedrooms == 2 and drops the bedrooms column
        Renames the remaining columns, by position, to
        average_rent, min_rent, max_rent, num_properties, zip_code
    Returns: a DataFrame of two-bedroom rental history indexed by 'date'
    Modules:
        import pandas as pd
    '''
    # get the data from the csv name
    raw = pd.read_csv(filename, index_col=0)

    # SECURITY NOTE(review): eval executes whatever Python the csv contains;
    # only run this on a trusted file
    rental_data = raw.T.rentalData.apply(eval)

    # one flat record per (zipcode, date, detailed entry); building the frame
    # once avoids re-copying the growing frame on every iteration
    records = []
    for zipcode, rentals in rental_data.items():
        for date, snapshot in rentals['history'].items():
            stamp = pd.to_datetime(date)
            for detail in snapshot['detailed']:
                # copy the entry instead of mutating the parsed data
                records.append({**detail, 'date': stamp, 'zipcode': zipcode})

    df = pd.DataFrame(records)

    # set index to date
    df = df.set_index('date').sort_index()

    # only 2 bedrooms, then the bedrooms column is no longer needed
    df = df[df['bedrooms'] == 2].drop('bedrooms', axis=1)

    # new names are matched to the existing columns by position
    new_cols = ['average_rent', 'min_rent', 'max_rent', 'num_properties', 'zip_code']
    df = df.rename(columns = dict(zip(df.columns, new_cols)))

    # exit function and return newly created df
    return df

def wrangle_zipcode_data():
    '''
    Arguments: none
    Actions:
        Gets the cleaned hud and rapid api zip code data
        Inner-merges them on date and zip_code
        Adds diff, percent_diff, the three afford_* flags and the affordability score
    Returns: wrangled df
    Modules:
        import pandas as pd
    '''
    # cleaned inputs (hud first, then rapid)
    hud_fmr = clean_hud_zipcode_data()
    market = clean_rapid_zipcode_data()

    # keep only the date / zip code pairs present in both
    merged = market.merge(hud_fmr, how='inner', on=['date', 'zip_code'])

    # gap between average market rent and FMR, absolute and as a percent of FMR
    gap = merged.average_rent - merged.two_bed_fmr
    merged['diff'] = gap
    merged['percent_diff'] = (gap / merged.two_bed_fmr) * 100

    # a rent level is flagged affordable when it does not exceed the FMR
    flag_sources = (
        ('afford_min', 'min_rent'),
        ('afford_avg', 'average_rent'),
        ('afford_max', 'max_rent'),
    )
    for flag, rent_col in flag_sources:
        merged[flag] = merged[rent_col] - merged['two_bed_fmr'] <= 0

    # score = how many of the three flags are True
    merged['affordability'] = merged[['afford_min', 'afford_max', 'afford_avg']].sum(axis=1)

    return merged

### Wrangle Dataset 3 ###
def clean_zcta_gdf(filename = 'zcta_of_interest.shp'):
    '''
    Arguments: filename of the zip code tabulation area shapefile
    Actions:
        Reads the file into a GeoDataFrame
        Renames 'ZCTA5CE20' to 'zip_code' so it can be merged with the other DataFrames
        Drops the columns that are not needed
    Returns: the trimmed GeoDataFrame
    '''
    # columns that are not used downstream
    unused = ['GEOID20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20',
           'AWATER20', 'INTPTLAT20', 'INTPTLON20']

    shapes = geopandas.read_file(filename)

    # rename for merge, then trim
    shapes = shapes.rename({'ZCTA5CE20':'zip_code'}, axis=1)
    return shapes.drop(unused, axis=1)

def wrangle_gdf():
    '''
    Arguments: None
    Actions:
        Loads the wrangled zip code data and the zcta shapes
        Keeps zip_code, affordability and num_properties for 2023-05
        Inner-merges them with the shapes on zip_code and indexes by zip_code
    Returns: a geodataframe with affordability information for 2023-05
    Module: 
        import geopandas
        import pandas as pd
    '''
    zip_df = wrangle_zipcode_data()
    shapes = clean_zcta_gdf()

    # columns of interest
    wanted = ['zip_code', 'affordability', 'num_properties']

    # rows for the month of interest only
    month_rows = zip_df.loc['2023-05'][wanted]

    # attach the geometry and index by zip code
    merged = month_rows.merge(shapes, how='inner', on='zip_code').set_index('zip_code')

    return geopandas.GeoDataFrame(merged)

In [234]:
# run the full zip code wrangle and display the resulting DataFrame
wrangle_zipcode_data()

Unnamed: 0_level_0,average_rent,min_rent,max_rent,num_properties,zip_code,two_bed_fmr,diff,percent_diff,afford_min,afford_avg,afford_max,affordability
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-04-01,750.00,750,750,2,78002,890.0,-140.00,-15.730337,True,True,True,3
2020-04-01,898.82,625,1250,22,78221,900.0,-1.18,-0.131111,True,True,False,2
2020-04-01,860.00,625,1095,2,78222,970.0,-110.00,-11.340206,True,True,False,2
2020-04-01,916.40,800,1050,5,78219,980.0,-63.60,-6.489796,True,True,False,2
2020-04-01,1250.00,1250,1250,1,78052,720.0,530.00,73.611111,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-01,1371.00,800,1865,36,78247,1500.0,-129.00,-8.600000,True,True,False,2
2023-06-01,1712.00,1195,3190,9,78245,1410.0,302.00,21.418440,True,False,False,1
2023-06-01,1272.00,675,2430,33,78213,1300.0,-28.00,-2.153846,True,True,False,2
2023-06-01,1629.00,900,3600,117,78212,1190.0,439.00,36.890756,True,False,False,1


In [226]:
# inspect the index of `hud`
# NOTE(review): `hud` is not assigned in any visible cell - confirm where it comes from
hud.index

Index(['2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03',
       '2018-04', '2018-05', '2018-06', '2018-07',
       ...
       '2022-10', '2022-10', '2022-11', '2022-12', '2023-01', '2023-02',
       '2023-03', '2023-04', '2023-05', '2023-06'],
      dtype='object', name='date', length=9576)

In [222]:
# name the index of `hud` 'date'
hud.index.name = 'date'

In [213]:
# clean_rapid_zipcode_data()

In [200]:
# df = get_hud_zipcode_data()

In [129]:
# load the HUD zip code data, add a datetime 'date' column built from 'year'
# and use it as the index
df = (
    get_hud_zipcode_data()
    .assign(date=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
    .set_index('date')
)

# earlier attempt, left disabled:
# # resample based on the zipcode
# df = df.groupby('zip_code').resample('M').ffill()

# # drop the zpicode index
# df = df.droplevel(0)


In [91]:
# empty frame with df's columns, indexed by monthly periods running from
# df's last index label through the current month
missing_months = pd.DataFrame(columns=df.columns, index=pd.period_range(start = df.index[-1], end = datetime.now(), freq='M'))

In [76]:
# move every index label back by three month-end offsets (freq is given, so the index shifts rather than the values)
df = df.shift(periods=-3, freq='M')

In [97]:
# display df; reindex() is called here without any new labels
df.reindex()

Unnamed: 0_level_0,zip_code,Efficiency,One-Bedroom,Two-Bedroom,Three-Bedroom,Four-Bedroom,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,78002,570,710,880,1170,1420,2018
2017-11-30,78002,570,710,880,1170,1420,2018
2017-12-31,78002,570,710,880,1170,1420,2018
2018-01-31,78002,570,710,880,1170,1420,2018
2018-02-28,78002,570,710,880,1170,1420,2018
...,...,...,...,...,...,...,...
2022-06-30,78885,720,850,1030,1330,1630,2022
2022-07-31,78885,720,850,1030,1330,1630,2022
2022-08-31,78885,720,850,1030,1330,1630,2022
2022-09-30,78885,720,850,1030,1330,1630,2022


In [155]:
# get data
df = get_hud_zipcode_data()

# convert to year format
df['date'] = pd.to_datetime(df['year'], format='%Y')

# set the index
df = df.set_index('date')

# initialize a dataframe
df_monthly = pd.DataFrame()

# get all the unique zip codes
zip_codes = df.zip_code.unique().tolist()

# for each zip code
for zip_code in zip_codes:

    # isolate the zipcode
    df_zip = df[df.zip_code == zip_code]

    # resample and shift 3 months
    df_zip = df_zip.resample('M').ffill().shift(periods=-3, freq='M')

    # months still missing after the shift: start at the month AFTER the last
    # existing one so that month is not added twice, and use timestamps so the
    # combined index stays a single datetime type
    first_missing = pd.Period(df_zip.index[-1], freq='M') + 1
    missing_dates = pd.period_range(start = first_missing, end = datetime.now(), freq='M').to_timestamp()

    # concatenate the individual zipcode with the missing index dates and forward fill the missing components
    df_zip = pd.concat([df_zip, pd.DataFrame(index=missing_dates)]).ffill()

    # add the finished zipcode df to the complete df
    df_monthly = pd.concat([df_monthly, df_zip])

# pd.to_datetime(df_monthly.index.to_series())

In [172]:
# rewrite the index as 'YYYY-MM' strings
index_as_dates = pd.to_datetime(df_monthly.index.astype(str))
df_monthly.index = index_as_dates.strftime('%Y-%m')

# keep only the two-bedroom figure, under a friendlier name
unused_columns = ['Efficiency', 'One-Bedroom', 'Three-Bedroom',
       'Four-Bedroom', 'year']
df_monthly = df_monthly.drop(unused_columns, axis=1)
df_monthly = df_monthly.rename({'Two-Bedroom':'two_bed_fmr'}, axis=1)

# zip codes as strings (through int first)
zip_as_int = df_monthly['zip_code'].astype(int)
df_monthly['zip_code'] = zip_as_int.astype(str)

In [177]:
# same zip_code conversion as the previous cell: cast to int, then to str
df_monthly['zip_code'] = df_monthly['zip_code'].astype(int).astype(str)

In [186]:
# build the cleaned HUD zip code frame with clean_hud_zipcode_data
df = clean_hud_zipcode_data()

In [180]:
# display df
df

Unnamed: 0,zip_code,two_bed_fmr
2017-10,78002,880.0
2017-11,78002,880.0
2017-12,78002,880.0
2018-01,78002,880.0
2018-02,78002,880.0
...,...,...
2023-02,78285,1270.0
2023-03,78285,1270.0
2023-04,78285,1270.0
2023-05,78285,1270.0


In [3]:
# NOTE(review): get_entity_data is not defined in any visible cell - confirm its source
get_entity_data()

Unnamed: 0_level_0,zip_code,Efficiency,One-Bedroom,Two-Bedroom,Three-Bedroom,Four-Bedroom,year,entity_id,area_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-01,78002,570.0,710.0,880.0,1170.0,1420.0,2018,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2017-11-01,78002,570.0,710.0,880.0,1170.0,1420.0,2018,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2017-12-01,78002,570.0,710.0,880.0,1170.0,1420.0,2018,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2018-01-01,78002,570.0,710.0,880.0,1170.0,1420.0,2018,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2018-02-01,78002,570.0,710.0,880.0,1170.0,1420.0,2018,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
...,...,...,...,...,...,...,...,...,...
2023-06-01,78285,910.0,1050.0,1270.0,1620.0,1970.0,2023,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2023-07-01,78285,910.0,1050.0,1270.0,1620.0,1970.0,2023,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2023-08-01,78285,910.0,1050.0,1270.0,1620.0,1970.0,2023,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"
2023-09-01,78285,910.0,1050.0,1270.0,1620.0,1970.0,2023,METRO41700M41700,"San Antonio-New Braunfels, TX HUD Metro FMR Area"


In [237]:
# load and display the trimmed zcta GeoDataFrame
clean_zcta_gdf()

Unnamed: 0,zip_code,geometry
0,78006,"POLYGON ((-98.91927 29.85101, -98.91909 29.851..."
1,78010,"POLYGON ((-99.17946 29.90463, -99.17946 29.904..."
2,78163,"POLYGON ((-98.61155 29.74349, -98.61154 29.744..."
3,78240,"POLYGON ((-98.65198 29.53973, -98.65188 29.539..."
4,78224,"POLYGON ((-98.57500 29.31152, -98.57490 29.311..."
...,...,...
107,78670,"POLYGON ((-97.87357 29.76387, -97.87326 29.764..."
108,78850,"POLYGON ((-99.41386 29.09301, -99.41386 29.093..."
109,78885,"POLYGON ((-99.60332 29.74289, -99.60304 29.750..."
110,78676,"POLYGON ((-98.32806 30.07042, -98.32803 30.070..."


In [239]:
# columns of interest
cols = ['zip_code', 'affordability', 'num_properties']

# wrangled zip code data and the zcta shapes
df = wrangle_zipcode_data()
gdf = clean_zcta_gdf()

# rows for 2023-05, restricted to the columns of interest
df = df.loc['2023-05'][cols]

# attach the shapes on zip_code, index by zip_code and wrap as a GeoDataFrame
gdf = geopandas.GeoDataFrame(
    df.merge(gdf, how='inner', on='zip_code').set_index('zip_code')
)

gdf

Unnamed: 0_level_0,affordability,num_properties,geometry
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78148,2,17,"POLYGON ((-98.33303 29.56820, -98.33140 29.569..."
78257,1,14,"POLYGON ((-98.65716 29.70703, -98.65699 29.707..."
78154,2,11,"POLYGON ((-98.34173 29.58579, -98.33631 29.589..."
78152,3,2,"POLYGON ((-98.27022 29.43175, -98.26780 29.431..."
78124,1,5,"POLYGON ((-98.21084 29.60410, -98.20624 29.606..."
...,...,...,...
78606,0,1,"POLYGON ((-98.71107 30.11579, -98.70896 30.116..."
78010,0,1,"POLYGON ((-99.17946 29.90463, -99.17946 29.904..."
78261,2,4,"POLYGON ((-98.45205 29.69366, -98.45200 29.694..."
78263,3,2,"POLYGON ((-98.36082 29.38907, -98.36082 29.390..."


In [244]:
def wrangle_gdf():
    '''
    Actions:
        Loads the wrangled zip code data and the zcta shapes
        Keeps zip_code, affordability and num_properties for 2023-05
        Inner-merges them with the shapes on zip_code and indexes by zip_code
    Returns: a geodataframe with affordability information for 2023-05
    Module: 
        import geopandas
        import pandas as pd
    '''
    zip_df = wrangle_zipcode_data()
    shapes = clean_zcta_gdf()

    # columns of interest
    wanted = ['zip_code', 'affordability', 'num_properties']

    # rows for the month of interest only
    month_rows = zip_df.loc['2023-05'][wanted]

    # attach the geometry and index by zip code
    merged = month_rows.merge(shapes, how='inner', on='zip_code').set_index('zip_code')

    return geopandas.GeoDataFrame(merged)

In [242]:
# build and display the affordability GeoDataFrame
wrangle_gdf()

Unnamed: 0_level_0,affordability,num_properties,geometry
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78148,2,17,"POLYGON ((-98.33303 29.56820, -98.33140 29.569..."
78257,1,14,"POLYGON ((-98.65716 29.70703, -98.65699 29.707..."
78154,2,11,"POLYGON ((-98.34173 29.58579, -98.33631 29.589..."
78152,3,2,"POLYGON ((-98.27022 29.43175, -98.26780 29.431..."
78124,1,5,"POLYGON ((-98.21084 29.60410, -98.20624 29.606..."
...,...,...,...
78606,0,1,"POLYGON ((-98.71107 30.11579, -98.70896 30.116..."
78010,0,1,"POLYGON ((-99.17946 29.90463, -99.17946 29.904..."
78261,2,4,"POLYGON ((-98.45205 29.69366, -98.45200 29.694..."
78263,3,2,"POLYGON ((-98.36082 29.38907, -98.36082 29.390..."


In [2]:



# Creating a function for it 
def wrangle_data():
    '''
    Arguments: none
    Actions:
        Gets both data sets
        Changes the fmr index to datetime
        Outer-merges both data sets on their indexes
        Fills missing fmr values with 1286
        Drops the rows that still contain nulls
        Adds the difference and the percent difference (in terms of fmr) between mmr and fmr
        Splits the rows by position into train / validate / test
    Returns: (df, train, validate, test) where test is the last 12 rows,
        validate the 24 rows before that and train everything earlier
    Modules:
        import pandas as pd
        from prepare_hud_aggregate import get_hud_macro_data
        from wrangle_HUDpro_amr_data import get_sanant_amr_data
    Notes: csv's required for hud and sanantonio market rent data must be in the same folder as this function for it to work 
    '''
    # get data
    # NOTE(review): `get` is not defined in any visible cell; presumably this
    # was meant to be get_hud_macro_data() from the module list above - confirm
    hud = get()
    amr = get_sanant_amr_data()

    # converting the hud index datetime
    hud.index = pd.to_datetime(hud.index)

    # creating merged df
    df = pd.merge(left=hud, right=amr, how='outer', right_index=True, left_index=True)

    # filling in missing fmr values; assigned back instead of using
    # inplace=True on the column, which does not update df under Copy-on-Write
    # NOTE(review): 1286 is hard-coded; presumably the latest published fmr - confirm
    df['fmr'] = df['fmr'].fillna(1286)

    # dropping the null values that are not necessary
    df = df.dropna()

    # creating the difference 
    df['diff'] = df['mmr'] - df['fmr']

    # creating the percent difference in terms of fmr
    df['percent_diff'] = (df['mmr'] - df['fmr']) / df['fmr']

    # positional split: last 12 rows (test), previous 24 rows (validate), the rest (train)
    test = df[-12:]
    validate = df[-36:-12]
    train = df[:-36]

    # return the merged df and the train, validate and test splits
    return df, train, validate, test



def wrangle_micro_data(filename):
    '''
    This function combines the rapid_api_mvp_prep output and the
    get_hud_micro_data output. First it makes the rapid api column
    names pythonic (matched by position), then inner-merges the two
    on date and zip_code and adds columns for the absolute difference
    between AMR and FMR, the percent difference between AMR and FMR
    in terms of FMR, three affordability flags and an affordability
    score counting the flags that are True.
    
    Arguments: the filename of the rapid api data .csv as a string.
    Returns: a dataframe of ZIP Code level market and FMR data.
    '''
    # prepped rapid api data
    rapid = rapid_api_mvp_prep(filename)

    # more pythonic names, paired with the existing columns in order
    pythonic = ['bedrooms', 'average_rent', 'min_rent',
            'max_rent', 'num_properties', 'zip_code']
    rename_map = dict(zip(rapid.columns.to_list(), pythonic))
    rapid = rapid.rename(columns = rename_map)

    # hud zip code level data
    hud = get_hud_micro_data()

    # keep only the date / zip code pairs present in both
    merged = rapid.merge(hud, how='inner', on=['date', 'zip_code'])

    # gap between average market rent and FMR, absolute and as a percent of FMR
    gap = merged.average_rent - merged.two_bed_fmr
    merged['diff'] = gap
    merged['percent_diff'] = (gap / merged.two_bed_fmr) * 100

    # a rent level is flagged affordable when it does not exceed the FMR
    flag_sources = (
        ('afford_min', 'min_rent'),
        ('afford_avg', 'average_rent'),
        ('afford_max', 'max_rent'),
    )
    for flag, rent_col in flag_sources:
        merged[flag] = merged[rent_col] - merged['two_bed_fmr'] <= 0

    # score = how many of the three flags are True
    merged['affordability'] = merged[['afford_min', 'afford_max', 'afford_avg']].sum(axis=1)

    return merged

### Additional Wrangles ###
def wrangle_zipcode_gdf(filename = 'zcta_of_interest.shp'):
    '''
    Reads the zip code tabulation area shapefile into a GeoDataFrame,
    renames 'ZCTA5CE20' to 'zip_code' so it can be merged with other
    DataFrames, and drops the columns that are not needed.
    '''
    # columns that are not used downstream
    unused = ['GEOID20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20',
           'AWATER20', 'INTPTLAT20', 'INTPTLON20']

    shapes = geopandas.read_file(filename)

    # rename for merge, then trim
    shapes = shapes.rename({'ZCTA5CE20':'zip_code'}, axis=1)
    return shapes.drop(unused, axis=1)

def wrangle_affordability_gdf(df = None, date = '2023-05', columns = None):
    '''
    Arguments:
        df: the wrangled zip code DataFrame with a date index; when omitted it
            is built with wrangle_micro_data('rapid_api.csv') at call time
        date: a date in this format 'YYYY-MM'
        columns: the columns to keep (must include 'zip_code'); defaults to
            ['zip_code', 'affordability', 'num_properties']
    Actions:
        Selects the rows of df for the given date and the requested columns
        Inner-merges them with the zip code shapes on zip_code and indexes by zip_code
    Returns: a GeoDataFrame with the requested columns and the geometry of each zip code
    Module: 
        import geopandas
    '''
    # defaults are resolved here, not in the signature, so defining the
    # function does not load any data and no list is shared between calls
    if df is None:
        df = wrangle_micro_data('rapid_api.csv')
    if columns is None:
        columns = ['zip_code', 'affordability', 'num_properties']

    # get zipcode tab data
    gdf = wrangle_zipcode_gdf()

    # get the dates of interest
    afford_df = df.loc[date][columns]

    # merging data
    afford_gdf = afford_df.merge(gdf, how='inner', on='zip_code')

    # setting index to zipcode
    afford_gdf = afford_gdf.set_index('zip_code')

    # set to gdf
    afford_gdf = geopandas.GeoDataFrame(afford_gdf)

    # return the frame built above (the previous name here was never defined)
    return afford_gdf
    