In [237]:
import pandas as pd

# Read in the data
df1 = pd.read_csv('sam-resources/Metro_median_sale_price_uc_sfrcondo_month_imputed.csv')
dfog = pd.read_csv('sam-resources/census_data_2009-2022ksada.csv')
df2 = pd.read_csv('sam-resources/median_income_2009-2022.csv')
df4 = pd.read_csv('sam-resources/Median_list_Price_imputed.csv')




In [238]:
def merge_census_data(df1, df2):
    """
    This function merges the census data with the median income data

    :param df1: The dataframe with the census data
    :param df2: The dataframe with the median income data
    :return: The merged dataframe
    """
    # Merge dfog and df2 on the 'City' and 'Year' columns and call the new dataframe census_data_df
    census_data_df = pd.merge(df1, df2, on=['City', 'Year'] , how = 'outer')
    # Fill NaN values with the median income from the other column
    census_data_df['Median Income'] = census_data_df['Median Income_x'].fillna(census_data_df['Median Income_y'])
    # Drop the columns that are not needed
    census_data_df = census_data_df.drop(columns=['Median Income_x', 'Median Income_y'])
    return census_data_df

df2 = merge_census_data(dfog, df2)
df2.head()

Unnamed: 0,City,Total Population,Total Labor Force,Unemployed Labor Force,Year,Median Income
0,"Pine Flat CDP, CA",114.0,22.0,4.0,2011,15724.0
1,"Pine Grove CDP, CA",2573.0,1098.0,64.0,2011,52917.0
2,"Pine Mountain Lake CDP, CA",2695.0,942.0,100.0,2011,54200.0
3,"Piñon Hills CDP, CA",6130.0,2510.0,503.0,2011,38140.0
4,"Pioneer CDP, CA",1226.0,621.0,109.0,2011,42917.0


In [239]:
def calculate_housing_affordability_index(med_price, interest_rate, med_income):
    """
    Calculate the Housing Affordability Index (HAI)

    :param med_price: Median price of existing single-family home sale
    :param interest_rate: Interest rate (annual)
    :param med_income: Median family income
    :return: Housing Affordability Index
    """
    # Convert annual interest rate to monthly and calculate the monthly payment (PMT)
    monthly_interest_rate = interest_rate / 12 / 100
    pmt = med_price * 0.8 * (monthly_interest_rate) / (1 - (1 / ((1 + monthly_interest_rate)**360)))

    # Calculate Qualifying Income (QINC)
    qualifying_income = pmt * 4 * 12

    # Calculate Housing Affordability Index
    hai = (med_income / qualifying_income) * 100

    return hai

# Example usage
example_med_price = 300000  # Example median price
example_interest_rate = 0.04  # Example interest rate (4%)
example_med_income = 60000  # Example median income

housing_affordability_index = calculate_housing_affordability_index(example_med_price, example_interest_rate, example_med_income)
print(housing_affordability_index)


186.37639887651713


In [240]:
df1.head()

Unnamed: 0,RegionName,RegionType,StateName,RegionID,SizeRank,2008-02-29,2008-03-31,2008-04-30,2008-05-31,2008-06-30,...,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30,2023-10-31
0,United States,country,,102001.0,0.0,174000.0,179000.0,179900.0,180222.0,186000.0,...,300000.0,315000.0,325000.0,334552.0,345000.0,350000.0,350000.0,345900.0,335000.0,335000.0
1,"New York, NY",msa,NY,394913.0,1.0,400000.0,390000.0,395000.0,395000.0,400000.0,...,515000.0,500000.0,500000.0,540000.0,550000.0,575000.0,585000.0,600000.0,577500.0,560000.0
2,"Los Angeles, CA",msa,CA,753899.0,2.0,470000.0,455000.0,458500.0,440000.0,435000.0,...,805000.0,825000.0,845000.0,850000.0,860000.0,885000.0,890000.0,895000.0,885000.0,894000.0
3,"Chicago, IL",msa,IL,394463.0,3.0,224000.0,230000.0,228000.0,235000.0,242000.0,...,260000.0,264000.0,284900.0,295000.0,310000.0,322000.0,315000.0,310000.0,300000.0,290000.0
4,"Dallas, TX",msa,TX,394514.0,4.0,138000.0,146000.0,144950.0,150000.0,155460.0,...,350000.0,365894.0,370000.0,380469.0,390000.0,400000.0,395000.0,386567.0,378900.0,375000.0


# Data Cleaning

In [241]:
def clean_zillow_data(zdf, value_name='MedianSalePrice'):
    """
    Clean the Zillow data

    :param zdf: Zillow dataframe
    :return: Cleaned Zillow dataframe
    """
    # Rename the 'RegionName' column to 'Name'
    zdf = zdf.rename(columns={'RegionName': 'Name'})
    # Remove the state abbreviation from the values in the 'Name' column of zdf
    zdf['Name'] = zdf['Name'].str.split(',').str[0]
    # Rename the 'StateName' column in zdf to 'State'
    zdf = zdf.rename(columns={'StateName': 'State'})
    # Get a list of the column names in zdf that do not contain '-' or '/'
    id_vars = [col for col in zdf.columns if ('-' not in col) and ('/' not in col)]
    print(id_vars)
    # Convert zdf from wide to long format
    zdf = pd.melt(zdf, id_vars = id_vars, var_name='Date', value_name = value_name)
    cols = ['Date', 'Name', 'State', value_name]
    # Keep only the columns specified in cols
    zdf = zdf[cols]
    # Add a column to zdf that contains the year extracted from the 'Date' column
    if '-' in zdf['Date'][0]:
        zdf['Year'] = zdf['Date'].str.split('-').str[0]
    elif '/' in zdf['Date'][0]:
        zdf['Year'] = zdf['Date'].str.split('/').str[-1]
    # Remove the leading and trailing whitespace from the values in the 'State' Column of zdf
    zdf['State'] = zdf['State'].str.strip()
    # Remove all leading and trailing whitespace from the values in the 'Name' column of zdf
    zdf['Name'] = zdf['Name'].str.strip()
    # Convert 'Date' to datetime in zdf
    zdf['Date'] = pd.to_datetime(zdf['Date'])
    # Convert 'Year' to int64 in zdf
    zdf['Year'] = zdf['Year'].astype('int64')
    # Return the cleaned zillow dataframe
    return zdf

def clean_census_data(cdf, zdf):
    """
    Clean the Census data

    :param cdf: Census dataframe
    :param zdf: Zillow dataframe
    :return: Cleaned Census dataframe
    """
    # Add a column to cdf that contains the state abbreviation extracted from the 'City' column
    cdf['State'] = cdf['City'].str.split(',').str[1]
    # Rename the 'City' column to 'Name'
    cdf = cdf.rename(columns={'City': 'Name'})
    # Remove the state abbreviation from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.split(',').str[0]
    # Remove all rows in cdf that do not contain 'city' in the 'Name' column
    cdf = cdf[cdf['Name'].str.contains('city')]
    # Remove 'city' from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.split('city').str[0]
    # Remove all leading and trailing whitespace from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.strip()
    # Remove the leading and trailing whitespace from the values in the 'State' column of cdf
    cdf['State'] = cdf['State'].str.strip()
    # Convert 'Year' to int64 in cdf
    cdf['Year'] = cdf['Year'].astype('int64')
    # Filter cdf to only include rows where the 'Name' column is in the 'Name' column of zdf
    cdf = cdf[cdf['Name'].isin(zdf['Name'])]
    # Return the cleaned census dataframe
    return cdf


df1 = clean_zillow_data(df1, 'MedianSalePrice')
df2 = clean_census_data(df2, df1)
df4 = clean_zillow_data(df4, 'MedianListPrice')

['Name', 'RegionType', 'State', 'RegionID', 'SizeRank']
['Name', 'State', 'RegionID', 'SizeRank']


In [242]:
df4.head()

Unnamed: 0,Date,Name,State,MedianListPrice,Year
0,2018-03-31,New York,NY,503000.0,2018
1,2018-03-31,Los Angeles,CA,721333.0,2018
2,2018-03-31,Chicago,IL,284600.0,2018
3,2018-03-31,Dallas,TX,322997.0,2018
4,2018-03-31,Houston,TX,294467.0,2018


In [243]:
df1.head()

Unnamed: 0,Date,Name,State,MedianSalePrice,Year
0,2008-02-29,United States,,174000.0,2008
1,2008-02-29,New York,NY,400000.0,2008
2,2008-02-29,Los Angeles,CA,470000.0,2008
3,2008-02-29,Chicago,IL,224000.0,2008
4,2008-02-29,Dallas,TX,138000.0,2008


In [244]:
df1.columns

Index(['Date', 'Name', 'State', 'MedianSalePrice', 'Year'], dtype='object')

In [248]:
def merge_zillow_data(df1, df2):
    """
    This function merges the zillow data with other zillow data

    :param df1: The dataframe with the zillow data
    :param df2: The dataframe with the other zillow data
    :return: The merged dataframe
    """
    # Merge df1 and df2 on the 'Date', 'Name' and 'State' columns and call the new dataframe zillow_data_df
    zillow_data_df = pd.merge(df1, df2, on=['Date', 'Name', 'State', 'Year'] , how = 'outer')
    return zillow_data_df

df1 = merge_zillow_data(df1, df4)
df1.head()

Unnamed: 0,Date,Name,State,MedianSalePrice,Year,MedianListPrice
0,2008-02-29,United States,,174000.0,2008,
1,2008-02-29,New York,NY,400000.0,2008,
2,2008-02-29,Los Angeles,CA,470000.0,2008,
3,2008-02-29,Chicago,IL,224000.0,2008,
4,2008-02-29,Dallas,TX,138000.0,2008,


In [249]:
len(df1)

127575

## Data merging

In [247]:
# Now merge
df = pd.merge(df1, df2, on=['Name', 'State', 'Year'])


# Load the interest rate data
df3 = pd.read_csv('sam-resources/MORTGAGE30US.csv')

# Rename the 'DATE' column to 'Date'
df3 = df3.rename(columns={'DATE': 'Date'})


# Merge df and df3 on the 'Date' column
df = pd.merge(df, df3, on='Date')


# Add
df['Unemployment Rate'] = (df['Unemployed Labor Force'] / df['Total Labor Force']) * 100 # Calculate the unemployment rate


ValueError: You are trying to merge on datetime64[ns] and object columns for key 'Date'. If you wish to proceed you should use pd.concat

In [None]:
# Apply the calculate_housing_affordability_index function to the df DataFrame
df['HAI'] = calculate_housing_affordability_index(df['MedianSalePrice'], df['MORTGAGE30US'], df['Median Income'])



In [None]:
df.to_csv('sam-resources/data.csv', index=False)

In [None]:
df

Unnamed: 0,Name,State,Date,MedianSalePrice,Year,Total Population,Total Labor Force,Unemployed Labor Force,Median Income,MORTGAGE30US,Unemployment Rate,HAI
0,New York,NY,2009-04-30,339000.000000,2009,,,,50173.0,4.78,,73.630547
1,Los Angeles,CA,2009-04-30,320000.000000,2009,,,,48570.0,4.78,,75.510228
2,Chicago,IL,2009-04-30,192000.000000,2009,,,,46781.0,4.78,,121.214878
3,Dallas,TX,2009-04-30,140000.000000,2009,,,,41266.0,4.78,,146.639845
4,Houston,TX,2009-04-30,145000.000000,2009,,,,42797.0,4.78,,146.836147
...,...,...,...,...,...,...,...,...,...,...,...,...
16072,Union,SC,2022-06-30,307588.666667,2022,8099.0,3724.0,569.0,32548.0,5.70,15.279270,47.478286
16073,The Dalles,OR,2022-06-30,370924.000000,2022,15988.0,7745.0,545.0,59714.0,5.70,7.036798,72.232439
16074,Greensburg,IN,2022-06-30,199624.750000,2022,11431.0,6011.0,207.0,61864.0,5.70,3.443687,139.047974
16075,Altus,OK,2022-06-30,177258.600000,2022,18711.0,9449.0,396.0,56620.0,5.70,4.190920,143.318945
