In [250]:
import pandas as pd

# Read in the data
df1 = pd.read_csv('sam-resources/Metro_median_sale_price_uc_sfrcondo_month_imputed.csv')
dfog = pd.read_csv('sam-resources/census_data_2009-2022ksada.csv')
df2 = pd.read_csv('sam-resources/median_income_2009-2022.csv')

# Rename the 'RegionName' column to 'Name'
df1 = df1.rename(columns={'RegionName': 'Name'})
#Rename the 'City' column to 'Name'
df2 = df2.rename(columns={'City': 'Name'})
# Remove all rows in df2 that do not contain 'city' in the 'Name' column
df2 = df2[df2['Name'].str.contains('city')]
df2.head()
# Add a column to df2 that contains the state abbreviation
df2['State'] = df2['Name'].str.split(',').str[1]
# Remove 'city' from the values in the 'Name' column of df2
df2['Name'] = df2['Name'].str.split(' city').str[0]
# Remove the state abbreviation from the values in the 'Name' column of df1
df1['Name'] = df1['Name'].str.split(',').str[0]

# Remove all leading and trailing whitespace from the values in the 'Name' column of df1 and df2
df1['Name'] = df1['Name'].str.strip()
df2['Name'] = df2['Name'].str.strip()

# Rename the 'StateName' column in df1 to 'State'
df1 = df1.rename(columns={'StateName': 'State'})

# Remove the leading and trailing whitespace from the values in the 'State' Column of df1 and df2
df1['State'] = df1['State'].str.strip()
df2['State'] = df2['State'].str.strip()

# Filter df2 to only include the rows where the value in the 'Name' column is in the 'Name' column of df1
df2 = df2[df2['Name'].isin(df1['Name'])]

# Convert df1 from wide to long format
df1_long = pd.melt(df1, id_vars=['Name', 'RegionType', 'State', 'RegionID', 'SizeRank'], var_name='Date', value_name='MedianSalePrice')


# Remove the RegionType, RegionID, and SizeRank columns from df1
df1_long = df1_long.drop(columns=['RegionType', 'RegionID', 'SizeRank'])


# Add a column to df1 that contains the year extracted from the 'Date' column
df1_long['Year'] = df1_long['Date'].str.split('-').str[0]


# Convert 'Year' to int64 in both dataframes
df1_long['Year'] = df1_long['Year'].astype('int64')
df2['Year'] = df2['Year'].astype('int64')

# Now merge
df = pd.merge(df1_long, df2, on=['Name', 'State', 'Year'])


# Load the interest rate data
df3 = pd.read_csv('sam-resources/MORTGAGE30US.csv')

# Rename the 'DATE' column to 'Date'
df3 = df3.rename(columns={'DATE': 'Date'})


# Merge df and df3 on the 'Date' column
df = pd.merge(df, df3, on='Date')



In [251]:
def calculate_housing_affordability_index(med_price, interest_rate, med_income):
    """
    Calculate the Housing Affordability Index (HAI)

    :param med_price: Median price of existing single-family home sale
    :param interest_rate: Interest rate (annual)
    :param med_income: Median family income
    :return: Housing Affordability Index
    """
    # Convert annual interest rate to monthly and calculate the monthly payment (PMT)
    monthly_interest_rate = interest_rate / 12 / 100
    pmt = med_price * 0.8 * (monthly_interest_rate) / (1 - (1 / ((1 + monthly_interest_rate)**360)))

    # Calculate Qualifying Income (QINC)
    qualifying_income = pmt * 4 * 12

    # Calculate Housing Affordability Index
    hai = (med_income / qualifying_income) * 100

    return hai

# Example usage
example_med_price = 300000  # Example median price
example_interest_rate = 0.04  # Example interest rate (4%)
example_med_income = 60000  # Example median income

housing_affordability_index = calculate_housing_affordability_index(example_med_price, example_interest_rate, example_med_income)
print(housing_affordability_index)


186.37639887651713


In [252]:
# Apply the calculate_housing_affordability_index function to the df DataFrame
df['HAI'] = calculate_housing_affordability_index(df['MedianSalePrice'], df['MORTGAGE30US'], df['Median Income'])
df.head()


Unnamed: 0,Name,State,Date,MedianSalePrice,Year,Median Income,MORTGAGE30US,HAI
0,New York,NY,2009-04-30,339000.0,2009,50173.0,4.78,73.630547
1,Los Angeles,CA,2009-04-30,320000.0,2009,48570.0,4.78,75.510228
2,Chicago,IL,2009-04-30,192000.0,2009,46781.0,4.78,121.214878
3,Dallas,TX,2009-04-30,140000.0,2009,41266.0,4.78,146.639845
4,Houston,TX,2009-04-30,145000.0,2009,42797.0,4.78,146.836147


In [253]:
df.to_csv('sam-resources/data.csv', index=False)

In [254]:
dfog.head()

Unnamed: 0,City,Total Population,Total Labor Force,Unemployed Labor Force,Median Income,Year
0,"Pine Flat CDP, CA",114,22.0,4.0,15724.0,2011
1,"Pine Grove CDP, CA",2573,1098.0,64.0,52917.0,2011
2,"Pine Mountain Lake CDP, CA",2695,942.0,100.0,54200.0,2011
3,"Piñon Hills CDP, CA",6130,2510.0,503.0,38140.0,2011
4,"Pioneer CDP, CA",1226,621.0,109.0,42917.0,2011
