In [49]:
import pandas as pd

# Read in the data
df1 = pd.read_csv('sam-resources/Metro_median_sale_price_uc_sfrcondo_month_imputed.csv')
dfog = pd.read_csv('sam-resources/census_data_2009-2022ksada.csv')
df2 = pd.read_csv('sam-resources/median_income_2009-2022.csv')
df4 = pd.read_csv('sam-resources/Median_list_Price_imputed.csv')
df5 = pd.read_csv('sam-resources/ExistingHomeSales.csv')



FileNotFoundError: [Errno 2] No such file or directory: 'sam-resources/ExistingHomeSales.csv'

In [None]:
def merge_census_data(df1, df2):
    """
    This function merges the census data with the median income data

    :param df1: The dataframe with the census data
    :param df2: The dataframe with the median income data
    :return: The merged dataframe
    """
    # Merge dfog and df2 on the 'City' and 'Year' columns and call the new dataframe census_data_df
    census_data_df = pd.merge(df1, df2, on=['City', 'Year', 'Median Income'], how='outer')
    # # Fill NaN values with the median income from the other column
    # census_data_df['Median Income'] = census_data_df['Median Income_x'].fillna(census_data_df['Median Income_y'])
    # Drop the columns that are not needed
    # census_data_df = census_data_df.drop(columns=['Median Income_x', 'Median Income_y'])
    return census_data_df

df2 = merge_census_data(dfog, df2)
df2.head()

Unnamed: 0,City,Total Population,Total Labor Force,Unemployed Labor Force,Median Income,Year
0,"Pine Flat CDP, CA",114.0,22.0,4.0,15724.0,2011
1,"Pine Grove CDP, CA",2573.0,1098.0,64.0,52917.0,2011
2,"Pine Mountain Lake CDP, CA",2695.0,942.0,100.0,54200.0,2011
3,"Piñon Hills CDP, CA",6130.0,2510.0,503.0,38140.0,2011
4,"Pioneer CDP, CA",1226.0,621.0,109.0,42917.0,2011


In [None]:
def calculate_housing_affordability_index(med_price, interest_rate, med_income):
    """
    Calculate the Housing Affordability Index (HAI)

    :param med_price: Median price of existing single-family home sale
    :param interest_rate: Interest rate (annual)
    :param med_income: Median family income
    :return: Housing Affordability Index
    """
    # Convert annual interest rate to monthly and calculate the monthly payment (PMT)
    monthly_interest_rate = interest_rate / 12 / 100
    pmt = med_price * 0.8 * (monthly_interest_rate) / (1 - (1 / ((1 + monthly_interest_rate)**360)))

    # Calculate Qualifying Income (QINC)
    qualifying_income = pmt * 4 * 12

    # Calculate Housing Affordability Index
    hai = (med_income / qualifying_income) * 100

    return hai

# Example usage
example_med_price = 300000  # Example median price
example_interest_rate = 0.04  # Example interest rate (4%)
example_med_income = 60000  # Example median income

housing_affordability_index = calculate_housing_affordability_index(example_med_price, example_interest_rate, example_med_income)
print(housing_affordability_index)


186.37639887651713


# Data Cleaning

In [None]:
def clean_zillow_data(zdf, value_name='MedianSalePrice'):
    """
    Clean the Zillow data

    :param zdf: Zillow dataframe
    :return: Cleaned Zillow dataframe
    """
    # Rename the 'RegionName' column to 'Name'
    zdf = zdf.rename(columns={'RegionName': 'Name'})
    # Remove the state abbreviation from the values in the 'Name' column of zdf
    zdf['Name'] = zdf['Name'].str.split(',').str[0]
    # Rename the 'StateName' column in zdf to 'State'
    zdf = zdf.rename(columns={'StateName': 'State'})
    # Get a list of the column names in zdf that do not contain '-' or '/'
    id_vars = [col for col in zdf.columns if ('-' not in col) and ('/' not in col)]
    # Convert zdf from wide to long format
    zdf = pd.melt(zdf, id_vars = id_vars, var_name='Date', value_name = value_name)
    cols = ['Date', 'Name', 'State', value_name]
    # Keep only the columns specified in cols
    zdf = zdf[cols]
    # Add a column to zdf that contains the year extracted from the 'Date' column
    if '-' in zdf['Date'][0]:
        zdf['Year'] = zdf['Date'].str.split('-').str[0]
    elif '/' in zdf['Date'][0]:
        zdf['Year'] = zdf['Date'].str.split('/').str[-1]
    # Remove the leading and trailing whitespace from the values in the 'State' Column of zdf
    zdf['State'] = zdf['State'].str.strip()
    # Remove all leading and trailing whitespace from the values in the 'Name' column of zdf
    zdf['Name'] = zdf['Name'].str.strip()
    # Convert 'Date' to datetime in zdf
    zdf['Date'] = pd.to_datetime(zdf['Date'])
    # Convert 'Year' to int64 in zdf
    zdf['Year'] = zdf['Year'].astype('int64')
    # Return the cleaned zillow dataframe
    return zdf

df1 = clean_zillow_data(df1, 'MedianSalePrice')
df4 = clean_zillow_data(df4, 'MedianListPrice')
df5 = clean_zillow_data(df5, 'SalesTotal')
df1.head()

Unnamed: 0,Date,Name,State,MedianSalePrice,Year
0,2008-02-29,United States,,174000.0,2008
1,2008-02-29,New York,NY,400000.0,2008
2,2008-02-29,Los Angeles,CA,470000.0,2008
3,2008-02-29,Chicago,IL,224000.0,2008
4,2008-02-29,Dallas,TX,138000.0,2008


In [None]:
def clean_census_data(cdf, zdf):
    """
    Clean the Census data

    :param cdf: Census dataframe
    :param zdf: Zillow dataframe
    :return: Cleaned Census dataframe
    """
    # Add a column to cdf that contains the state abbreviation extracted from the 'City' column
    cdf['State'] = cdf['City'].str.split(',').str[1]
    # Rename the 'City' column to 'Name'
    cdf = cdf.rename(columns={'City': 'Name'})
    # Remove the state abbreviation from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.split(',').str[0]
    # Remove all rows in cdf that do not contain 'city' in the 'Name' column
    cdf = cdf[cdf['Name'].str.contains('city')]
    # Remove 'city' from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.split('city').str[0]
    # Remove all leading and trailing whitespace from the values in the 'Name' column of cdf
    cdf['Name'] = cdf['Name'].str.strip()
    # Remove the leading and trailing whitespace from the values in the 'State' column of cdf
    cdf['State'] = cdf['State'].str.strip()
    # Convert 'Year' to int64 in cdf
    cdf['Year'] = cdf['Year'].astype('int64')
    # Filter cdf to only include rows where the 'Name' column is in the 'Name' column of zdf
    cdf = cdf[cdf['Name'].isin(zdf['Name'])]
    # Return the cleaned census dataframe
    return cdf



df2 = clean_census_data(df2, df1)
df2.head()

Unnamed: 0,Name,Total Population,Total Labor Force,Unemployed Labor Force,Median Income,Year,State
233,Barnstable Town,45486.0,24276.0,1662.0,62191.0,2011,MA
235,Little Rock,461.0,241.0,11.0,41250.0,2011,IA
252,Hartford,669.0,319.0,12.0,49453.0,2011,IA
290,Hammond,19926.0,10069.0,893.0,33544.0,2011,LA
297,Baton Rouge,229169.0,118979.0,10975.0,37381.0,2011,LA


In [None]:
df5.head()

Unnamed: 0,Date,Name,State,SalesTotal,Year
0,2008-02-29,United States,,205206.0,2008
1,2008-02-29,New York,NY,8591.0,2008
2,2008-02-29,Los Angeles,CA,4159.0,2008
3,2008-02-29,Chicago,IL,5933.0,2008
4,2008-02-29,Dallas,TX,5058.0,2008


In [None]:
def merge_zillow_data(df1, df2):
    """
    This function merges the zillow data with other zillow data

    :param df1: The dataframe with the zillow data
    :param df2: The dataframe with the other zillow data
    :return: The merged dataframe
    """
    # Merge df1 and df2 on the 'Date', 'Name' and 'State' columns and call the new dataframe zillow_data_df
    zillow_data_df = pd.merge(df1, df2, on=['Date', 'Name', 'State', 'Year'] , how = 'outer')
    return zillow_data_df

df1 = merge_zillow_data(df1, df4)
df1 = merge_zillow_data(df1, df5)
df1.head()

Unnamed: 0,Date,Name,State,MedianSalePrice,Year,MedianListPrice,SalesTotal
0,2008-02-29,United States,,174000.0,2008,,205206.0
1,2008-02-29,New York,NY,400000.0,2008,,8591.0
2,2008-02-29,Los Angeles,CA,470000.0,2008,,4159.0
3,2008-02-29,Chicago,IL,224000.0,2008,,5933.0
4,2008-02-29,Dallas,TX,138000.0,2008,,5058.0


## Data merging

In [None]:
# Now merge
df = pd.merge(df1, df2, on=['Name', 'State', 'Year'], how='outer')


# Load the interest rate data
df3 = pd.read_csv('sam-resources/MORTGAGE30US.csv')

# Rename the 'DATE' column to 'Date'
df3 = df3.rename(columns={'DATE': 'Date'})

# Convert 'Date' to datetime in df3
df3['Date'] = pd.to_datetime(df3['Date'])



# Merge df and df3 on the 'Date' column
df = pd.merge(df, df3, on='Date')


# Add
df['Unemployment Rate'] = (df['Unemployed Labor Force'] / df['Total Labor Force']) * 100 # Calculate the unemployment rate


In [None]:
# Apply the calculate_housing_affordability_index function to the df DataFrame
df['HAI'] = calculate_housing_affordability_index(df['MedianSalePrice'], df['MORTGAGE30US'], df['Median Income'])

In [None]:
df.to_csv('sam-resources/data.csv', index=False)

In [None]:
# Rename the 'Name' column to 'CityName'
df = df.rename(columns={'Name': 'CityName'})
# Rename the 'State' column to 'StateName' 
df = df.rename(columns={'State': 'StateName'})
df.head()

Unnamed: 0,Date,CityName,StateName,MedianSalePrice,Year,MedianListPrice,SalesTotal,Total Population,Total Labor Force,Unemployed Labor Force,Median Income,MORTGAGE30US,Unemployment Rate,HAI
0,2008-07-31,United States,,185000.0,2008,,308096.0,,,,,6.52,,
1,2008-07-31,New York,NY,410000.0,2008,,13753.0,,,,,6.52,,
2,2008-07-31,Los Angeles,CA,420000.0,2008,,8253.0,,,,,6.52,,
3,2008-07-31,Chicago,IL,242000.0,2008,,9407.0,,,,,6.52,,
4,2008-07-31,Dallas,TX,154500.0,2008,,6813.0,,,,,6.52,,
