In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the cleaned FRED mortgage table; it is merged onto every city's Zillow data further down.
fred_csv_path = '../Data/FRED/clean_mortgage_data_month.csv'
fred_df = pd.read_csv(filepath_or_buffer=fred_csv_path)

In [3]:
# Folder holding the cleaned Zillow CSVs (trailing slash required: file names are appended directly).
base_read_path = '../Data/Zillow/clean/'

# Zillow file name -> name of the column its values get in the merged per-city output.
# Insertion order matters: it determines the column order of the merged frames.
zillow_file_name_to_col_name_map: dict[str, str] = {
    # Market conditions
    'city_market_temp_index_month.csv': 'MarketTempIdx',
    'city_mean_days_to_close_month.csv': 'DaysToClose',
    'city_mean_doz_pending_month.csv': 'DaysOnZillow',
    # Listing price cuts
    'city_mean_listings_price_cut_amt_month.csv': 'ListingPriceCutAmt',
    'city_perc_listings_price_cut_month.csv': 'ListingPriceCutPct',
    # New construction
    'city_new_con_mean_sale_price_month.csv': 'NewConstructMeanSalePrice',
    'city_new_con_sales_count_raw_month.csv': 'NewConstructSalesCount',
    # Overall sales
    'city_sales_count_now_month.csv': 'SalesCount',
    'city_sales_price_month.csv': 'SalePrice',
}

In [4]:
# Load every Zillow CSV once, keyed by the column name its values will carry in the merged output.
df_dict: dict[str, pd.DataFrame] = {
    col_name: pd.read_csv(base_read_path + file_name)
    for file_name, col_name in zillow_file_name_to_col_name_map.items()
}

In [5]:
# NOTE: Buffalo, NY doesn't have data in all of the Zillow files, so CHARLOTTE is used instead.
# Limit our scope of cities to Denver, Charlotte, Cincinnati, Atlanta, Indianapolis, and Providence.
selected_city_names = ['DENVER', 'CHARLOTTE', 'CINCINNATI', 'ATLANTA', 'INDIANAPOLIS', 'PROVIDENCE']


def _extract_city_metric(metric_df: pd.DataFrame, city_name: str, col_name: str) -> pd.DataFrame:
    """Reshape one city's row of a wide Zillow table into a two-column long frame.

    Parameters
    ----------
    metric_df : wide Zillow frame containing 'CityName' and 'StateName' columns;
        every other column header becomes a DATE value (presumably month labels --
        verify against the cleaned CSVs).
    city_name : value to match exactly against the 'CityName' column.
    col_name : name given to the value column of the result.

    Returns
    -------
    DataFrame with columns ['DATE', col_name], one row per non-identifier column
    of ``metric_df``.

    Raises
    ------
    ValueError
        If ``city_name`` matches zero rows or more than one row. Without this check
        the column assignment below fails with an opaque "Length mismatch" error.
    """
    city_rows = metric_df.loc[metric_df['CityName'] == city_name]
    if len(city_rows) != 1:
        raise ValueError(
            f"Expected exactly one row for city {city_name!r} in the {col_name!r} data, "
            f"found {len(city_rows)}; the match is on CityName only, so a missing city or "
            f"the same city name in several states cannot be reshaped."
        )
    long_df = (
        city_rows
        .drop(columns=['CityName', 'StateName'])
        .transpose()        # headers become the index, the single row becomes the single column
        .reset_index()      # move the former headers into a regular column
    )
    long_df.columns = ['DATE', col_name]
    return long_df


# For each selected city, one long-format frame per Zillow metric, in df_dict order.
city_df_dict: dict[str, list[pd.DataFrame]] = {
    city_name: [
        _extract_city_metric(df, city_name, col_name)
        for col_name, df in df_dict.items()
    ]
    for city_name in selected_city_names
}

In [6]:
# Combine each city's per-metric frames into a single frame with one column per metric,
# then attach the FRED mortgage columns. Every join is on DATE using the default inner
# join, so only dates present in all inputs are kept.
city_merged_df_dict: dict[str, pd.DataFrame] = {}
for city_name in selected_city_names:
    metric_frames = city_df_dict[city_name]
    combined = metric_frames[0]
    for metric_frame in metric_frames[1:]:
        combined = combined.merge(metric_frame, on='DATE')
    # Add the FRED mortgage data
    city_merged_df_dict[city_name] = combined.merge(fred_df, on='DATE')

In [7]:
# Sanity check for Denver: display the fully merged frame (Zillow metrics + FRED columns) for one city.
# NOTE(review): this rebinds the notebook-level name `df`, which other cells also use as a loop variable.
df = city_merged_df_dict['DENVER']
df


Unnamed: 0,DATE,MarketTempIdx,DaysToClose,DaysOnZillow,ListingPriceCutAmt,ListingPriceCutPct,NewConstructMeanSalePrice,NewConstructSalesCount,SalesCount,SalePrice,MORTGAGE15US,MORTGAGE30US,RATE_DIFFERENCE
0,2018-01,74.0,34.0,28.0,16733.849533,0.133225,537706.730205,682.0,3169.0,397708.149372,3.482,4.033,0.550
1,2018-02,72.0,31.0,22.0,16423.930620,0.130210,545468.537291,657.0,3395.0,400872.926042,3.785,4.330,0.545
2,2018-03,71.0,30.0,19.0,16012.326935,0.153830,549485.507056,992.0,4766.0,403904.017509,3.910,4.444,0.534
3,2018-04,70.0,31.0,16.0,16674.160324,0.178077,555329.230501,859.0,5045.0,406571.265108,3.925,4.468,0.543
4,2018-05,68.0,32.0,17.0,15617.310131,0.220831,543572.326351,907.0,5812.0,409135.348940,4.066,4.586,0.520
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,2024-02,74.0,28.0,35.0,17228.124175,0.215376,749677.646739,368.0,2767.0,588450.438544,6.102,6.776,0.674
74,2024-03,72.0,28.0,26.0,18829.755065,0.240464,745504.104784,439.0,3327.0,588917.680727,6.175,6.820,0.645
75,2024-04,68.0,28.0,21.0,19466.333021,0.287527,841958.089514,391.0,3660.0,590109.794947,6.263,6.992,0.730
76,2024-05,62.0,28.0,23.0,18533.910679,0.328747,813332.596257,374.0,4050.0,590809.175996,6.346,7.060,0.714


In [8]:
# Save merged dataframes to csv, one file per city (e.g. 'DENVER' -> 'Denver.csv').
# DataFrame.to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise this cell fails on a checkout without ../Data/Clean.
output_dir = Path('../Data/Clean')
output_dir.mkdir(parents=True, exist_ok=True)
for city_name, df in city_merged_df_dict.items():
    # index=False: the positional row index carries no information worth persisting
    df.to_csv(output_dir / f'{city_name.capitalize()}.csv', index=False)
