In [172]:
import datetime
from pathlib import Path

import pandas as pd

# Raise pandas' display limits to 80 for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 80)

In [173]:
# Load the tab-separated input file, then show its (rows, columns) shape.
df_original = pd.read_csv(
    'data/redfin_county_data_2023-05.tsv',
    delimiter='\t',
)
df_original.shape

(39437, 11)

In [174]:
# Work on an independent (deep) copy so df_original keeps the data exactly as loaded.
df = df_original.copy(deep=True)
df

Unnamed: 0,month,county_name,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated
0,2022-01-01,Anchorage Borough,AK,All Residential,299.0,261.0,282.0,23.0,0.9,0.997268,2023-04-09 14:55:08
1,2022-02-01,Anchorage Borough,AK,All Residential,275.0,376.0,260.0,26.0,0.9,0.999285,2023-04-09 14:55:08
2,2022-03-01,Anchorage Borough,AK,All Residential,331.0,520.0,339.0,10.0,1.0,1.007284,2023-04-09 14:55:08
3,2022-04-01,Anchorage Borough,AK,All Residential,382.0,588.0,412.0,7.0,1.1,1.016928,2023-04-09 14:55:08
4,2022-05-01,Anchorage Borough,AK,All Residential,451.0,581.0,490.0,6.0,1.1,1.021409,2023-04-09 14:55:08
...,...,...,...,...,...,...,...,...,...,...,...
39432,2022-11-01,Morgan County,WV,All Residential,32.0,21.0,85.0,51.0,2.7,0.967970,2023-04-09 14:55:08
39433,2022-12-01,Morgan County,WV,All Residential,13.0,18.0,82.0,38.0,6.3,0.975503,2023-04-09 14:55:08
39434,2023-01-01,Morgan County,WV,All Residential,22.0,19.0,70.0,129.0,3.2,0.942904,2023-04-09 14:55:08
39435,2023-02-01,Morgan County,WV,All Residential,19.0,32.0,75.0,69.0,3.9,0.945075,2023-04-09 14:55:08


In [175]:
# Convert the month column to datetimes so it can be compared against Timestamps.
df['month'] = pd.to_datetime(df['month'])

# Combine county and state into a single "<county_name>, <state_code>" label.
df['county_qualified_name'] = df['county_name'] + ', ' + df['state_code']

# Order the rows by state, then by county within each state.
df = df.sort_values(by=['state_code', 'county_name'])
df


Unnamed: 0,month,county_name,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated,county_qualified_name
0,2022-01-01,Anchorage Borough,AK,All Residential,299.0,261.0,282.0,23.0,0.9,0.997268,2023-04-09 14:55:08,"Anchorage Borough, AK"
1,2022-02-01,Anchorage Borough,AK,All Residential,275.0,376.0,260.0,26.0,0.9,0.999285,2023-04-09 14:55:08,"Anchorage Borough, AK"
2,2022-03-01,Anchorage Borough,AK,All Residential,331.0,520.0,339.0,10.0,1.0,1.007284,2023-04-09 14:55:08,"Anchorage Borough, AK"
3,2022-04-01,Anchorage Borough,AK,All Residential,382.0,588.0,412.0,7.0,1.1,1.016928,2023-04-09 14:55:08,"Anchorage Borough, AK"
4,2022-05-01,Anchorage Borough,AK,All Residential,451.0,581.0,490.0,6.0,1.1,1.021409,2023-04-09 14:55:08,"Anchorage Borough, AK"
...,...,...,...,...,...,...,...,...,...,...,...,...
39432,2022-11-01,Morgan County,WV,All Residential,32.0,21.0,85.0,51.0,2.7,0.967970,2023-04-09 14:55:08,"Morgan County, WV"
39433,2022-12-01,Morgan County,WV,All Residential,13.0,18.0,82.0,38.0,6.3,0.975503,2023-04-09 14:55:08,"Morgan County, WV"
39434,2023-01-01,Morgan County,WV,All Residential,22.0,19.0,70.0,129.0,3.2,0.942904,2023-04-09 14:55:08,"Morgan County, WV"
39435,2023-02-01,Morgan County,WV,All Residential,19.0,32.0,75.0,69.0,3.9,0.945075,2023-04-09 14:55:08,"Morgan County, WV"


In [176]:
# Reference month from which every look-back cut-off is measured.
start_date = pd.Timestamp('2023-03-01')

# Map each look-back length (in months) to the date that many months before start_date.
time_periods = {
    months_back: start_date - pd.DateOffset(months=months_back)
    for months_back in (1, 3, 6, 12, 18)
}

# Named aliases for the individual cut-off dates.
one_month_ago = time_periods[1]
three_months_ago = time_periods[3]
six_months_ago = time_periods[6]
twelve_months_ago = time_periods[12]
eighteen_months_ago = time_periods[18]

time_periods

{1: Timestamp('2023-02-01 00:00:00'),
 3: Timestamp('2022-12-01 00:00:00'),
 6: Timestamp('2022-09-01 00:00:00'),
 12: Timestamp('2022-03-01 00:00:00'),
 18: Timestamp('2021-09-01 00:00:00')}

In [177]:
# Average each metric per county over every look-back window in time_periods,
# then stack the per-window results into a single DataFrame (averages_df).

# Metrics that are averaged within a window and rounded to 2 decimal places.
averaged_column_names = ['homes_sold', 'new_listings', 'inventory', 'median_dom', 'months_of_supply']

# county_qualified_name is built from county_name and state_code, so 'first'
# just carries those two columns through the aggregation.
aggregations = {
    **{column: 'mean' for column in averaged_column_names},
    'state_code': 'first',
    'county_name': 'first',
}

# Collect one frame per window and concatenate once at the end instead of
# growing a DataFrame with pd.concat on every iteration (and instead of
# seeding the result with an empty DataFrame).
window_frames = []

for period, time in time_periods.items():
    # Keep only rows dated strictly after the window's cut-off date.
    # NOTE(review): no upper bound is applied, so any rows dated after the
    # reference month would be included in every window -- confirm none exist.
    # NOTE(review): a window only averages the months actually present in df;
    # if the data history is shorter than the window, months_averaged
    # overstates how many months went into the mean -- confirm.
    df_period = df[df['month'] > time]
    averages = df_period.groupby('county_qualified_name').agg(aggregations).reset_index()

    # Round the values to 2 decimal places
    averages[averaged_column_names] = averages[averaged_column_names].round(2)

    # Tag every row with the window length (in months) it was computed for.
    averages['months_averaged'] = period
    window_frames.append(averages)

# ignore_index=True yields a fresh 0..n-1 index across all windows.
# pd.concat raises on an empty list, so fall back to an empty frame as before.
averages_df = pd.concat(window_frames, ignore_index=True) if window_frames else pd.DataFrame()

averages_df

Unnamed: 0,county_qualified_name,homes_sold,new_listings,inventory,median_dom,months_of_supply,state_code,county_name,months_averaged
0,"Abbeville County, SC",20.00,17.00,39.00,117.00,2.00,SC,Abbeville County,1
1,"Acadia Parish, LA",25.00,18.00,73.00,33.00,2.90,LA,Acadia Parish,1
2,"Accomack County, VA",36.00,56.00,124.00,60.00,3.40,VA,Accomack County,1
3,"Ada County, ID",767.00,543.00,1178.00,71.00,1.50,ID,Ada County,1
4,"Adair County, IA",5.00,4.00,7.00,20.00,1.40,IA,Adair County,1
...,...,...,...,...,...,...,...,...,...
14102,"Yuba County, CA",86.27,105.27,192.33,27.80,2.45,CA,Yuba County,18
14103,"Yuma County, AZ",191.67,214.67,550.00,55.20,2.96,AZ,Yuma County,18
14104,"Yuma County, CO",3.77,4.77,11.69,41.92,3.66,CO,Yuma County,18
14105,"Zapata County, TX",2.25,3.36,18.75,158.00,11.47,TX,Zapata County,18


In [178]:
# Take an independent (deep) copy so later changes to df_sorted leave averages_df untouched.
df_sorted = averages_df.copy(deep=True)

In [179]:
# Order rows by state, county, then window length, renumbering the index from 0.
df_sorted = df_sorted.sort_values(
    by=['state_code', 'county_name', 'months_averaged'],
    ignore_index=True,
)

# Preview the first six rows.
df_sorted.head(6)

Unnamed: 0,county_qualified_name,homes_sold,new_listings,inventory,median_dom,months_of_supply,state_code,county_name,months_averaged
0,"Anchorage Borough, AK",233.0,330.0,232.0,13.0,1.0,AK,Anchorage Borough,1
1,"Anchorage Borough, AK",180.67,249.33,225.0,25.33,1.3,AK,Anchorage Borough,3
2,"Anchorage Borough, AK",238.5,216.5,277.5,21.5,1.2,AK,Anchorage Borough,6
3,"Anchorage Borough, AK",337.83,349.75,390.42,15.67,1.18,AK,Anchorage Borough,12
4,"Anchorage Borough, AK",330.6,356.93,371.07,16.47,1.13,AK,Anchorage Borough,18
5,"Fairbanks North Star Borough, AK",15.0,128.0,150.0,85.0,10.0,AK,Fairbanks North Star Borough,1


In [180]:
# Build the output path, stamped with the year and month of the day this cell runs.
today = datetime.date.today()
date_string = format(today, "%Y-%m")
file_path = f"processed/redfin_month_rollup_{date_string}.tsv"

In [181]:
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout fails on this cell.
Path(file_path).parent.mkdir(parents=True, exist_ok=True)

# Write the rollup as a tab-separated file, leaving out the index column.
df_sorted.to_csv(file_path, index=False, sep='\t')