In [1]:
import pandas as pd
import datetime

pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 80)

In [2]:
# Read in the data
df_original = pd.read_csv('data/redfin_county_data_2023-06.tsv', sep='\t')
df_original.shape

(42386, 11)

In [3]:
df = df_original.copy()
df

Unnamed: 0,month,county_name,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated
0,2022-01-01,Anchorage Borough,AK,All Residential,299.0,261.0,282.0,23.0,0.9,0.997268,2023-05-14 15:07:54
1,2022-02-01,Anchorage Borough,AK,All Residential,275.0,376.0,260.0,26.0,0.9,0.999285,2023-05-14 15:07:54
2,2022-03-01,Anchorage Borough,AK,All Residential,331.0,520.0,339.0,10.0,1.0,1.007284,2023-05-14 15:07:54
3,2022-04-01,Anchorage Borough,AK,All Residential,382.0,588.0,412.0,7.0,1.1,1.016928,2023-05-14 15:07:54
4,2022-05-01,Anchorage Borough,AK,All Residential,451.0,581.0,490.0,6.0,1.1,1.021409,2023-05-14 15:07:54
...,...,...,...,...,...,...,...,...,...,...,...
42381,2022-12-01,Morgan County,WV,All Residential,13.0,17.0,81.0,38.0,6.2,0.975503,2023-05-14 15:07:54
42382,2023-01-01,Morgan County,WV,All Residential,22.0,19.0,69.0,129.0,3.1,0.942904,2023-05-14 15:07:54
42383,2023-02-01,Morgan County,WV,All Residential,19.0,32.0,74.0,69.0,3.9,0.945075,2023-05-14 15:07:54
42384,2023-03-01,Morgan County,WV,All Residential,26.0,24.0,75.0,48.0,2.9,0.983744,2023-05-14 15:07:54


In [4]:
df["county_qualified_name"] = df["county_name"] + ', ' + df["state_code"]

df['month'] = pd.to_datetime(df["month"])

df.sort_values(by=['state_code', 'county_name'], inplace=True)
df


Unnamed: 0,month,county_name,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated,county_qualified_name
0,2022-01-01,Anchorage Borough,AK,All Residential,299.0,261.0,282.0,23.0,0.9,0.997268,2023-05-14 15:07:54,"Anchorage Borough, AK"
1,2022-02-01,Anchorage Borough,AK,All Residential,275.0,376.0,260.0,26.0,0.9,0.999285,2023-05-14 15:07:54,"Anchorage Borough, AK"
2,2022-03-01,Anchorage Borough,AK,All Residential,331.0,520.0,339.0,10.0,1.0,1.007284,2023-05-14 15:07:54,"Anchorage Borough, AK"
3,2022-04-01,Anchorage Borough,AK,All Residential,382.0,588.0,412.0,7.0,1.1,1.016928,2023-05-14 15:07:54,"Anchorage Borough, AK"
4,2022-05-01,Anchorage Borough,AK,All Residential,451.0,581.0,490.0,6.0,1.1,1.021409,2023-05-14 15:07:54,"Anchorage Borough, AK"
...,...,...,...,...,...,...,...,...,...,...,...,...
42381,2022-12-01,Morgan County,WV,All Residential,13.0,17.0,81.0,38.0,6.2,0.975503,2023-05-14 15:07:54,"Morgan County, WV"
42382,2023-01-01,Morgan County,WV,All Residential,22.0,19.0,69.0,129.0,3.1,0.942904,2023-05-14 15:07:54,"Morgan County, WV"
42383,2023-02-01,Morgan County,WV,All Residential,19.0,32.0,74.0,69.0,3.9,0.945075,2023-05-14 15:07:54,"Morgan County, WV"
42384,2023-03-01,Morgan County,WV,All Residential,26.0,24.0,75.0,48.0,2.9,0.983744,2023-05-14 15:07:54,"Morgan County, WV"


In [5]:
start_date = pd.to_datetime('2023-03-01')
one_month_ago = start_date - pd.DateOffset(months=1)
three_months_ago = start_date - pd.DateOffset(months=3)
six_months_ago = start_date - pd.DateOffset(months=6)
twelve_months_ago = start_date - pd.DateOffset(years=1)
eighteen_months_ago = start_date - pd.DateOffset(years=1, months=6)

time_periods = {
                1: one_month_ago,
                3: three_months_ago,
                6: six_months_ago,
                12: twelve_months_ago,
                18: eighteen_months_ago
              }

time_periods

{1: Timestamp('2023-02-01 00:00:00'),
 3: Timestamp('2022-12-01 00:00:00'),
 6: Timestamp('2022-09-01 00:00:00'),
 12: Timestamp('2022-03-01 00:00:00'),
 18: Timestamp('2021-09-01 00:00:00')}

In [6]:
# Create an empty DataFrame to store the averages

averages_df = pd.DataFrame()

for period, time in time_periods.items():
    df_period = df[df['month'] > time]
    averages = df_period.groupby('county_qualified_name').agg({'homes_sold': 'mean', 
                                                    'new_listings': 'mean', 
                                                    'inventory': 'mean', 
                                                    'median_dom': 'mean',
                                                    'months_of_supply': 'mean',
                                                    'state_code': 'first',
                                                    'county_name': 'first'}).reset_index()
    # Round the values to 2 decimal places
    averaged_column_names = ['homes_sold', 'new_listings', 'inventory', 'median_dom', 'months_of_supply']
    averages[averaged_column_names] = averages[averaged_column_names].round(2)

    averages['months_averaged'] = period
    averages_df = pd.concat([averages_df, averages])

# Reset the index of the new DataFrame
averages_df.reset_index(drop=True, inplace=True)

averages_df

Unnamed: 0,county_qualified_name,homes_sold,new_listings,inventory,median_dom,months_of_supply,state_code,county_name,months_averaged
0,"Abbeville County, SC",17.00,15.50,36.50,89.00,2.15,SC,Abbeville County,1
1,"Acadia Parish, LA",22.00,19.00,63.00,69.00,3.00,LA,Acadia Parish,1
2,"Accomack County, VA",30.50,51.50,121.50,44.50,4.10,VA,Accomack County,1
3,"Ada County, ID",730.50,645.00,1253.50,52.50,1.75,ID,Ada County,1
4,"Adair County, IA",4.50,5.00,7.50,37.00,1.70,IA,Adair County,1
...,...,...,...,...,...,...,...,...,...
14292,"Yukon-Koyukuk Census Area, AK",1.40,2.00,2.20,193.80,1.80,AK,Yukon-Koyukuk Census Area,18
14293,"Yuma County, AZ",190.69,212.44,542.06,55.19,2.94,AZ,Yuma County,18
14294,"Yuma County, CO",3.93,4.79,11.57,46.79,3.52,CO,Yuma County,18
14295,"Zapata County, TX",2.25,3.36,18.83,158.00,11.51,TX,Zapata County,18


In [7]:
df_sorted = averages_df.copy()

In [8]:
df_sorted.sort_values(by=['state_code', 'county_name', 'months_averaged'], inplace=True)
df_sorted.reset_index(drop=True, inplace=True)

df_sorted.head(6)

Unnamed: 0,county_qualified_name,homes_sold,new_listings,inventory,median_dom,months_of_supply,state_code,county_name,months_averaged
0,"Anchorage Borough, AK",225.0,328.0,243.5,10.5,1.1,AK,Anchorage Borough,1
1,"Anchorage Borough, AK",190.0,269.25,234.0,21.0,1.27,AK,Anchorage Borough,3
2,"Anchorage Borough, AK",235.57,232.57,274.43,19.57,1.2,AK,Anchorage Borough,6
3,"Anchorage Borough, AK",328.62,348.23,380.08,15.08,1.18,AK,Anchorage Borough,12
4,"Anchorage Borough, AK",323.56,355.25,363.88,15.94,1.13,AK,Anchorage Borough,18
5,"Bethel Census Area, AK",1.0,1.0,7.0,59.0,7.0,AK,Bethel Census Area,1


In [9]:
today = datetime.date.today()
date_string = today.strftime("%Y-%m")
file_path = f"processed/redfin_month_rollup_{date_string}.tsv"

In [10]:
df_sorted.to_csv(file_path, index=False, sep ='\t')