In [2]:
import datetime
from pathlib import Path

import pandas as pd

# Let pandas render up to 85 columns before truncating wide frames
pd.options.display.max_columns = 85

In [3]:
# Location of the source TSV, resolved relative to the current user's home
# directory so the notebook does not hard-code one person's username.
# Kept as a plain string, matching how `file_path` is used elsewhere.
file_path = str(
    Path.home()
    / 'Library/Mobile Documents/com~apple~CloudDocs/Land_Archive/Archive'
    / 'redfin_county_market_tracker.tsv'
)

In [4]:
# Read the tab-separated file into memory, then show its (rows, columns) shape
df_original = pd.read_table(file_path, sep='\t')
df_original.shape

(891447, 58)

In [5]:
# Work on a deep copy so the freshly loaded frame stays untouched
df = df_original.copy(deep=True)

In [6]:

# Make sure 'period_begin' holds datetimes (harmless if it already does)
df['period_begin'] = pd.to_datetime(df['period_begin'])

# Keep only the rows whose property_type is 'All Residential'
is_all_residential = df['property_type'].eq('All Residential')
df = df.loc[is_all_residential]

# Restrict to periods that begin on or after 1 January 2022
starts_2022_or_later = df['period_begin'].ge('2022-01-01')
df_2022_and_after = df.loc[starts_2022_or_later]
df_2022_and_after.shape

(45275, 58)

In [7]:
# Build the report frame from the 2022+ subset: keep the columns of interest,
# order rows newest month first, renumber the index, and give the region and
# period columns friendlier names.
df = (
    df_2022_and_after
    .loc[:, ['period_begin', 'region', 'state', 'state_code', 'property_type',
             'homes_sold', 'new_listings', 'inventory', 'median_dom',
             'months_of_supply', 'avg_sale_to_list', 'last_updated']]
    .sort_values(by='period_begin', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'region': 'county', 'period_begin': 'month'})
)

# 'county' holds values such as "Van Wert County, OH"; keep the text before the
# first comma (values without a comma are kept whole). The vectorised .str
# accessor passes missing values through as NaN instead of raising a TypeError.
df['county_name'] = df['county'].str.split(',', n=1).str[0]
df

Unnamed: 0,month,county,state,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated,county_name
0,2023-05-01,"Van Wert County, OH",Ohio,OH,All Residential,23.0,20.0,29.0,24.0,1.3,0.987674,2023-06-12 18:59:53,Van Wert County
1,2023-05-01,"Rockland County, NY",New York,NY,All Residential,188.0,289.0,419.0,22.0,2.2,1.010057,2023-06-12 18:59:53,Rockland County
2,2023-05-01,"Richland County, OH",Ohio,OH,All Residential,84.0,129.0,102.0,8.0,1.2,1.002957,2023-06-12 18:59:53,Richland County
3,2023-05-01,"Windsor County, VT",Vermont,VT,All Residential,59.0,77.0,195.0,50.0,3.3,0.987579,2023-06-12 18:59:53,Windsor County
4,2023-05-01,"Geneva County, AL",Alabama,AL,All Residential,11.0,19.0,37.0,67.0,3.4,0.942850,2023-06-12 18:59:53,Geneva County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45270,2022-01-01,"Falls Church, VA",Virginia,VA,All Residential,5.0,5.0,10.0,32.0,2.0,1.012100,2023-06-12 18:59:53,Falls Church
45271,2022-01-01,"Grundy County, IA",Iowa,IA,All Residential,7.0,7.0,20.0,81.0,2.9,0.943819,2023-06-12 18:59:53,Grundy County
45272,2022-01-01,"Weld County, CO",Colorado,CO,All Residential,472.0,527.0,757.0,35.0,1.6,1.009671,2023-06-12 18:59:53,Weld County
45273,2022-01-01,"Cedar County, MO",Missouri,MO,All Residential,11.0,14.0,38.0,58.0,3.5,0.926985,2023-06-12 18:59:53,Cedar County


In [8]:
# Keep the report columns in their final order. Selecting the columns explicitly
# already leaves out 'county' and 'state', so no separate drop() is needed, and
# the cell can be re-run without a KeyError for columns that are already gone.
df = df[['month', 'county_name', 'state_code', 'property_type', 'homes_sold', 'new_listings', 'inventory',
         'median_dom', 'months_of_supply', 'avg_sale_to_list', 'last_updated']]

# Order rows by state, then county, then month (oldest first within a county)
df = df.sort_values(by=['state_code', 'county_name', 'month'])
df

Unnamed: 0,month,county_name,state_code,property_type,homes_sold,new_listings,inventory,median_dom,months_of_supply,avg_sale_to_list,last_updated
42905,2022-01-01,Anchorage Borough,AK,All Residential,299.0,260.0,281.0,23.0,0.9,0.997268,2023-06-12 18:59:53
40932,2022-02-01,Anchorage Borough,AK,All Residential,275.0,377.0,261.0,26.0,0.9,0.999285,2023-06-12 18:59:53
39462,2022-03-01,Anchorage Borough,AK,All Residential,330.0,519.0,338.0,10.0,1.0,1.007306,2023-06-12 18:59:53
35044,2022-04-01,Anchorage Borough,AK,All Residential,382.0,588.0,412.0,7.0,1.1,1.016928,2023-06-12 18:59:53
32661,2022-05-01,Anchorage Borough,AK,All Residential,450.0,580.0,490.0,6.0,1.1,1.021457,2023-06-12 18:59:53
...,...,...,...,...,...,...,...,...,...,...,...
11979,2023-01-01,Morgan County,WV,All Residential,22.0,19.0,70.0,129.0,3.2,0.942904,2023-06-12 18:59:53
10583,2023-02-01,Morgan County,WV,All Residential,19.0,31.0,75.0,69.0,3.9,0.945075,2023-06-12 18:59:53
6604,2023-03-01,Morgan County,WV,All Residential,25.0,24.0,75.0,49.0,3.0,0.977504,2023-06-12 18:59:53
4064,2023-04-01,Morgan County,WV,All Residential,23.0,19.0,68.0,58.0,3.0,0.987024,2023-06-12 18:59:53


In [9]:
# Stamp the export filename with the current year and month (YYYY-MM)
today = datetime.date.today()
date_string = format(today, "%Y-%m")
file_path = f"data/redfin_county_data_{date_string}.xlsx"

In [10]:
# Create the destination folder first so the export does not fail when the
# folder is missing (for example on a fresh checkout).
Path(file_path).parent.mkdir(parents=True, exist_ok=True)

# Write the frame to a single-sheet workbook without the index column
df.to_excel(file_path, index=False, sheet_name='redfin_county_data')
