In [58]:
import pandas as pd
import os

In [59]:
# Directory layout used by this notebook. All paths are relative to the
# notebook's working directory.
base_dir = '../../data/'
raw_dir = os.path.join(base_dir, 'raw')
curated_dir = os.path.join(base_dir, 'curated')

# Sub-folder of raw_dir that receives the per-suburb cost files written below.
subfolder = 'cost_suburb'

# exist_ok=True makes every call idempotent (safe under Restart & Run All) and
# removes the check-then-create race of `if not os.path.exists(...)`.
# os.makedirs also creates missing parent directories, so raw_dir is created
# implicitly by the last entry.
for directory in (base_dir, curated_dir, os.path.join(raw_dir, subfolder)):
    os.makedirs(directory, exist_ok=True)

In [60]:
# Load the preprocessed individual-property table from the curated folder
preprocessed_csv = f"{curated_dir}/individual_property_preprocessed.csv"
df = pd.read_csv(preprocessed_csv)

In [61]:
# Past data of median property prices by suburb (an Excel workbook).
# The path is built from raw_dir instead of repeating the literal
# "../../data/raw/..." so it stays in sync with the directory configuration.
past_data_path = os.path.join(raw_dir, 'domain', 'past_data.xlsx')

# NOTE(review): pd.ExcelFile keeps the workbook handle open; call
# past_data.close() once the needed sheets have been parsed.
past_data = pd.ExcelFile(past_data_path)

# List the worksheets so the one(s) to parse can be chosen
print("Available sheets:", past_data.sheet_names)

Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [62]:
# Dimensions of the loaded table as (rows, columns)
df.shape

(80496, 29)

# Aggregate properties by suburb to extract median_price_by_suburb_2024

In [63]:
# Only the 2024 cost data is needed: keep the rows whose Year equals 2024
df_2024 = df[df['Year'] == 2024]

# A bare expression that is not the last line of a cell is silently discarded,
# so the shape is printed explicitly.
print(df_2024.shape)

# Spot-check one record (third row by position) to eyeball the columns
df_2024.iloc[2]

Address                         7/7-9 Faussett Street, Albert Park VIC 3206
Cost                                                                  490.0
Property Type                                                     Apartment
Bedrooms                                                                1.0
Bathrooms                                                               1.0
Latitude                                                          -37.84167
Longitude                                                        144.955332
Closest Gov Secondary School                            Albert Park College
Age under 20                                                           20.0
Age 20-39                                                              26.0
Age 40-59                                                              29.0
Age 60+                                                                25.0
Postcode                                                             3206.0
CBD Distance

In [64]:
# Per-suburb summary of the 2024 listings: median Cost and number of listings.
# A single groupby replaces a Python loop that re-filtered the whole frame
# once for every suburb.

# Unique suburb labels in order of first appearance.
# NOTE(review): no longer needed for the aggregation itself; kept in case a
# later cell reads it.
unique_suburbs = df_2024['Suburb'].unique()

suburb_aggregates_df = (
    df_2024
    # sort=False keeps suburbs in order of first appearance. Rows with a
    # missing Suburb are left out (groupby's default dropna=True), matching
    # the previous loop, where a NaN label never matched any row.
    .groupby('Suburb', sort=False)['Cost']
    # 'size' counts every row of the group, including rows whose Cost is
    # missing, i.e. the same number as filtered_df.shape[0] used to give.
    .agg(median_cost='median', property_count='size')
    .reset_index()
    # Preserve the column order the result has always been saved with
    .loc[:, ['median_cost', 'property_count', 'Suburb']]
)

# Show the resulting dataframe
suburb_aggregates_df.head()


Unnamed: 0,median_cost,property_count,Suburb
0,565.0,75,Albert Park-Middle Park-West St Kilda
1,675.0,35,Armadale
2,615.0,14,Carlton North
3,600.0,81,Carlton-Parkville
4,650.0,98,CBD-St Kilda Rd


In [65]:
# Write the per-suburb aggregates to <raw_dir>/<subfolder>; index=False keeps
# the row index out of the file.
cost_suburb_csv = f"{raw_dir}/{subfolder}/cost_suburb_2024.csv"
suburb_aggregates_df.to_csv(cost_suburb_csv, index=False)

# Extract the past median price of each suburb

# Relative Property Price Index

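- Compare the price of each individual property in 2024 with the median price of its suburb in 2024.

- Relative Property Price Index = (Price of individual property / Median property price of its suburb) × 100%. The code below stores this as a plain ratio, so a value of 1.0 corresponds to 100% (a property priced exactly at its suburb's median).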


In [66]:
# Attach each suburb's aggregates to every 2024 listing (left join on 'Suburb',
# so every listing row is kept), then express the listing's Cost as a fraction
# of its suburb's median cost.
df_merged = df_2024.merge(suburb_aggregates_df, how='left', on='Suburb').assign(
    relative_price_index=lambda listings: listings['Cost'] / listings['median_cost']
)

# Preview only the columns involved in the calculation
preview_columns = ['Suburb', 'Postcode', 'Cost', 'median_cost', 'relative_price_index']
df_merged[preview_columns].head()

Unnamed: 0,Suburb,Postcode,Cost,median_cost,relative_price_index
0,Albert Park-Middle Park-West St Kilda,3206.0,410.0,565.0,0.725664
1,Albert Park-Middle Park-West St Kilda,3206.0,550.0,565.0,0.973451
2,Albert Park-Middle Park-West St Kilda,3206.0,490.0,565.0,0.867257
3,Albert Park-Middle Park-West St Kilda,3206.0,350.0,565.0,0.619469
4,Albert Park-Middle Park-West St Kilda,3206.0,450.0,565.0,0.79646


In [None]:
# Optional: persist the merged listings, including relative_price_index, as a
# CSV under <raw_dir>/<subfolder> without the row index.
relative_index_csv = f"{raw_dir}/{subfolder}/relative_price_index_2024.csv"
df_merged.to_csv(relative_index_csv, index=False)
