In [1]:
import pandas as pd
import os

In [2]:
# Create folder to save dataset
base_dir = '../../data/'
raw_dir = os.path.join(base_dir, 'raw')
curated_dir = os.path.join(base_dir, 'curated')

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

if not os.path.exists(curated_dir):
    os.makedirs(curated_dir)


subfolder = 'cost_suburb'

if not os.path.exists(os.path.join(raw_dir, subfolder)):
    os.makedirs(os.path.join(raw_dir, subfolder))

In [3]:
# read data
df = pd.read_csv(f"{curated_dir}/individual_property_preprocessed.csv")

In [4]:
# past data of median price of properties by suburb
past_data = pd.ExcelFile("../../data/raw/domain/past_data.xlsx")

print("Available sheets:", past_data.sheet_names)

Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [5]:
df.shape

(80470, 29)

# aggregate property by suburb to extract median_price_by_suburb_2024

In [6]:
# only need the cost data of 2024
df_2024 = df[df['Year'] == 2024]
df_2024.shape
df_2024.iloc[2]

Address                         7/7-9 Faussett Street, Albert Park VIC 3206
Cost                                                                  490.0
Property Type                                                     Apartment
Bedrooms                                                                1.0
Bathrooms                                                               1.0
Latitude                                                          -37.84167
Longitude                                                        144.955332
Closest Gov Secondary School                            Albert Park College
Age under 20                                                           20.0
Age 20-39                                                              26.0
Age 40-59                                                              29.0
Age 60+                                                                25.0
Postcode                                                             3206.0
CBD Distance

In [7]:
# Initialize an empty list to collect suburb data
suburb_data = []

# Get the unique suburbs from the 'Suburb' column in df_2024
unique_suburbs = df_2024['Suburb'].unique()

# Iterate over each unique suburb
for suburb in unique_suburbs:
    # Filter the data for the matching suburb
    filtered_df = df_2024[df_2024['Suburb'] == suburb]
    
    # If no properties match, skip this suburb
    if filtered_df.empty:
        continue
    
    # Initialize a dictionary to store the suburb's aggregated data
    suburb_aggregates = {}
    
    # Find the median of Cost using filtered_df
    suburb_aggregates['median_cost'] = filtered_df['Cost'].median()

    # Add the property count for this suburb
    suburb_aggregates['property_count'] = filtered_df.shape[0]
    
    # Add the suburb name to the result
    suburb_aggregates['Suburb'] = suburb
    
    # Append the suburb's data to the list
    suburb_data.append(suburb_aggregates)

# Convert the list of suburb data into a new dataframe
suburb_aggregates_df = pd.DataFrame(suburb_data)

# Show the resulting dataframe
suburb_aggregates_df.head()


Unnamed: 0,median_cost,property_count,Suburb
0,565.0,75,Albert Park-Middle Park-West St Kilda
1,675.0,35,Armadale
2,615.0,14,Carlton North
3,600.0,81,Carlton-Parkville
4,650.0,98,CBD-St Kilda Rd


In [8]:
# Save the DataFrame if needed
suburb_aggregates_df.to_csv(f"{raw_dir}/{subfolder}/cost_suburb_2024.csv", index=False)

# Extract median price by suburb at past

In [9]:
output_relative_dir = '../../data/raw/domain/past_data_sheets'
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)

In [10]:
# Function to clean the data by replacing '-' with the closest available value for each suburb
def fill_missing_with_closest(df):
    for i in range(df.shape[1]):
        column = df.iloc[:, i]
        column.replace('-', pd.NA, inplace=True)
        column.fillna(method='ffill', inplace=True)  # Forward fill
        column.fillna(method='bfill', inplace=True)  # Backward fill
    return df

# Function to convert time periods to decimals like 2000.00, 2000.25, 2000.50, and 2000.75
def convert_time_to_decimal(time_series):
    time_decimal = []
    for time in time_series:
        if pd.isna(time):
            continue
        time = str(time)
        year, month = int(time[-4:]), time[:3]
        if month == 'Mar':
            time_decimal.append(year)
        elif month == 'Jun':
            time_decimal.append(year + 0.25)
        elif month == 'Sep':
            time_decimal.append(year + 0.50)
        elif month == 'Dec':
            time_decimal.append(year + 0.75)
    return time_decimal

In [11]:
# Initialize an empty DataFrame to store the combined results
combined_data = pd.DataFrame()

# Loop through each sheet in the Excel file
sheet = 'All properties'
print(f"Processing sheet: {sheet}")

# Load the data from the current sheet
all_properties_df = pd.read_excel(past_data, sheet_name=sheet)

# Extract suburb names and relevant rent data
suburb_col = all_properties_df.iloc[2:, 1]  # Suburb names
rent_data = all_properties_df.iloc[2:, 84:].iloc[:, 1::2]  # Every second column (median values)

# Fill missing values
rent_data_cleaned = fill_missing_with_closest(rent_data)

# Extract the time periods and convert them to year format with decimal quarters
time_periods = pd.Series(all_properties_df.iloc[0, 82:].iloc[::2])
time_decimal = convert_time_to_decimal(time_periods)

# Prepare data for each suburb
for i, suburb in enumerate(suburb_col):
    rents = rent_data_cleaned.iloc[i].dropna().astype(float).values
    years = time_decimal[:len(rents)]
    
    if len(rents) == len(years):  # Only include matching year-rent pairs
        temp_df = pd.DataFrame({'suburb': suburb, 'year': years, 'median_rent': rents})
        
        # Append the data to the combined dataset
        combined_data = pd.concat([combined_data, temp_df], ignore_index=True)

# Now that all data has been appended, group by 'suburb' and 'year'
# Convert the 'year' column to integers to group by the whole year
combined_data['year'] = combined_data['year'].astype(int)

# Group by 'suburb' and 'year' and calculate the mean median_rent for each group
combined_grouped_data = combined_data.groupby(['suburb', 'year'], as_index=False)['median_rent'].mean()

# Save the final grouped data to a CSV file
output_file = f"{output_relative_dir}/historical_rent_price.csv"
combined_grouped_data.to_csv(output_file, index=False)

print(f"Combined data saved to {output_file}")

# Display the first few rows of the final combined data for verification
print(combined_grouped_data.head())

Processing sheet: All properties
Combined data saved to ../../data/raw/domain/past_data_sheets/historical_rent_price.csv
                                  suburb  year  median_rent
0  Albert Park-Middle Park-West St Kilda  2010        435.0
1  Albert Park-Middle Park-West St Kilda  2011        447.5
2  Albert Park-Middle Park-West St Kilda  2012        465.0
3  Albert Park-Middle Park-West St Kilda  2013        459.0
4  Albert Park-Middle Park-West St Kilda  2014        477.5


  column.fillna(method='ffill', inplace=True)  # Forward fill
  column.fillna(method='ffill', inplace=True)  # Forward fill
  column.fillna(method='bfill', inplace=True)  # Backward fill


# Relative Property Price Index

- Compare the price of individual property in 2024 with median price by suburb in 2024

- Relative Property Price Index = (Price of individual property / Median property price by suburb) * 100%


In [12]:
# Merge the median prices with the original df_2024 based on the 'Suburb' column
df_merged = pd.merge(df_2024, suburb_aggregates_df, on='Suburb', how='left')

# Calculate the Relative Property Price Index for each property
df_merged['relative_price_index'] = df_merged['Cost'] / df_merged['median_cost']

# Show the resulting dataframe with relevant columns
df_merged.iloc[1]

Address                         3/33 Bevan Street, Albert Park VIC 3206
Cost                                                              550.0
Property Type                                                 Apartment
Bedrooms                                                            1.0
Bathrooms                                                           1.0
Latitude                                                     -37.839959
Longitude                                                    144.956373
Closest Gov Secondary School                        Albert Park College
Age under 20                                                       28.0
Age 20-39                                                          18.0
Age 40-59                                                          28.0
Age 60+                                                            26.0
Postcode                                                         3206.0
CBD Distance                                                    

In [13]:
# Optionally save the results to a CSV file
df_merged.to_csv(f"{raw_dir}/{subfolder}/relative_price_index_2024.csv", index=False)

# Price of individual property at past

Generated base on Relative Property Price Index and median price by suburb at past

In [14]:
historical_rent_price = pd.read_csv('../../data/raw/domain/past_data_sheets/historical_rent_price.csv')
historical_rent_price.head(3)

Unnamed: 0,suburb,year,median_rent
0,Albert Park-Middle Park-West St Kilda,2010,435.0
1,Albert Park-Middle Park-West St Kilda,2011,447.5
2,Albert Park-Middle Park-West St Kilda,2012,465.0


In [15]:
df_merged.iloc[1]

Address                         3/33 Bevan Street, Albert Park VIC 3206
Cost                                                              550.0
Property Type                                                 Apartment
Bedrooms                                                            1.0
Bathrooms                                                           1.0
Latitude                                                     -37.839959
Longitude                                                    144.956373
Closest Gov Secondary School                        Albert Park College
Age under 20                                                       28.0
Age 20-39                                                          18.0
Age 40-59                                                          28.0
Age 60+                                                            26.0
Postcode                                                         3206.0
CBD Distance                                                    

In [16]:
# Merge property_data with historical_rent_price on 'Suburb' and 'Year'
df_merged_past = pd.merge(df, historical_rent_price, left_on=['Suburb', 'Year'], right_on=['suburb', 'year'], how='left')
df_merged_past = df_merged_past.drop(['suburb', 'year'], axis=1)
df_merged_past.iloc[1]

Address                         8/90 Hambleton Street, Middle Park VIC 3206
Cost                                                                    NaN
Property Type                                                     Apartment
Bedrooms                                                                1.0
Bathrooms                                                               1.0
Latitude                                                         -37.847553
Longitude                                                        144.960477
Closest Gov Secondary School                            Albert Park College
Age under 20                                                           36.0
Age 20-39                                                              15.0
Age 40-59                                                              30.0
Age 60+                                                                19.0
Postcode                                                             3206.0
CBD Distance

In [17]:
# Merge property_data with historical rent price and relative_price_index based on 'Address'
df_price = pd.merge(df_merged_past, df_merged[['Address', 'relative_price_index']], on='Address', how='left')

df_price.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80756 entries, 0 to 80755
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Address                       80756 non-null  object 
 1   Cost                          6212 non-null   float64
 2   Property Type                 80756 non-null  object 
 3   Bedrooms                      80756 non-null  float64
 4   Bathrooms                     80756 non-null  float64
 5   Latitude                      80756 non-null  float64
 6   Longitude                     80756 non-null  float64
 7   Closest Gov Secondary School  80756 non-null  object 
 8   Age under 20                  80756 non-null  float64
 9   Age 20-39                     80756 non-null  float64
 10  Age 40-59                     80756 non-null  float64
 11  Age 60+                       80756 non-null  float64
 12  Postcode                      80756 non-null  float64
 13  C

In [18]:
# fill in the past rental price base on relative_price_index and the median_rent in each year

# Step 1: Fill missing Cost for years < 2024 using the formula: Cost = relative_price_index * median_rent
df_price.loc[df_price['Year'] < 2024, 'Cost'] = df_price['relative_price_index'] * df_price['median_rent']
df_price = df_price.drop(['median_rent', 'relative_price_index'], axis = 1)
df_price.iloc[1]



Address                         8/90 Hambleton Street, Middle Park VIC 3206
Cost                                                             373.716814
Property Type                                                     Apartment
Bedrooms                                                                1.0
Bathrooms                                                               1.0
Latitude                                                         -37.847553
Longitude                                                        144.960477
Closest Gov Secondary School                            Albert Park College
Age under 20                                                           36.0
Age 20-39                                                              15.0
Age 40-59                                                              30.0
Age 60+                                                                19.0
Postcode                                                             3206.0
CBD Distance

In [19]:
# sava data
df_price.to_csv(f"{curated_dir}/individual_property_final.csv", index=False)