### This file is dedicated to create a dataset useful for rental price prediction based on suburb

Created by Yuecheng Wang 16-09-2024

In [34]:
import json
import pandas as pd

In [35]:
past_data = pd.ExcelFile("../../data/raw/domain/past_data.xlsx")

print("Available sheets:", past_data.sheet_names)

all_properties_df = pd.read_excel(past_data, sheet_name='All properties')

Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [49]:
# Load the postcode dictionary from the JSON file
with open('../../data/raw/suburb_to_postcodes.json', 'r') as f:
    postcode_to_suburb = json.load(f)

In [38]:
domain_data = pd.read_csv("../../data/curated/individual_property_final.csv")

Deal with domain data

In [39]:
domain_data.info

<bound method DataFrame.info of                                              Address  Bedrooms  Bathrooms  \
0         901/22-40 Wills Street, Melbourne VIC 3000       1.0        1.0   
1           1207/270 King Street, Melbourne VIC 3000       2.0        2.0   
2      5809/442 ELIZABETH STREET, Melbourne VIC 3000       2.0        1.0   
3       2112/80 A'beckett Street, Melbourne VIC 3000       2.0        2.0   
4       1210/81 A'beckett Street, Melbourne VIC 3000       2.0        1.0   
...                                              ...       ...        ...   
7459             79 Watt Street, Wonthaggi VIC 3995       1.0        1.0   
7460    120-127 Mc Kenzie Street, Wonthaggi VIC 3995       1.0        1.0   
7461          35 Anderson Avenue, Inverloch VIC 3996       3.0        1.0   
7462            3 Kennards Court, Inverloch VIC 3996       3.0        1.0   
7463  Cabin 2/18 Williams Street, Inverloch VIC 3996       1.0        1.0   

     Closest Gov Secondary School  Age unde

In [57]:
# Define the columns that should be aggregated (only numeric columns)
numeric_columns = [
    'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 
    'income_2020', 'CBD Distance', 'Train Distance', 'Electricity Distance', 
    'Hospital Distance', 'Library Distance', 'Park Distance', 'Tourist Attraction Distance',
    'Grocery Distance', 'Incidents Recorded', 'Cost', 'Gov Secondary Distance'
]

# Initialize an empty list to collect suburb data
suburb_data = []

# Iterate over each suburb and its list of postcodes
for suburb, postcodes in postcode_to_suburb.items():
    # Filter the domain_data for the matching postcodes
    filtered_df = domain_data[domain_data['Postcode'].isin(postcodes)]
    
    # If no properties match, skip this suburb
    if filtered_df.empty:
        continue
    
    # Initialize a dictionary to store the suburb's aggregated data
    suburb_aggregates = {}
    
    # Loop through each numeric column and calculate the mean of positive values
    for col in numeric_columns:
        # Filter for positive values in the current column
        positive_values = filtered_df[col][filtered_df[col] > 0]
        
        # If there are positive values, calculate the mean, otherwise set to -1
        if not positive_values.empty:
            suburb_aggregates[f'avg_{col}'] = positive_values.mean()
        else:
            suburb_aggregates[f'avg_{col}'] = -1
    
    # Add the property count for this suburb
    suburb_aggregates['property_count'] = filtered_df.shape[0]
    
    # Add the suburb name to the result
    suburb_aggregates['Suburb'] = suburb
    
    # Append the suburb's data to the list
    suburb_data.append(suburb_aggregates)

# Convert the list of suburb data into a new dataframe
suburb_aggregates_df = pd.DataFrame(suburb_data)

# Show the resulting dataframe
suburb_aggregates_df.head()



Unnamed: 0,avg_Age under 20,avg_Age 20-39,avg_Age 40-59,avg_Age 60+,avg_income_2020,avg_CBD Distance,avg_Train Distance,avg_Electricity Distance,avg_Hospital Distance,avg_Library Distance,avg_Park Distance,avg_Tourist Attraction Distance,avg_Grocery Distance,avg_Incidents Recorded,avg_Cost,avg_Gov Secondary Distance,property_count,Suburb
0,18.806452,33.596774,30.225806,17.274194,102921.096774,6.132656,2.636765,5.496079,1.925724,1.060153,0.498894,0.464303,1.130171,9809.0,574.919355,1.581667,62,Albert Park-Middle Park-West St Kilda
1,20.342857,33.714286,26.714286,19.085714,114856.285714,9.177803,0.728883,7.041351,1.72816,1.802843,0.496551,2.3773,1.255434,8499.0,651.428571,3.064706,35,Armadale
2,15.882353,44.764706,23.705882,15.529412,97108.0,4.024541,1.956347,2.390635,2.399924,0.984165,0.732729,2.374994,1.637694,9533.0,673.235294,0.705882,17,Carlton North
3,8.098361,69.677419,13.854839,8.387097,65844.16129,2.628034,1.402115,1.579789,0.800744,1.270156,0.839498,1.537342,0.938795,25019.0,580.080645,0.908197,62,Carlton-Parkville
4,8.337662,60.402597,19.363636,12.133333,82789.597403,2.519861,1.318673,3.076608,1.07277,1.515464,0.780371,1.111722,0.956004,19488.090909,649.415584,1.746154,77,CBD-St Kilda Rd


In [58]:
suburb_aggregates_df.to_csv("../../data/curated/aggregate_property_suburb.csv")