### This notebook builds a dataset for predicting rental prices by suburb

Created by Yuecheng Wang 16-09-2024

In [1]:
import pandas as pd
import re

In [9]:
# Open the past_data workbook once; the handle exposes the sheet names and is
# reused below to read a single sheet.
past_data = pd.ExcelFile("../../data/raw/domain/past_data.xlsx")
print("Available sheets:", past_data.sheet_names)

# Only the 'All properties' sheet is loaded into a DataFrame here.
all_properties_df = pd.read_excel(io=past_data, sheet_name='All properties')

Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [3]:
# Load the raw listings CSV from the domain data folder.
domain_data = pd.read_csv(filepath_or_buffer="../../data/raw/domain/all_postcodes.csv")

The official postcode documents are awkward to work with, so I use this file instead: https://github.com/schappim/australian-postcodes/blob/master/australian-postcodes-2021-04-23.csv \
It contains only the suburb name, state and postcode, and its entries agree with the official documents.

In [4]:
# Postcode lookup table hosted on GitHub; pandas fetches it directly from the URL.
url = "https://raw.githubusercontent.com/schappim/australian-postcodes/master/australian-postcodes-2021-04-23.csv"
postcode_name = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows of the lookup table.
preview = postcode_name.head()
print(preview)

        Suburb State   Zip
0  AARONS PASS   NSW  2850
1   ABBA RIVER    WA  6280
2        ABBEY    WA  6280
3     ABBEYARD   VIC  3737
4     ABBEYARD   VIC  3737


In [5]:
# Align the lookup's column name with `domain_data` so the two can be joined.
# Reassigning (rather than `inplace=True`) avoids mutating the frame that the
# previous cell displayed.
postcode_name = postcode_name.rename(columns={'Zip': 'Postcode'})

# The lookup CSV repeats some (Suburb, Postcode) pairs verbatim (e.g. ABBEYARD / 3737
# appears twice in its first rows). Without de-duplication the left merge would
# emit every matching listing once per repeated lookup row.
suburb_lookup = postcode_name[['Suburb', 'Postcode']].drop_duplicates()

# Merge the main DataFrame with the suburb lookup based on the 'Postcode' column.
# how='left' keeps listings whose postcode is absent from the lookup (Suburb = NaN).
merged_df = domain_data.merge(suburb_lookup, on='Postcode', how='left')

# NOTE(review): one postcode can still cover several suburbs (e.g. 6280 maps to both
# ABBA RIVER and ABBEY), so a listing is repeated once per suburb sharing its
# postcode. Confirm this fan-out is intended before aggregating prices by suburb.

# Display the first few rows of the merged DataFrame
print(merged_df.head())

                                         Address  \
0     901/22-40 Wills Street, Melbourne VIC 3000   
1       1207/270 King Street, Melbourne VIC 3000   
2  5809/442 ELIZABETH STREET, Melbourne VIC 3000   
3   2112/80 A'beckett Street, Melbourne VIC 3000   
4   1210/81 A'beckett Street, Melbourne VIC 3000   

                               Cost  Bedrooms  Bathrooms  \
0                     $600 per week       1.0        1.0   
1                     $720 per week       2.0        2.0   
2  $850 Per Week ( Fully Furnished)       2.0        1.0   
3                     $700 per week       2.0        2.0   
4                       $650 weekly       2.0        1.0   

                  Coordinates Closest Gov Secondary School  \
0  [-37.8107551, 144.9570001]       University High School   
1  [-37.8136918, 144.9548583]       University High School   
2  [-37.8084101, 144.9607759]       University High School   
3  [-37.8089991, 144.9610792]       University High School   
4   [-37.8092536

In [8]:
# First dollar amount in a price string: either comma-grouped thousands
# ("1,200") or plain digits ("1200"), followed by an optional decimal part.
_COST_PATTERN = re.compile(r'\$\s*(\d{1,3}(?:,\d{3})+|\d+)(\.\d+)?')


def _parse_cost(value):
    """Extract the first dollar amount from a listing price as a float.

    Parameters
    ----------
    value : str, number, or missing
        Raw 'Cost' entry such as "$850 Per Week ( Fully Furnished)".

    Returns
    -------
    float or None
        The parsed amount, or None when the value is missing or holds no
        "$<number>" token.
    """
    if pd.isnull(value):
        return None
    if not isinstance(value, str):
        # Already numeric, e.g. when this cell is re-run after 'Cost' was parsed.
        return float(value)
    match = _COST_PATTERN.search(value)
    if match is None:
        return None
    whole = match.group(1).replace(',', '')  # "1,200" -> "1200"
    fraction = match.group(2) or ''
    return float(whole + fraction)


# Parse cost
merged_df['Cost'] = merged_df['Cost'].apply(_parse_cost)
# Remove no cost
merged_df = merged_df.dropna(subset=['Cost'])

merged_df.head(5)

Unnamed: 0,Address,Cost,Bedrooms,Bathrooms,Coordinates,Closest Gov Secondary School,Gov Secondary Distance,Age under 20,Age 20-39,Age 40-59,Age 60+,Postcode,Suburb
0,"901/22-40 Wills Street, Melbourne VIC 3000",600.0,1.0,1.0,"[-37.8107551, 144.9570001]",University High School,1.5 km away,8%,77%,12%,3%,3000,MELBOURNE
1,"1207/270 King Street, Melbourne VIC 3000",720.0,2.0,2.0,"[-37.8136918, 144.9548583]",University High School,1.9 km away,6%,83%,10%,1%,3000,MELBOURNE
2,"5809/442 ELIZABETH STREET, Melbourne VIC 3000",850.0,2.0,1.0,"[-37.8084101, 144.9607759]",University High School,1.3 km away,3%,90%,7%,0%,3000,MELBOURNE
3,"2112/80 A'beckett Street, Melbourne VIC 3000",700.0,2.0,2.0,"[-37.8089991, 144.9610792]",University High School,1.4 km away,3%,90%,7%,0%,3000,MELBOURNE
4,"1210/81 A'beckett Street, Melbourne VIC 3000",650.0,2.0,1.0,"[-37.8092536, 144.961181]",University High School,1.4 km away,6%,79%,12%,3%,3000,MELBOURNE


In [15]:
# Normalise suburb names to lower case, then report how many distinct ones exist.
suburb_lower = merged_df['Suburb'].str.lower()
merged_df['Suburb'] = suburb_lower
suburb_lower.nunique()

1735

In [18]:
# Second column of the 'All properties' sheet, read in as 'Unnamed: 1'.
all_properties_suburbs = all_properties_df['Unnamed: 1']

# Entries may join several suburbs with '-': split them into one suburb per row,
# then trim whitespace and lower-case so they compare against merged_df['Suburb'].
all_properties_suburbs_split = (
    all_properties_suburbs
    .dropna()
    .str.split('-')
    .explode()
    .str.strip()
    .str.lower()
)

print(all_properties_suburbs_split.nunique())

# Rows of merged_df whose suburb does not appear in the cleaned 'All properties' list.
found_in_all_properties = merged_df['Suburb'].isin(all_properties_suburbs_split)
missing_suburbs = merged_df[~found_in_all_properties]
missing_suburbs_list = missing_suburbs['Suburb'].unique()

# Report how many distinct suburbs are missing, then list them for review.
print(f"Number of suburbs not found in 'All properties': {missing_suburbs['Suburb'].nunique()}")
print(f"Missing suburbs: {missing_suburbs_list}")

216
Number of suburbs not found in 'All properties': 1569
Missing suburbs: ['melbourne' 'st kilda road central' 'st kilda road melbourne' ...
 'wattle bank' 'inverloch' 'pound creek']
