### This file is dedicated to create a dataset useful for rental price prediction based on suburb

Created by Yuecheng Wang 16-09-2024

In [1]:
import pandas as pd
import re

In [9]:
past_data = pd.ExcelFile("../../data/raw/domain/past_data.xlsx")

print("Available sheets:", past_data.sheet_names)

all_properties_df = pd.read_excel(past_data, sheet_name='All properties')

Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [25]:
domain_data = pd.read_csv("../../data/raw/domain/all_postcodes.csv")

Deal with domain data

In [21]:
domain_data.info

<bound method DataFrame.info of                                              Address  \
0         901/22-40 Wills Street, Melbourne VIC 3000   
1           1207/270 King Street, Melbourne VIC 3000   
2      5809/442 ELIZABETH STREET, Melbourne VIC 3000   
3       2112/80 A'beckett Street, Melbourne VIC 3000   
4       1210/81 A'beckett Street, Melbourne VIC 3000   
...                                              ...   
7997             79 Watt Street, Wonthaggi VIC 3995   
7998    120-127 Mc Kenzie Street, Wonthaggi VIC 3995   
7999          35 Anderson Avenue, Inverloch VIC 3996   
8000            3 Kennards Court, Inverloch VIC 3996   
8001  Cabin 2/18 Williams Street, Inverloch VIC 3996   

                                  Cost  Bedrooms  Bathrooms  \
0                        $600 per week       1.0        1.0   
1                        $720 per week       2.0        2.0   
2     $850 Per Week ( Fully Furnished)       2.0        1.0   
3                        $700 per week     

In [26]:
# Parse cost
domain_data['Cost'] = domain_data['Cost'].apply(lambda x: float(re.search(r'\$(\d+(\.\d+)?)', x).group(1)) if pd.notnull(x) and re.search(r'\$(\d+(\.\d+)?)', x) else None)
# Remove no cost 
domain_data = domain_data.dropna(subset=['Cost'])

domain_data.info

<bound method DataFrame.info of                                              Address    Cost  Bedrooms  \
0         901/22-40 Wills Street, Melbourne VIC 3000   600.0       1.0   
1           1207/270 King Street, Melbourne VIC 3000   720.0       2.0   
2      5809/442 ELIZABETH STREET, Melbourne VIC 3000   850.0       2.0   
3       2112/80 A'beckett Street, Melbourne VIC 3000   700.0       2.0   
4       1210/81 A'beckett Street, Melbourne VIC 3000   650.0       2.0   
...                                              ...     ...       ...   
7997             79 Watt Street, Wonthaggi VIC 3995  1950.0       1.0   
7998    120-127 Mc Kenzie Street, Wonthaggi VIC 3995    77.0       1.0   
7999          35 Anderson Avenue, Inverloch VIC 3996   450.0       3.0   
8000            3 Kennards Court, Inverloch VIC 3996   435.0       3.0   
8001  Cabin 2/18 Williams Street, Inverloch VIC 3996   350.0       1.0   

      Bathrooms                 Coordinates Closest Gov Secondary School  \
0  

In [30]:
Q1, Q3 = domain_data['Cost'].quantile([0.25, 0.75])
IQR = Q3 - Q1
domain_data = domain_data[(domain_data['Cost'] >= Q1 - 1.5 * IQR) & (domain_data['Cost'] <= Q3 + 1.5 * IQR)]

In [31]:
max_cost = domain_data['Cost'].max()
min_cost = domain_data['Cost'].min()
mean_cost = domain_data['Cost'].mean()

print(f"Max Cost: ${max_cost}")
print(f"Min Cost: ${min_cost}")
print(f"Mean Cost: ${mean_cost}")

Max Cost: $950.0
Min Cost: $150.0
Mean Cost: $566.0803051709522


1735

In [18]:
all_properties_suburbs = all_properties_df['Unnamed: 1']
# Split suburbs by the hyphen '-' and create a list of individual suburbs
all_properties_suburbs_split = all_properties_suburbs.dropna().str.split('-').explode().str.strip().str.lower()

print(all_properties_suburbs_split.nunique())
# Assuming merged_df['suburb'] contains the suburbs to compare
# Compare merged_df['suburb'] with the split and cleaned all_properties_suburbs
missing_suburbs = merged_df[~merged_df['Suburb'].isin(all_properties_suburbs_split)]

# Display how many suburbs from merged_df are not in "All properties"
print(f"Number of suburbs not found in 'All properties': {missing_suburbs['Suburb'].nunique()}")

# Display the missing suburbs for review
missing_suburbs_list = missing_suburbs['Suburb'].unique()
print(f"Missing suburbs: {missing_suburbs_list}")

216
Number of suburbs not found in 'All properties': 1569
Missing suburbs: ['melbourne' 'st kilda road central' 'st kilda road melbourne' ...
 'wattle bank' 'inverloch' 'pound creek']
