This notebook preprocesses the current listings of domain and the historical suburban rent dataset.

By running all the cells in this notebook, any cleaned dataset will be saved in either the landing or curated folder.

Before running this notebook, make sure you have:
- 'domain_current_listings.parquet' and 'Moving annual rent by suburb - March quarter 2023.xlsx' in the raw folder,
- 'schools_by_region.csv', 'distances.csv', 'crime.csv' and 'land_cover.csv' in the landing folder.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_extract, regexp_replace
import pandas as pd

In [2]:
# Create a SparkSession
spark = (SparkSession.builder.appName("Project 2")
         .config("spark.sql.repl.eagerEval.enabled", True)
         .config("spark.sql.parquet.cacheMetadata", "true")
         .config("spark.sql.session.timeZone", "Etc/UTC")
         .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/10/05 19:29:44 WARN Utils: Your hostname, DESKTOP-JJQB7CC resolves to a loopback address: 127.0.1.1; using 172.30.3.30 instead (on interface eth0)
24/10/05 19:29:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 19:29:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the parquet file 
domain_current = spark.read.parquet('../data/raw/domain_current_listings.parquet')
domain_current.limit(5)

                                                                                

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197


In [4]:
# Count the number of properties
domain_current.count()

14199

In [5]:
# Print the data types of the columns
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [6]:
# Dectect the rows that have missing values in the columns 'name', 'cost_text', 'type', 'latitude' and 'longitude'
domain_current.where(domain_current['name'].isNull()).count(), domain_current.where(domain_current['cost_text'].isNull()).count(), domain_current.where(domain_current['type'].isNull()).count(), domain_current.where(domain_current['latitude'].isNull()).count(), domain_current.where(domain_current['longitude'].isNull()).count()

(0, 0, 0, 0, 0)

In [7]:
# Drop the rows that have missing values in the columns 'bedrooms', 'bathrooms' and 'car_parks'
domain_current = domain_current.dropna(subset=['bedrooms', 'bathrooms', 'car_parks'])
domain_current.count()

14166

In [8]:
# Creat a new column 'suburb' by extracting the suburbs from 'name'
domain_current = domain_current.withColumn('suburb', split(domain_current['name'], ',')[1])
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181


In [9]:
# Create a new column 'postcode' by extracting the postcodes from 'suburb'
domain_current = domain_current.withColumn('postcode', regexp_extract(domain_current['suburb'], r'\b(\d{4})\b', 0))
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181


In [10]:
# Create a new column 'rent_pw' by extracting the rents from 'cost_text'
domain_current = domain_current.withColumn('rent_pw', regexp_extract(regexp_replace(domain_current['cost_text'], ',', ''), r'\b(\d+)\b', 0))
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode,rent_pw
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008,600
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087,650
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141,1150
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220,475
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181,2200


In [11]:
# Convert the data type of 'rent_pw' to integer
domain_current = domain_current.withColumn('rent_pw', domain_current['rent_pw'].cast('int'))
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- rent_pw: integer (nullable = true)



In [12]:
# Dectect the rows that have missing values in the column 'rent_pw'
domain_current.where(domain_current['rent_pw'].isNull()).count()

838

In [13]:
# Drop the the rows that have missing values in the column 'rent_pw'
domain_current = domain_current.dropna(subset=['rent_pw'])
domain_current.count()

13328

In [14]:
# Convert 'domain_current' to a pandas DataFrame
domain_current = domain_current.toPandas()

In [None]:
# Display the full list
pd.set_option('display.max_rows', None)

# Detect the rows with any 'annual' or 'month' or 'season' in 'cost_text'
domain_current[domain_current['cost_text'].str.contains('annual|month|season', case=False)]

In [16]:
# Manually correct 'rent_pw'
domain_current.loc[25, 'rent_pw'] = 507
domain_current.loc[154, 'rent_pw'] = 350
domain_current.loc[473, 'rent_pw'] = 70
domain_current.loc[927, 'rent_pw'] = 70
domain_current.loc[1795, 'rent_pw'] = 3850
domain_current.loc[1827, 'rent_pw'] = 450
domain_current.loc[1912, 'rent_pw'] = 608
domain_current.loc[1929, 'rent_pw'] = 76
domain_current.loc[2308, 'rent_pw'] = 627
domain_current.loc[2693, 'rent_pw'] = 18
domain_current.loc[2865, 'rent_pw'] = 76
domain_current.loc[3514, 'rent_pw'] = 608
domain_current.loc[3756, 'rent_pw'] = 51
domain_current.loc[3813, 'rent_pw'] = 558
domain_current.loc[4004, 'rent_pw'] = 455
domain_current.loc[4081, 'rent_pw'] = 35
domain_current.loc[4372, 'rent_pw'] = 29
domain_current.loc[4446, 'rent_pw'] = 456
domain_current.loc[5021, 'rent_pw'] = 637
domain_current.loc[5167, 'rent_pw'] = 51
domain_current.loc[6169, 'rent_pw'] = 58
domain_current.loc[6223, 'rent_pw'] = 26
domain_current.loc[6769, 'rent_pw'] = 425
domain_current.loc[6996, 'rent_pw'] = 54
domain_current.loc[7515, 'rent_pw'] = 558
domain_current.loc[7564, 'rent_pw'] = 70
domain_current.loc[7658, 'rent_pw'] = 3500
domain_current.loc[7979, 'rent_pw'] = 558
domain_current.loc[8501, 'rent_pw'] = 58
domain_current.loc[9351, 'rent_pw'] = 3850
domain_current.loc[9808, 'rent_pw'] = 747
domain_current.loc[10585, 'rent_pw'] = 3500
domain_current.loc[10684, 'rent_pw'] = 627
domain_current.loc[11143, 'rent_pw'] = 456
domain_current.loc[11411, 'rent_pw'] = 54
domain_current.loc[11427, 'rent_pw'] = 61
domain_current.loc[11536, 'rent_pw'] = 56
domain_current.loc[11664, 'rent_pw'] = 425
domain_current.loc[11998, 'rent_pw'] = 26
domain_current.loc[12448, 'rent_pw'] = 70
domain_current.loc[12571, 'rent_pw'] = 506
domain_current.loc[12826, 'rent_pw'] = 3850
domain_current.loc[12996, 'rent_pw'] = 517

In [17]:
# List all the types of properties
domain_current['type'].unique()

array(['Apartment / Unit / Flat', 'Townhouse', 'House', 'Studio', 'Villa',
       'Terrace', 'Duplex', 'Carspace', 'New House & Land',
       'Semi-Detached', 'Acreage / Semi-Rural',
       'New Apartments / Off the Plan', 'New land', 'Block of Units'],
      dtype=object)

In [18]:
# Remove the rows with 'Carspace' in 'type'
domain_current = domain_current[~domain_current['type'].str.contains('Carspace', case=False)]

In [19]:
# Save the cleaned data to a parquet file
domain_current = spark.createDataFrame(domain_current)
domain_current.write.mode('overwrite').parquet('../data/landing/cleaned_domain_current_listings.parquet')

24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 61.43% for 11 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 56.31% for 12 writers
24/10/05 19:29:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,

In [20]:
# Read the last page of the xlsx file
historical_rent = pd.read_excel('../data/raw/Moving annual rent by suburb - March quarter 2023.xlsx', sheet_name='All properties')
historical_rent.head()

Unnamed: 0,Moving annual rent by suburb,Unnamed: 1,Lease commenced in year ending,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183,Unnamed: 184,Unnamed: 185,Unnamed: 186,Unnamed: 187
0,All properties,,Mar 2000,,Jun 2000,,Sep 2000,,Dec 2000,,...,Mar 2022,,Jun 2022,,Sep 2022,,Dec 2022,,Mar 2023,
1,,,Count,Median,Count,Median,Count,Median,Count,Median,...,Count,Median,Count,Median,Count,Median,Count,Median,Count,Median
2,Inner Melbourne,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,1178,275,...,867,500,855,515,881,500,832,525,786,545
3,,Armadale,733,200,737,200,738,205,739,210,...,805,430,851,450,852,450,840,460,751,490
4,,Carlton North,864,260,814,260,799,265,736,270,...,581,580,535,595,547,600,546,600,490,620


In [21]:
# Drop the columns from 'Lease commenced in year ending' to 'Unnamed: 17' (years before 2002 with some missing values) and the fisrt column (irrelevant granular information)
historical_rent = historical_rent.drop(columns=historical_rent.columns[2:18])
historical_rent = historical_rent.drop(columns=historical_rent.columns[0])
historical_rent.head()

Unnamed: 0,Unnamed: 1,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,...,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183,Unnamed: 184,Unnamed: 185,Unnamed: 186,Unnamed: 187
0,,Mar 2002,,Jun 2002,,Sep 2002,,Dec 2003,,Mar 2003,...,Mar 2022,,Jun 2022,,Sep 2022,,Dec 2022,,Mar 2023,
1,,Count,Median,Count,Median,Count,Median,Count,Median,Count,...,Count,Median,Count,Median,Count,Median,Count,Median,Count,Median
2,Albert Park-Middle Park-West St Kilda,1332,300,1350,300,1415,300,1431,300,1422,...,867,500,855,515,881,500,832,525,786,545
3,Armadale,774,233,783,230,773,230,724,230,747,...,805,430,851,450,852,450,840,460,751,490
4,Carlton North,626,290,647,290,644,290,670,290,685,...,581,580,535,595,547,600,546,600,490,620


In [22]:
# Dorp the even columns (count information)
#historical_median = historical_rent.iloc[:, ::2]
#historical_median.head()

In [23]:
# Drop the first two rows and reset the index
historical_median = historical_rent.drop([0, 1]).reset_index(drop=True)
historical_median.head()

Unnamed: 0,Unnamed: 1,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,...,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183,Unnamed: 184,Unnamed: 185,Unnamed: 186,Unnamed: 187
0,Albert Park-Middle Park-West St Kilda,1332,300,1350,300,1415,300,1431,300,1422,...,867,500,855,515,881,500,832,525,786,545
1,Armadale,774,233,783,230,773,230,724,230,747,...,805,430,851,450,852,450,840,460,751,490
2,Carlton North,626,290,647,290,644,290,670,290,685,...,581,580,535,595,547,600,546,600,490,620
3,Carlton-Parkville,1305,270,1262,270,1267,275,1227,280,1297,...,6143,310,6018,319,6871,340,6627,350,6690,400
4,CBD-St Kilda Rd,3316,320,3418,320,3393,320,3404,320,3755,...,17845,365,16792,390,18284,419,17627,450,17426,500


In [24]:
# Rename the column 'Unnamed: 19' as '2002-03' and the rest of the columns in the format of 'year-month' as each quarter
historical_median.columns = ['suburb', 'count_2002-03', '2002-03', 'count_2002-06', '2002-06', 'count_2002-09', '2002-09', 'count_2002-12', '2002-12', 
                             'count_2003-03', '2003-03', 'count_2003-06', '2003-06', 'count_2003-09', '2003-09', 'count_2003-12', '2003-12', 
                             'count_2004-03', '2004-03', 'count_2004-06', '2004-06', 'count_2004-09', '2004-09', 'count_2004-12', '2004-12', 
                             'count_2005-03', '2005-03', 'count_2005-06', '2005-06', 'count_2005-09', '2005-09', 'count_2005-12', '2005-12', 
                             'count_2006-03', '2006-03', 'count_2006-06', '2006-06', 'count_2006-09', '2006-09', 'count_2006-12', '2006-12', 
                             'count_2007-03', '2007-03', 'count_2007-06', '2007-06', 'count_2007-09', '2007-09', 'count_2007-12', '2007-12', 
                             'count_2008-03', '2008-03', 'count_2008-06', '2008-06', 'count_2008-09', '2008-09', 'count_2008-12', '2008-12', 
                             'count_2009-03', '2009-03', 'count_2009-06', '2009-06', 'count_2009-09', '2009-09', 'count_2009-12', '2009-12', 
                             'count_2010-03', '2010-03', 'count_2010-06', '2010-06', 'count_2010-09', '2010-09', 'count_2010-12', '2010-12', 
                             'count_2011-03', '2011-03', 'count_2011-06', '2011-06', 'count_2011-09', '2011-09', 'count_2011-12', '2011-12', 
                             'count_2012-03', '2012-03', 'count_2012-06', '2012-06', 'count_2012-09', '2012-09', 'count_2012-12', '2012-12', 
                             'count_2013-03', '2013-03', 'count_2013-06', '2013-06', 'count_2013-09', '2013-09', 'count_2013-12', '2013-12', 
                             'count_2014-03', '2014-03', 'count_2014-06', '2014-06', 'count_2014-09', '2014-09', 'count_2014-12', '2014-12', 
                             'count_2015-03', '2015-03', 'count_2015-06', '2015-06', 'count_2015-09', '2015-09', 'count_2015-12', '2015-12', 
                             'count_2016-03', '2016-03', 'count_2016-06', '2016-06', 'count_2016-09', '2016-09', 'count_2016-12', '2016-12', 
                             'count_2017-03', '2017-03', 'count_2017-06', '2017-06', 'count_2017-09', '2017-09', 'count_2017-12', '2017-12', 
                             'count_2018-03', '2018-03', 'count_2018-06', '2018-06', 'count_2018-09', '2018-09', 'count_2018-12', '2018-12', 
                             'count_2019-03', '2019-03', 'count_2019-06', '2019-06', 'count_2019-09', '2019-09', 'count_2019-12', '2019-12', 
                             'count_2020-03', '2020-03', 'count_2020-06', '2020-06', 'count_2020-09', '2020-09', 'count_2020-12', '2020-12', 
                             'count_2021-03', '2021-03', 'count_2021-06', '2021-06', 'count_2021-09', '2021-09', 'count_2021-12', '2021-12', 
                             'count_2022-03', '2022-03', 'count_2022-06', '2022-06', 'count_2022-09', '2022-09', 'count_2022-12', '2022-12', 
                             'count_2023-03', '2023-03']
historical_median.head()

Unnamed: 0,suburb,count_2002-03,2002-03,count_2002-06,2002-06,count_2002-09,2002-09,count_2002-12,2002-12,count_2003-03,...,count_2022-03,2022-03,count_2022-06,2022-06,count_2022-09,2022-09,count_2022-12,2022-12,count_2023-03,2023-03
0,Albert Park-Middle Park-West St Kilda,1332,300,1350,300,1415,300,1431,300,1422,...,867,500,855,515,881,500,832,525,786,545
1,Armadale,774,233,783,230,773,230,724,230,747,...,805,430,851,450,852,450,840,460,751,490
2,Carlton North,626,290,647,290,644,290,670,290,685,...,581,580,535,595,547,600,546,600,490,620
3,Carlton-Parkville,1305,270,1262,270,1267,275,1227,280,1297,...,6143,310,6018,319,6871,340,6627,350,6690,400
4,CBD-St Kilda Rd,3316,320,3418,320,3393,320,3404,320,3755,...,17845,365,16792,390,18284,419,17627,450,17426,500


In [25]:
# Drop the rows 'Group Total'
historical_median = historical_median[~historical_median['suburb'].str.contains('Group Total')]

In [26]:
# Spelling correction
historical_median['suburb'] = historical_median['suburb'].replace('Wanagaratta', 'Wangaratta')
historical_median['suburb'] = historical_median['suburb'].replace('Newcombe', 'Newcomb')

In [27]:
# Save it to a csv file
historical_median.to_csv('../data/curated/historical_median_rent_by_suburb.csv', index=False)

In [28]:
# Create a new dataframe which only contains the median rent for the first quarter of each year as the median rent for the year
yearly_median = historical_median[['suburb', '2002-03', '2003-03', '2004-03', '2005-03', '2006-03', '2007-03', '2008-03', '2009-03', '2010-03', '2011-03', '2012-03', '2013-03', '2014-03', '2015-03', '2016-03', '2017-03', '2018-03', '2019-03', '2020-03', '2021-03', '2022-03', '2023-03']]

# Rename the columns
yearly_median.columns = ['suburb', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
yearly_median.head()

Unnamed: 0,suburb,2002,2003,2004,2005,2006,2007,2008,2009,2010,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Albert Park-Middle Park-West St Kilda,300,295,300,300,313,345,380,410,425,...,460,480,520,520,550,552,600,500,500,545
1,Armadale,233,235,240,240,250,280,320,350,360,...,390,400,400,420,480,480,500,450,430,490
2,Carlton North,290,290,298,300,320,330,380,400,420,...,510,530,530,530,562,580,585,580,580,620
3,Carlton-Parkville,270,280,285,281,275,275,290,310,340,...,355,351,373,395,410,435,404,350,310,400
4,CBD-St Kilda Rd,320,310,300,300,310,340,370,400,410,...,430,440,450,470,495,500,510,380,365,500


In [29]:
# Compute the yearly growth rate of the median rent for each suburb
yearly_growth = pd.DataFrame(yearly_median['suburb'])
for i in range(1, len(yearly_median.columns)-1):
    yearly_growth[str(2002+i)] = ((yearly_median[yearly_median.columns[i+1]] - yearly_median[yearly_median.columns[i]]) / yearly_median[yearly_median.columns[i]]) * 100

# Melt the dataframe to have the columns 'suburb', 'year' and 'growth_rate'
yearly_growth = yearly_growth.melt(id_vars='suburb', var_name='year', value_name='growth_rate')

# Convert the data type of 'year' to integer and 'growth_rate' to float
yearly_growth['year'] = yearly_growth['year'].astype('int')
yearly_growth['growth_rate'] = yearly_growth['growth_rate'].astype('float')
yearly_growth.head()

Unnamed: 0,suburb,year,growth_rate
0,Albert Park-Middle Park-West St Kilda,2003,-1.666667
1,Armadale,2003,0.858369
2,Carlton North,2003,0.0
3,Carlton-Parkville,2003,3.703704
4,CBD-St Kilda Rd,2003,-3.125


In [30]:
# Save it to a csv file
yearly_growth.to_csv('../data/curated/yearly_growth_rate_of_median_rent_by_suburb.csv', index=False)

In [31]:
# Melt the dataframe to have the columns 'suburb', 'year' and 'median_rent'
yearly_median = yearly_median.melt(id_vars='suburb', var_name='year', value_name='median_rent')

# Convert the data type of 'year' to integer and 'growth_rate' to integer
yearly_median['year'] = yearly_median['year'].astype('int')
yearly_median['median_rent'] = yearly_median['median_rent'].astype('int')
yearly_median.head()

Unnamed: 0,suburb,year,median_rent
0,Albert Park-Middle Park-West St Kilda,2002,300
1,Armadale,2002,233
2,Carlton North,2002,290
3,Carlton-Parkville,2002,270
4,CBD-St Kilda Rd,2002,320


In [32]:
# Number of suburbs in 'yearly_median'
yearly_median['suburb'].nunique()

146

In [33]:
# Read 'schools_by_region.csv'
schools_by_region = pd.read_csv('../data/landing/schools_by_region.csv')
schools_by_region.head()

Unnamed: 0,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,suburbs,best_school_school_name,avg_school_number_vce_subjects,avg_school_satisfactory_complete_vce_percent,avg_school_median_study_score,avg_school_study_score_over_40_percent,avg_school_percentage_applying_to_victorian_uni,zoned_school_number_vce_subjects,zoned_school_satisfactory_complete_vce_percent,zoned_school_median_study_score,zoned_school_study_score_over_40_percent,zoned_school_percentage_applying_to_victorian_uni
0,57.0,99.0,31.0,6.3,83.0,Albert Park-Middle Park-West St Kilda,albert park college,57.0,99.0,31.0,6.3,83.0,50.954001,98.48018,30.937726,6.764547,78.993583
1,50.0,99.0,30.0,7.8,84.0,Altona,mount st joseph girls college,46.0,99.5,28.75,4.675,69.5,11.841528,45.544337,12.752414,0.500988,27.782046
2,30.0,100.0,34.0,19.8,97.0,Armadale,lauriston girls school,30.0,100.0,34.0,19.8,97.0,32.206598,98.164926,31.139504,7.744409,92.19939
3,55.0,96.0,30.0,6.4,65.0,Aspendale-Chelsea-Carrum,mordialloc college,55.0,96.0,30.0,6.4,65.0,18.976452,33.122534,10.350792,2.208169,22.426716
4,45.0,95.0,28.0,3.0,38.0,Bairnsdale,nagle college,46.5,97.0,27.0,1.9,38.0,48.0,99.0,26.0,0.8,38.0


In [34]:
# Number of suburbs in 'schools_by_region'
schools_by_region['suburbs'].nunique()

144

In [35]:
# Dectect the differed suburbs between 'yearly_median' and 'schools_by_region'
suburbs_diff = set(yearly_median['suburb']) - set(schools_by_region['suburbs'])
suburbs_diff

{'North Bendigo', 'Yarra Ranges'}

In [36]:
# Remove the differed suburbs from 'yearly_median'
yearly_median = yearly_median[~yearly_median['suburb'].isin(suburbs_diff)]
yearly_median['suburb'].nunique()

144

In [37]:
# Impute NaN values using the median for each column respectively except 'suburbs' and 'best_school_school_name'
for column in schools_by_region.columns:
    if schools_by_region[column].dtype in ['float64', 'int64']:
        schools_by_region[column] = schools_by_region[column].fillna(schools_by_region[column].median())

In [38]:
# Join 'yearly_median' and 'schools_by_region'
yearly_median = yearly_median.merge(schools_by_region, left_on='suburb', right_on='suburbs', how='left')
yearly_median = yearly_median.drop(columns=['suburbs'])
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,avg_school_satisfactory_complete_vce_percent,avg_school_median_study_score,avg_school_study_score_over_40_percent,avg_school_percentage_applying_to_victorian_uni,zoned_school_number_vce_subjects,zoned_school_satisfactory_complete_vce_percent,zoned_school_median_study_score,zoned_school_study_score_over_40_percent,zoned_school_percentage_applying_to_victorian_uni
0,Albert Park-Middle Park-West St Kilda,2002,300,57.0,99.0,31.0,6.3,83.0,albert park college,57.0,99.0,31.0,6.3,83.0,50.954001,98.48018,30.937726,6.764547,78.993583
1,Armadale,2002,233,30.0,100.0,34.0,19.8,97.0,lauriston girls school,30.0,100.0,34.0,19.8,97.0,32.206598,98.164926,31.139504,7.744409,92.19939
2,Carlton North,2002,290,43.0,99.0,31.0,0.0,78.0,,42.583333,98.0,29.0,0.0,71.0,5.112352,8.978268,2.652416,0.834947,7.270972
3,Carlton-Parkville,2002,270,61.0,97.0,31.0,13.6,91.0,university high school,61.0,97.0,31.0,13.6,91.0,39.92019,64.324368,20.390769,8.240293,58.750247
4,CBD-St Kilda Rd,2002,320,9.0,100.0,29.0,5.7,54.0,holmes grammar school,9.0,100.0,29.0,5.7,54.0,58.909005,94.839003,29.617576,11.796245,85.3394


In [39]:
# Read 'distances.csv'
distances_cbd = pd.read_csv('../data/landing/distances.csv')
distances_cbd.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,geometry,suburbs,regions,code,centroid,crow_distance_to_cbd,distance_to_cbd,duration_to_cbd,route_to_cbd
0,0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']","[206051128, 206051514]",POINT (144.96815606060912 -37.85090369821219),4.535857,6702.5,811.4,"[[144.967419, -37.851489], [144.967423, -37.85..."
1,1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']","[213021341, 213021343]",POINT (144.82508757350203 -37.84854737942177),12.833585,17936.7,1656.4,"[[144.825016, -37.848943], [144.825517, -37.84..."
2,2,2,POLYGON ((145.01167433388778 -37.8535692509816...,Armadale,['Armadale'],[206061135],POINT (145.02071077926166 -37.8567469622587),7.261986,10138.0,1002.8,"[[145.020491, -37.856724], [145.020572, -37.85..."
3,3,3,POLYGON ((145.12548797012727 -38.0713448001564...,Aspendale-Chelsea-Carrum,"['Carrum - Patterson Lakes', 'Edithvale - Aspe...","[208031189, 208031186, 208031185]",POINT (145.12447097971767 -38.05297911022852),30.465132,50846.3,2914.2,"[[145.124414, -38.052875], [145.126879, -38.05..."
4,4,4,POLYGON ((147.57603348437306 -37.8302200300307...,Bairnsdale,['Bairnsdale'],[205021081],POINT (147.61398934302346 -37.8291507833843),233.444538,279492.3,12766.0,"[[147.613996, -37.829069], [147.614083, -37.82..."


In [40]:
# Drop the columns 'Unnamed: 0' and 'Unnamed: 1'
distances_cbd = distances_cbd.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

# Detect missing values
distances_cbd.isnull().sum()

geometry                0
suburbs                 0
regions                 0
code                    0
centroid                0
crow_distance_to_cbd    0
distance_to_cbd         0
duration_to_cbd         0
route_to_cbd            0
dtype: int64

In [41]:
# Join 'yearly_median' and 'distances_cbd'
yearly_median = yearly_median.merge(distances_cbd, left_on='suburb', right_on='suburbs', how='left')
yearly_median = yearly_median.drop(columns=['suburbs'])
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,...,zoned_school_study_score_over_40_percent,zoned_school_percentage_applying_to_victorian_uni,geometry,regions,code,centroid,crow_distance_to_cbd,distance_to_cbd,duration_to_cbd,route_to_cbd
0,Albert Park-Middle Park-West St Kilda,2002,300,57.0,99.0,31.0,6.3,83.0,albert park college,57.0,...,6.764547,78.993583,POLYGON ((144.97018001032677 -37.8606998481848...,"['Albert Park', 'St Kilda - West']","[206051128, 206051514]",POINT (144.96815606060912 -37.85090369821219),4.535857,6702.5,811.4,"[[144.967419, -37.851489], [144.967423, -37.85..."
1,Armadale,2002,233,30.0,100.0,34.0,19.8,97.0,lauriston girls school,30.0,...,7.744409,92.19939,POLYGON ((145.01167433388778 -37.8535692509816...,['Armadale'],[206061135],POINT (145.02071077926166 -37.8567469622587),7.261986,10138.0,1002.8,"[[145.020491, -37.856724], [145.020572, -37.85..."
2,Carlton North,2002,290,43.0,99.0,31.0,0.0,78.0,,42.583333,...,0.834947,7.270972,POLYGON ((144.95940340045405 -37.7847097769772...,['Carlton North - Princes Hill'],[206071140],POINT (144.96813243979358 -37.78569850451154),2.76995,3171.0,396.3,"[[144.968148, -37.785386], [144.968072, -37.78..."
3,Carlton-Parkville,2002,270,61.0,97.0,31.0,13.6,91.0,university high school,61.0,...,8.240293,58.750247,POLYGON ((144.97476543627707 -37.7986360481138...,"['Carlton', 'Parkville']","[206041117, 206041124]",POINT (144.95658060011763 -37.79124761163832),2.178088,3563.3,447.4,"[[144.957623, -37.791118], [144.957388, -37.78..."
4,CBD-St Kilda Rd,2002,320,9.0,100.0,29.0,5.7,54.0,holmes grammar school,9.0,...,11.796245,85.3394,POLYGON ((144.95234670837857 -37.8151234866836...,"['Melbourne CBD - East', 'Melbourne CBD - Nort...","[206041504, 206041505, 206041503]",POINT (144.96257015435006 -37.813611663354884),0.370739,1420.9,231.6,"[[144.962468, -37.81364], [144.962348, -37.813..."


In [42]:
# Read 'crime.csv'
crime = pd.read_csv('../data/landing/crime.csv')
crime.head()

Unnamed: 0.1,Unnamed: 0,suburbs,year,year_ending,a20_assault_and_related_offences,a50_robbery,"a70_stalking,_harassment_and_threatening_behaviour",a80_dangerous_and_negligent_acts_endangering_people,other_crimes_against_the_person,b10_arson,...,e10_justice_procedures,e20_breaches_of_orders,f20_transport_regulation_offences,f90_miscellaneous_offences,c90_other_drug_offences,f30_other_government_regulatory_offences,d40_public_security_offences,f10_regulatory_driving_offences,b60_bribery,total_crimes
0,0,Albert Park-Middle Park-West St Kilda,2015,June,178.045119,15.312481,89.447606,14.557886,71.608504,5.309056,...,71.89177,210.483706,1.109359,0.321457,0.9142897,10.106054,3.36646,1.075971,0.323363,2586.271942
1,1,Albert Park-Middle Park-West St Kilda,2016,June,163.245015,18.871981,60.409599,12.82973,75.826735,8.872914,...,70.024278,282.063651,5.37795,2.360719,1.157002e-09,5.624394,1.606563,0.4664448,0.161682,2712.910871
2,2,Albert Park-Middle Park-West St Kilda,2017,June,168.857808,21.868907,51.80885,13.412846,77.006862,6.199418,...,67.259923,206.282406,4.132875,0.485045,0.1616815,3.362648,0.323363,0.0,0.008347,2789.178929
3,3,Albert Park-Middle Park-West St Kilda,2018,June,166.026923,20.09119,43.838102,13.86039,90.774467,3.861967,...,62.16807,209.469831,0.351235,2.360719,0.1616815,14.26873,4.813246,1.178672e-08,0.485045,2532.279242
4,4,Albert Park-Middle Park-West St Kilda,2019,June,183.604336,21.44708,43.146675,13.297669,80.959385,3.894443,...,79.682981,237.604554,4.574256,1.426281,0.008347033,1.532163,2.438106,1.571379,0.304763,2518.652829


In [43]:
# Drop the columns 'Unnamed: 0' and 'year_ending'
crime = crime.drop(columns=['Unnamed: 0', 'year_ending'])

# Rename 'year' to 'years'
crime = crime.rename(columns={'year': 'years'})

# Detect missing values
crime.isnull().sum()

suburbs                                                0
years                                                  0
a20_assault_and_related_offences                       0
a50_robbery                                            0
a70_stalking,_harassment_and_threatening_behaviour     0
a80_dangerous_and_negligent_acts_endangering_people    0
other_crimes_against_the_person                        0
b10_arson                                              0
b20_property_damage                                    0
b30_burglary/break_and_enter                           0
b40_theft                                              0
b50_deception                                          0
c10_drug_dealing_and_trafficking                       0
c20_cultivate_or_manufacture_drugs                     0
c30_drug_use_and_possession                            0
d10_weapons_and_explosives_offences                    0
d20_disorderly_and_offensive_conduct                   0
d30_public_nuisance_offences   

In [44]:
# Create the rows from 2002 to 2014 for each suburb
for i in range(2002, 2015):
    for suburb in crime['suburbs'].unique():
        crime = pd.concat([crime, pd.DataFrame({'suburbs': [suburb], 'years': [i]})], ignore_index=True)

# Impute data for combinations of year and suburb from 2002 to 2014 using extrapolation of the each suburb for each column respectively
for column in crime.columns[2:]:
    for suburb in crime['suburbs'].unique():
        crime.loc[crime['suburbs'] == suburb, column] = crime.loc[crime['suburbs'] == suburb, column].interpolate(method='linear', limit_direction='both')

In [45]:
# Join 'yearly_median' and 'crime' on combination of 'suburb' and 'year'
yearly_median = yearly_median.merge(crime, left_on=['suburb', 'year'], right_on=['suburbs', 'years'], how='left')
yearly_median = yearly_median.drop(columns=['suburbs', 'years'])
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,...,e10_justice_procedures,e20_breaches_of_orders,f20_transport_regulation_offences,f90_miscellaneous_offences,c90_other_drug_offences,f30_other_government_regulatory_offences,d40_public_security_offences,f10_regulatory_driving_offences,b60_bribery,total_crimes
0,Albert Park-Middle Park-West St Kilda,2002,300,57.0,99.0,31.0,6.3,83.0,albert park college,57.0,...,60.500537,288.770039,1.282854,1.936367,0.1616815,3.500012,18.91674,0.008347033,0.0,2586.379367
1,Armadale,2002,233,30.0,100.0,34.0,19.8,97.0,lauriston girls school,30.0,...,9.378338,13.968473,1.156995e-08,7.277462e-09,0.0,0.9684718,7.626006e-09,1.819366e-09,0.0,605.270037
2,Carlton North,2002,290,43.0,99.0,31.0,0.0,78.0,,42.583333,...,12.0,19.000002,2.347087e-08,3.387804e-08,7.569259e-09,3.729791e-08,0.0,0.0,0.0,780.000033
3,Carlton-Parkville,2002,270,61.0,97.0,31.0,13.6,91.0,university high school,61.0,...,23.916402,81.41809,0.8683226,1.797014,0.05942399,2.336032,1.79477e-09,0.02376959,0.0,1749.415541
4,CBD-St Kilda Rd,2002,320,9.0,100.0,29.0,5.7,54.0,holmes grammar school,9.0,...,297.999993,445.999991,15.0,10.0,5.0,21.0,2.031652e-07,2.0,0.0,10326.999768


In [46]:
# Read 'land_cover.csv'
land_cover = pd.read_csv('../data/landing/land_cover.csv')
land_cover.head()

Unnamed: 0.1,Unnamed: 0,suburbs,total_area,developed_area,built_percentage,urban_percentage,disturbed_percentage,water_percentage,developed_percentage,nature_percentage,year,quarter
0,0,Albert Park-Middle Park-West St Kilda,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645,2000,1
1,1,Albert Park-Middle Park-West St Kilda,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645,2000,2
2,2,Albert Park-Middle Park-West St Kilda,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645,2000,3
3,3,Albert Park-Middle Park-West St Kilda,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645,2000,4
4,4,Albert Park-Middle Park-West St Kilda,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645,2001,1


In [47]:
# Drop the columns 'Unnamed: 0'
land_cover = land_cover.drop(columns=['Unnamed: 0'])

# Rename 'year' to 'years'
land_cover = land_cover.rename(columns={'year': 'years'})

# Detect missing values
land_cover.isnull().sum()

suburbs                 0
total_area              0
developed_area          0
built_percentage        0
urban_percentage        0
disturbed_percentage    0
water_percentage        0
developed_percentage    0
nature_percentage       0
years                   0
quarter                 0
dtype: int64

In [48]:
# Keep the first quarter of each year
land_cover = land_cover[land_cover['quarter'] == 1]

# Drop the column 'quarter'
land_cover = land_cover.drop(columns=['quarter'])

In [49]:
# Create the rows from 2020 to 2023 for each suburb
for i in range(2020, 2024):
    for suburb in land_cover['suburbs'].unique():
        land_cover = pd.concat([land_cover, pd.DataFrame({'suburbs': [suburb], 'years': [i]})], ignore_index=True)

# Impute data for combinations of year and suburb from 2020 to 2023 using extrapolation of the each suburb for each column respectively
columns_to_interpolate = ['total_area', 'developed_area', 'built_percentage', 'urban_percentage', 'disturbed_percentage', 'water_percentage', 'developed_percentage', 'nature_percentage']
for column in columns_to_interpolate:
    for suburb in land_cover['suburbs'].unique():
        land_cover.loc[land_cover['suburbs'] == suburb, column] = land_cover.loc[land_cover['suburbs'] == suburb, column].interpolate(method='linear', limit_direction='both')

In [50]:
# Join 'yearly_median' and 'land_cover' on combination of 'suburb' and 'year'
yearly_median = yearly_median.merge(land_cover, left_on=['suburb', 'year'], right_on=['suburbs', 'years'], how='left')
yearly_median = yearly_median.drop(columns=['suburbs', 'years'])
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,...,b60_bribery,total_crimes,total_area,developed_area,built_percentage,urban_percentage,disturbed_percentage,water_percentage,developed_percentage,nature_percentage
0,Albert Park-Middle Park-West St Kilda,2002,300,57.0,99.0,31.0,6.3,83.0,albert park college,57.0,...,0.0,2586.379367,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645
1,Armadale,2002,233,30.0,100.0,34.0,19.8,97.0,lauriston girls school,30.0,...,0.0,605.270037,2181875.0,2112500.0,0.026353,0.934403,0.007448,0.0,0.968204,0.031796
2,Carlton North,2002,290,43.0,99.0,31.0,0.0,78.0,,42.583333,...,0.0,780.000033,2305625.0,1977500.0,0.007861,0.843589,0.006235,0.0,0.857685,0.142315
3,Carlton-Parkville,2002,270,61.0,97.0,31.0,13.6,91.0,university high school,61.0,...,0.0,1749.415541,5855625.0,4111875.0,0.066816,0.612766,0.022628,0.0,0.702209,0.297791
4,CBD-St Kilda Rd,2002,320,9.0,100.0,29.0,5.7,54.0,holmes grammar school,9.0,...,0.0,10326.999768,2367500.0,2187500.0,0.579461,0.287223,0.057286,0.007656,0.92397,0.068374


In [51]:
# Save it to a csv file
yearly_median.to_csv('../data/curated/yearly_median_rent_by_suburb.csv', index=False)