## Pre-processing on a Suburb Level
#### Liveability Metrics : Healthcare / Education / Groceries

The necessary libraries were imported and a Spark session was initiated.

In [3]:
# import necessary libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session that runs every job in this notebook
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.driver.memory': '4g',
    'spark.executor.memory': '2g',
}
session_builder = SparkSession.builder.appName("Liveability")
for setting_name, setting_value in spark_settings.items():
    # Each call registers one configuration entry on the builder
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()


your 131072x1 screen size is bogus. expect trouble
24/10/07 20:00:25 WARN Utils: Your hostname, DESKTOP-Q5SP5SI resolves to a loopback address: 127.0.1.1; using 172.20.36.110 instead (on interface eth0)
24/10/07 20:00:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/07 20:00:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/07 20:00:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


The dataset that contained the number of schools, groceries and healthcare was imported.


In [4]:
# Load the per-postcode counts of schools, groceries and healthcare services.
# Parquet files carry their own schema, so the CSV-style reader options
# (header / inferSchema) do not apply and are not passed.
# NOTE(review): this is the only path under 'landing0'; the other reads use 'landing' - confirm the directory.
df = spark.read.parquet('../data/landing0/combined_data_with_geolocation.parquet')
# Print a sample output
df.show()

                                                                                

+--------+------------+---------------+--------+----------+---------+----------------+
|Postcode|School_Count|groceries_Count|hc_Count|  Latitude|Longitude|   Postcode_Name|
+--------+------------+---------------+--------+----------+---------+----------------+
|    3000|          24|             42|       8|-37.815205|144.96394|       Melbourne|
|    3002|          14|             17|      10|-37.816143|144.98045|  East Melbourne|
|    3003|           7|             14|       2| -37.81145| 144.9254|  West Melbourne|
|    3004|           7|             12|       2| -37.83016|144.98045|       Melbourne|
|    3006|          11|             23|       6|-37.824547|144.96394|     South Wharf|
|    3008|           3|              3|       1|-37.817066|144.94191|       Docklands|
|    3010|           1|              1|    NULL|-37.798447| 144.9621|       Parkville|
|    3011|          31|             49|      14| -37.79602|144.90063|       Footscray|
|    3012|           9|             14|    

#### Liveability Metrics : Affordability
The RAI dataframe was also imported to be combined with the previous dataframe

In [5]:
# Load the RAI table from the landing directory
rai_parquet_path = '../data/landing/RAI.parquet'
RAI_df = spark.read.parquet(rai_parquet_path)

In [6]:
# Show a sample of rows from the RAI dataframe.
# show() renders the table explicitly rather than relying on print() picking up
# the eager-evaluation repr that is enabled on the session.
RAI_df.show()

+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------+-----------------+
|             suburbs|           f1B_RAI|           f2B_RAI|           f3B_RAI|           h2B_RAI|           h3B_RAI|           h4B_RAI|           all_RAI|postcode|__index_level_0__|
+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------+-----------------+
|Albert Park-Middl...| 259.7647058823529|             184.0|128.37209302325581|149.59349593495935| 96.84210526315789| 74.84745762711864|169.84615384615384|    3054|                2|
|            Armadale|256.74418604651163|193.68421052631578|122.66666666666666|142.45161290322582|105.14285714285714| 83.32075471698113|197.14285714285717|    3052|                3|
|       Carlton North|             276.0|200.72727272727272|157.71428571428572|161.16

The column names were renamed to follow snake_case for consistency

In [7]:
# Rename the count columns and the postcode-name column in a single chain
df = (
    df
    .withColumnRenamed("School_Count", "number_of_schools")
    .withColumnRenamed("groceries_Count", "number_of_groceries")
    .withColumnRenamed("hc_Count", "number_of_healthcare_services")
    .withColumnRenamed("Postcode_Name", "suburbs1")
)
df

Postcode,number_of_schools,number_of_groceries,number_of_healthcare_services,Latitude,Longitude,suburbs1
3000,24,42,8.0,-37.815205,144.96394,Melbourne
3002,14,17,10.0,-37.816143,144.98045,East Melbourne
3003,7,14,2.0,-37.81145,144.9254,West Melbourne
3004,7,12,2.0,-37.83016,144.98045,Melbourne
3006,11,23,6.0,-37.824547,144.96394,South Wharf
3008,3,3,1.0,-37.817066,144.94191,Docklands
3010,1,1,,-37.798447,144.9621,Parkville
3011,31,49,14.0,-37.79602,144.90063,Footscray
3012,9,14,5.0,-37.814625,144.84563,Tottenham
3013,14,21,3.0,-37.819813,144.88138,Yarraville


The two dataframes were joined on their postcodes and duplicate columns were dropped

In [8]:
# Inner-join the counts with the RAI table on postcode, then discard the
# name and coordinate columns that are not needed downstream
redundant_columns = ['suburbs1', 'suburbs', 'latitude', 'longitude']
output = df.join(RAI_df, ['Postcode'], "inner")
output1 = output.drop(*(output[column_name] for column_name in redundant_columns))
output1.show()

+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|Postcode|number_of_schools|number_of_groceries|number_of_healthcare_services|           f1B_RAI|           f2B_RAI|           f3B_RAI|           h2B_RAI|           h3B_RAI|           h4B_RAI|           all_RAI|__index_level_0__|
+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|    3054|                9|                 13|                            1| 259.7647058823529|             184.0|128.37209302325581|149.59349593495935| 96.84210526315789| 74.84745762711864|169.84615384615384|                2|
|    3052|               15|                 16|                            8|25

The average rent affordability was calculated according to the property type as follows:
- 1 - 2 Bedrooms : (*1 bedroom flats* + *2 bedroom flats* + *2 bedroom houses*) / 3 
- 3+ Bedrooms: (*3 bedroom flats* + *3 bedroom houses* + *4 bedroom houses*) / 3

In [9]:
from pyspark.sql.functions import expr

# New column name -> SQL expression averaging the three RAI columns in that group
grouped_rai_formulas = {
    # 1 - 2 bedroom properties
    "1-2_Bed_RAI": "(f1B_RAI + f2B_RAI + h2B_RAI) / 3",
    # 3+ bedroom properties
    "3+_Bed_RAI": "(f3B_RAI + h3B_RAI + h4B_RAI) / 3",
}
for grouped_column, formula in grouped_rai_formulas.items():
    output1 = output1.withColumn(grouped_column, expr(formula))
output1.show()

+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+
|Postcode|number_of_schools|number_of_groceries|number_of_healthcare_services|           f1B_RAI|           f2B_RAI|           f3B_RAI|           h2B_RAI|           h3B_RAI|           h4B_RAI|           all_RAI|__index_level_0__|       1-2_Bed_RAI|        3+_Bed_RAI|
+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+
|    3054|                9|                 13|                            1| 259.7647058823529|             184.0|128.37209302325581|149.59349593495935| 96.84210526315789| 74.84745762711864|169.

In [10]:
# The individual RAI columns (and the leftover index column) are dropped now
# that the grouped averages exist
individual_rai_columns = [
    'f1B_RAI', 'f2B_RAI', 'f3B_RAI',
    'h2B_RAI', 'h3B_RAI', 'h4B_RAI',
    '__index_level_0__',
]
output2 = output1.drop(*(output1[column_name] for column_name in individual_rai_columns))
output2.show()

+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+
|Postcode|number_of_schools|number_of_groceries|number_of_healthcare_services|           all_RAI|       1-2_Bed_RAI|        3+_Bed_RAI|
+--------+-----------------+-------------------+-----------------------------+------------------+------------------+------------------+
|    3054|                9|                 13|                            1|169.84615384615384|197.78606727243744|100.02055197117745|
|    3052|               15|                 16|                            8|197.14285714285717| 197.6266698253511|103.71009284216831|
|    3000|               24|                 42|                            8|162.35294117647058|212.63171864631718|124.12698412698414|
|    3066|               15|                 29|                            4|193.68421052631578|195.89316239316238|121.71930060779486|
|    3008|                3|                  3|

#### Liveability Metrics : Groceries, Healthcare
The liveability dataset was imported into a Spark dataframe.
However, as this dataset was combined on a property level, further pre-processing was required. 

In [11]:
# Load the property-level liveability data and preview it
liveability_parquet_path = '../data/landing/liveability_data.parquet'
df = spark.read.parquet(liveability_parquet_path)
df.show()

24/10/07 20:01:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------+------+---------+---------+--------------------+----+-----+-------+--------------------+---------+----------------------------+------------------------------+-----------------------+-------------------+-----------+---------------------------+----------+-----------------------------------+-------------------------------------+------------+---------------+--------+----------+---------+-------------+------------------------+--------------------+
|postcode|  cost|   suburb|furnished|       property_type|beds|baths|parking|              region|lgaregion|total male population - 2021|total female population - 2021|total population - 2021|australian citizens|median rent|median family weekly income|median age|total region male population - 2022|total region female population - 2022|School_Count|groceries_Count|hc_Count|  Latitude|Longitude|Postcode_Name|distance_to_melbourne_km|   school_per_capita|
+--------+------+---------+---------+--------------------+----+-----+-------+---------

Unnecessary columns of the liveability dataframe were dropped

In [22]:
# Columns of the liveability dataframe that are not used in the suburb-level output
unused_liveability_columns = [
    'cost', 'furnished', 'baths', 'parking', 'region', 'lgaregion',
    'total male population - 2021', 'total female population - 2021',
    'australian citizens', 'median rent', 'median family weekly income', 'median age',
    'total region male population - 2022', 'total region female population - 2022',
    'Postcode_Name',
]
df1 = df.drop(*(df[column_name] for column_name in unused_liveability_columns))
df1.show()

+--------+---------+--------------------+----+-----------------------+------------+---------------+--------+----------+---------+------------------------+--------------------+
|postcode|   suburb|       property_type|beds|total population - 2021|School_Count|groceries_Count|hc_Count|  Latitude|Longitude|distance_to_melbourne_km|   school_per_capita|
+--------+---------+--------------------+----+-----------------------+------------+---------------+--------+----------+---------+------------------------+--------------------+
|    3000|melbourne|Apartment / Unit ...| 3.0|                  43084|          24|             42|       8|-37.815205|144.96394|                     0.0|5.570513415653143E-4|
|    3000|melbourne|Apartment / Unit ...| 1.0|                  43084|          24|             42|       8|-37.815205|144.96394|                     0.0|5.570513415653143E-4|
|    3000|melbourne|Apartment / Unit ...| 2.0|                  43084|          24|             42|       8|-37.815205|1

All duplicate rows were removed

In [2]:
# Keep one copy of each fully identical row
df2 = df1.distinct()
df2.show()

The property type was converted to lower case for consistency and more accurate string matching

In [None]:
from pyspark.sql.functions import lower

# Lower-case property_type. The column is taken from df2 itself (the frame
# being transformed) rather than from the earlier, un-deduplicated df.
df2 = df2.withColumn("property_type", lower(df2["property_type"]))
df2

The number of beds for each entry was calculated and categorised into: 
- 1-2 bedrooms
- 3+ bedrooms

In [27]:
from pyspark.sql import functions as F

# Label each row of df2 with a bedroom bucket.
# Rows whose bed count matches neither rule get NULL (there is no otherwise clause).
beds_value = F.col("beds")
bedroom_bucket = (
    F.when(beds_value.isin(1, 2), "1-2_bedders")
    .when(beds_value >= 3, "3+_bedders")
)
df3 = df2.withColumn("bed_column", bedroom_bucket)
df3


postcode,suburb,property_type,beds,total population - 2021,School_Count,groceries_Count,hc_Count,Latitude,Longitude,distance_to_melbourne_km,school_per_capita,bed_column
3023,caroline-springs,house,4.0,67146,27,43,10,-37.768707,144.75218,19.194206680547108,4.021088374586721...,3+_bedders
3040,aberfeldie,apartment / unit ...,1.0,26722,21,26,8,-37.749302,144.90063,9.014063151891625,7.858693211585959E-4,1-2_bedders
3121,richmond,new house & land,3.0,31534,28,44,15,-37.82035,144.99974,3.30492198518096,8.879304877275322E-4,3+_bedders
3178,rowville,house,3.0,33571,22,37,10,-37.921043,145.2425,27.280554155878445,6.553275148193381E-4,3+_bedders
3195,parkdale,townhouse,4.0,37364,19,32,7,-38.00055,145.10448,24.207028673081517,5.085108660742962E-4,3+_bedders
3220,geelong,house,3.0,17270,23,32,13,-38.15568,144.35219,65.67705759041552,0.001331789229878...,3+_bedders
3936,safety-beach,townhouse,3.0,13366,5,14,2,-38.335217,144.99423,58.06509272423003,3.740834954361814E-4,3+_bedders
3012,west-footscray,house,1.0,27023,9,14,5,-37.814625,144.84563,10.319993141000872,3.330496243940347E-4,1-2_bedders
3175,dandenong,apartment / unit ...,2.0,53545,36,56,19,-38.01917,145.21487,31.78522659549726,6.723316836305911E-4,1-2_bedders
3191,sandringham,house,2.0,10926,20,24,9,-37.953396,145.01352,16.16211129448736,0.001830496064433...,1-2_bedders


In [28]:
# Drop the columns that are no longer needed once bed_column exists
bucketed_source_columns = ['property_type', 'beds']
df4 = df3.drop(*(df3[column_name] for column_name in bucketed_source_columns))

### Liveability Metrics: Groceries, Healthcare per Capita
The number of grocery stores and healthcare services per capita were calculated and added

In [29]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Columns the per-capita ratios depend on
ratio_input_columns = ['postcode', 'groceries_Count', 'hc_Count', 'total population - 2021']

# Keep only rows where every ratio input is present. An inner join keyed on
# these columns would drop rows with a NULL key, so the same rows are
# excluded here explicitly.
rows_with_inputs = df4
for input_column in ratio_input_columns:
    rows_with_inputs = rows_with_inputs.filter(col(input_column).isNotNull())

# Add both ratios directly as new columns. Computing them on the frame itself
# avoids joining df4 back onto projections of itself, which multiplies the
# row count whenever a key combination appears on more than one row.
population = col("`total population - 2021`")
df5 = (
    rows_with_inputs
    # Calculate the Healthcare per Capita
    .withColumn("healthcare_per_capita", col("hc_Count") / population)
    # Calculate the Groceries per Capita
    .withColumn("groceries_per_capita", col("groceries_Count") / population)
)

Duplicate entries and unnecessary columns are removed

In [None]:
# Collapse fully identical rows of the per-capita dataframe
df_no_duplicates = df5.dropDuplicates()
df_no_duplicates

In [31]:
# Drop the raw count columns from the dataframe holding the grouped RAI values
raw_count_columns = ['number_of_schools', 'number_of_groceries', 'number_of_healthcare_services']
output3 = output2.drop(*(output2[column_name] for column_name in raw_count_columns))

All the liveability metrics data was combined into one single dataframe and duplicate columns were dropped. 

In [None]:
# Attach the grouped RAI columns to the per-capita liveability rows, matched on postcode
leftover_count_columns = ['hc_Count', 'School_Count', 'groceries_Count']
sdf = df_no_duplicates.join(output3, on=['postcode'], how='inner')
sdf1 = sdf.drop(*(sdf[column_name] for column_name in leftover_count_columns))
# Keep a single row per postcode.
# NOTE(review): nothing here fixes which row survives for a postcode, so the
# remaining non-key values come from an unspecified row - confirm this is acceptable.
sdf1 = sdf1.dropDuplicates(['postcode'])
sdf1

The resulting dataframe was output and saved

In [33]:
# Persist the final suburb-level dataframe, replacing any previous output at this path
suburb_level_output_path = '../data/landing/suburb_level_data.parquet'
sdf1.write.mode("overwrite").parquet(suburb_level_output_path)

                                                                                