In [53]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
#Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Liveability")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

In [90]:
sdf = spark.read.parquet('../data/landing/suburb_level_data.parquet', header=True, inferSchema=True)
sdf.show()

+--------+-----------------------+--------------------+----------+---------+------------------------+--------------------+-----------+---------------------+--------------------+------------------+------------------+------------------+
|postcode|total population - 2021|              suburb|  Latitude|Longitude|distance_to_melbourne_km|   school_per_capita| bed_column|healthcare_per_capita|groceries_per_capita|           all_RAI|       1-2_Bed_RAI|        3+_Bed_RAI|
+--------+-----------------------+--------------------+----------+---------+------------------------+--------------------+-----------+---------------------+--------------------+------------------+------------------+------------------+
|    3175|                  53545|     dandenong-north| -38.01917|145.21487|       31.78522659549726|6.723316836305911E-4|1-2_bedders| 3.548417219161453E-4|0.001045849285647586|208.30188679245282|  275.070707070707|189.98261219156743|
|    3127|                  18608|         mont-albert|  -37

In [98]:
from pyspark.ml.feature import Bucketizer

# Calculate quantile cut points
quantiles = sdf.approxQuantile("school_per_capita", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 0.0)

quantiles = sorted(set(quantiles))

# Add min and max to make splits
splits = [-float('inf')] + quantiles + [float('inf')]

# Create the Bucketizer
bucketizer = Bucketizer(
    splits=splits,
    inputCol="school_per_capita",
    outputCol="school_per_capita_score"
)

quantiles2 = sdf.approxQuantile("groceries_per_capita", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 0.0)
# Add min and max to make splits
splits2 = [-float('inf')] + quantiles2 + [float('inf')]

# Create the Bucketizer
bucketizer2 = Bucketizer(
    splits=splits,
    inputCol="groceries_per_capita",
    outputCol="groceries_per_capita_score"
)

quantiles3 = sdf.approxQuantile("healthcare_per_capita", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 0.0)
# Add min and max to make splits
splits3 = [-float('inf')] + quantiles3 + [float('inf')]

# Create the Bucketizer
bucketizer3 = Bucketizer(
    splits=splits,
    inputCol="healthcare_per_capita",
    outputCol="healthcare_per_capita_score"
)

# Transform the DataFrame to include all 3 caregorical metric
sdf_buckets = bucketizer.transform(sdf)
sdf_buckets2 = bucketizer2.transform(sdf_buckets)
sdf_buckets3 = bucketizer3.transform(sdf_buckets2)

sdf_buckets3.show()

+--------+-----------------------+--------------------+----------+---------+------------------------+--------------------+-----------+---------------------+--------------------+------------------+------------------+------------------+-----------------------+--------------------------+---------------------------+
|postcode|total population - 2021|              suburb|  Latitude|Longitude|distance_to_melbourne_km|   school_per_capita| bed_column|healthcare_per_capita|groceries_per_capita|           all_RAI|       1-2_Bed_RAI|        3+_Bed_RAI|school_per_capita_score|groceries_per_capita_score|healthcare_per_capita_score|
+--------+-----------------------+--------------------+----------+---------+------------------------+--------------------+-----------+---------------------+--------------------+------------------+------------------+------------------+-----------------------+--------------------------+---------------------------+
|    3175|                  53545|     dandenong-north| -3

In [99]:
from pyspark.sql import functions as F
sdf_buckets4 = sdf_buckets3 .withColumn(
    "distance_score",
    F.when((F.col("distance_to_melbourne_km") <= 5), 10)
    .when((F.col("distance_to_melbourne_km") > 5) & (F.col("distance_to_melbourne_km") <= 10),9 )
    .when((F.col("distance_to_melbourne_km") > 10) & (F.col("distance_to_melbourne_km") <= 15), 8)
    .when((F.col("distance_to_melbourne_km") > 15) & (F.col("distance_to_melbourne_km") <= 20), 7)
    .when((F.col("distance_to_melbourne_km") > 20) & (F.col("distance_to_melbourne_km") <= 25), 6)
    .when((F.col("distance_to_melbourne_km") > 25) & (F.col("distance_to_melbourne_km") <= 30), 5)
    .when((F.col("distance_to_melbourne_km") > 30) & (F.col("distance_to_melbourne_km") <= 35), 4)
    .when((F.col("distance_to_melbourne_km") > 35) & (F.col("distance_to_melbourne_km") <= 40), 3)
    .when((F.col("distance_to_melbourne_km") > 40) & (F.col("distance_to_melbourne_km") <= 45), 2)
    .when((F.col("distance_to_melbourne_km") > 50) , 1),
    )
sdf_buckets4

postcode,total population - 2021,suburb,Latitude,Longitude,distance_to_melbourne_km,school_per_capita,bed_column,healthcare_per_capita,groceries_per_capita,all_RAI,1-2_Bed_RAI,3+_Bed_RAI,school_per_capita_score,groceries_per_capita_score,healthcare_per_capita_score,distance_score
3175,53545,dandenong-north,-38.01917,145.21487,31.78522659549726,6.723316836305911E-4,1-2_bedders,3.548417219161453E-4,0.001045849285647586,208.3018867924528,275.070707070707,189.98261219156743,4.0,7.0,0.0,4.0
3127,18608,mont-albert,-37.8259,145.09897,12.012507597259846,8.598452278589854E-4,3+_bedders,4.299226139294927E-4,0.001021066208082545,178.06451612903226,224.87734487734485,162.7308327435716,6.0,7.0,1.0,8.0
3215,21994,north-geelong,-38.108315,144.33578,64.01946054205368,5.001364008365918E-4,3+_bedders,9.09338910611985E-5,7.729380740201874E-4,212.30769230769232,278.87432464176646,195.6873735701938,2.0,5.0,0.0,1.0
3043,17912,gowanbrae,-37.704422,144.87862,14.2316617369889,7.257704332291202E-4,3+_bedders,2.233139794551138...,0.001116569897275...,240.0,300.0418118466899,228.11695906432743,5.0,7.0,0.0,8.0
3550,41839,long-gully,-36.766586,144.29208,130.68153865489094,6.453309113506537E-4,3+_bedders,2.151103037835512...,0.001003848084323239,245.3333333333333,299.3843762145356,225.3535353535353,4.0,7.0,0.0,1.0
3350,66022,lake-wendouree,-37.569107,143.85632,101.1083976660203,3.786616582351337...,1-2_bedders,1.363181969646481...,5.60419254187998E-4,175.23809523809524,233.2525252525253,162.53672942045034,1.0,3.0,0.0,1.0
3220,17270,newtown,-38.15568,144.35219,65.67705759041552,0.001331789229878...,3+_bedders,7.527504342790967E-4,0.001852924145917...,230.0,294.61904761904765,226.98474102729423,8.0,8.0,5.0,1.0
3350,66022,golden-point,-37.569107,143.85632,101.1083976660203,3.786616582351337...,1-2_bedders,1.363181969646481...,5.60419254187998E-4,290.5263157894737,332.95238095238096,254.7658109332528,1.0,3.0,0.0,1.0
3630,32151,shepparton,-36.461567,145.558,159.31845513814582,0.001119716338527573,3+_bedders,5.909614008895524E-4,0.001741780971042...,324.7058823529412,401.2952380952381,292.0352460777993,7.0,8.0,3.0,1.0
3156,38484,upper-ferntree-gully,-37.936802,145.30328,32.85121754048439,5.196964972456086E-4,3+_bedders,2.078785988982434...,7.535599210061325E-4,234.89361702127655,268.6501377410469,199.70090405366,2.0,5.0,0.0,4.0


In [100]:
from pyspark.sql import functions as F
sdf_buckets5 = sdf_buckets4.withColumn(
    "all_RAI_score",
    F.when((F.col("all_RAI") <= 50), 0)
    .when((F.col("all_RAI") > 50) & (F.col("all_RAI") <= 75), 1) 
    .when((F.col("all_RAI") > 75) & (F.col("all_RAI") <= 100), 2)
    .when((F.col("all_RAI") > 100) & (F.col("all_RAI") <= 115), 3)
    .when((F.col("all_RAI") > 115) & (F.col("all_RAI") <= 130), 4)
    .when((F.col("all_RAI") > 130) & (F.col("all_RAI") <= 145), 5)
    .when((F.col("all_RAI") > 145) & (F.col("all_RAI") <= 160), 6)
    .when((F.col("all_RAI") > 160) & (F.col("all_RAI") <= 175), 7)
    .when((F.col("all_RAI") > 150) & (F.col("all_RAI") <= 175), 8)
    .when((F.col("all_RAI") > 175) & (F.col("all_RAI") <= 200), 9)
    .when((F.col("all_RAI") > 200), 10),
    )

sdf_buckets6 = sdf_buckets5.withColumn(
    "1-2_Bed_RAI_score",
    F.when((F.col("1-2_Bed_RAI") <= 50), 5)
    .when((F.col("1-2_Bed_RAI") > 50) & (F.col("1-2_Bed_RAI") <= 100), 4)
    .when((F.col("1-2_Bed_RAI") > 100) & (F.col("1-2_Bed_RAI") <= 150), 3)
    .when((F.col("1-2_Bed_RAI") > 150) & (F.col("1-2_Bed_RAI") <= 200), 2)
    .when((F.col("1-2_Bed_RAI") > 200), 1),
    )
sdf_buckets7 = sdf_buckets6.withColumn(
    "3+_Bed_RAI_score",
    F.when((F.col("3+_Bed_RAI") <= 50), 5)
    .when((F.col("3+_Bed_RAI") > 50) & (F.col("3+_Bed_RAI") <= 100), 4)
    .when((F.col("3+_Bed_RAI") > 100) & (F.col("3+_Bed_RAI") <= 150), 3)
    .when((F.col("3+_Bed_RAI") > 150) & (F.col("3+_Bed_RAI") <= 200), 2)
    .when((F.col("3+_Bed_RAI") > 200), 1),
    )
sdf_buckets7

postcode,total population - 2021,suburb,Latitude,Longitude,distance_to_melbourne_km,school_per_capita,bed_column,healthcare_per_capita,groceries_per_capita,all_RAI,1-2_Bed_RAI,3+_Bed_RAI,school_per_capita_score,groceries_per_capita_score,healthcare_per_capita_score,distance_score,all_RAI_score,1-2_Bed_RAI_score,3+_Bed_RAI_score
3175,53545,dandenong-north,-38.01917,145.21487,31.78522659549726,6.723316836305911E-4,1-2_bedders,3.548417219161453E-4,0.001045849285647586,208.3018867924528,275.070707070707,189.98261219156743,4.0,7.0,0.0,4.0,10,1,2
3127,18608,mont-albert,-37.8259,145.09897,12.012507597259846,8.598452278589854E-4,3+_bedders,4.299226139294927E-4,0.001021066208082545,178.06451612903226,224.87734487734485,162.7308327435716,6.0,7.0,1.0,8.0,9,1,2
3215,21994,north-geelong,-38.108315,144.33578,64.01946054205368,5.001364008365918E-4,3+_bedders,9.09338910611985E-5,7.729380740201874E-4,212.30769230769232,278.87432464176646,195.6873735701938,2.0,5.0,0.0,1.0,10,1,2
3043,17912,gowanbrae,-37.704422,144.87862,14.2316617369889,7.257704332291202E-4,3+_bedders,2.233139794551138...,0.001116569897275...,240.0,300.0418118466899,228.11695906432743,5.0,7.0,0.0,8.0,10,1,1
3550,41839,long-gully,-36.766586,144.29208,130.68153865489094,6.453309113506537E-4,3+_bedders,2.151103037835512...,0.001003848084323239,245.3333333333333,299.3843762145356,225.3535353535353,4.0,7.0,0.0,1.0,10,1,1
3350,66022,lake-wendouree,-37.569107,143.85632,101.1083976660203,3.786616582351337...,1-2_bedders,1.363181969646481...,5.60419254187998E-4,175.23809523809524,233.2525252525253,162.53672942045034,1.0,3.0,0.0,1.0,9,1,2
3220,17270,newtown,-38.15568,144.35219,65.67705759041552,0.001331789229878...,3+_bedders,7.527504342790967E-4,0.001852924145917...,230.0,294.61904761904765,226.98474102729423,8.0,8.0,5.0,1.0,10,1,1
3350,66022,golden-point,-37.569107,143.85632,101.1083976660203,3.786616582351337...,1-2_bedders,1.363181969646481...,5.60419254187998E-4,290.5263157894737,332.95238095238096,254.7658109332528,1.0,3.0,0.0,1.0,10,1,1
3630,32151,shepparton,-36.461567,145.558,159.31845513814582,0.001119716338527573,3+_bedders,5.909614008895524E-4,0.001741780971042...,324.7058823529412,401.2952380952381,292.0352460777993,7.0,8.0,3.0,1.0,10,1,1
3156,38484,upper-ferntree-gully,-37.936802,145.30328,32.85121754048439,5.196964972456086E-4,3+_bedders,2.078785988982434...,7.535599210061325E-4,234.89361702127655,268.6501377410469,199.70090405366,2.0,5.0,0.0,4.0,10,1,2


In [101]:
sdf1 = sdf_buckets7.filter(sdf['bed_column'] == '1-2_bedders')
sdf1 = sdf1.distinct()
sdf2 = sdf_buckets7.filter(sdf['bed_column'] == '3+_bedders')
sdf2 = sdf2.distinct()
sdf_1a = sdf1.drop(sdf1['3+_Bed_RAI'], sdf1['bed_column'], sdf1['groceries_per_capita'], sdf1['school_per_capita'], sdf1['healthcare_per_capita'],
        sdf1['total population - 2021'], sdf1['Latitude'], sdf1['Longitude'], sdf1['distance_to_melbourne_km']) 
sdf_2a = sdf2.drop(sdf2['1-2_Bed_RAI'], sdf2['bed_column'], sdf2['groceries_per_capita'], sdf2['school_per_capita'], sdf2['healthcare_per_capita'],
        sdf2['total population - 2021'], sdf2['Latitude'], sdf2['Longitude'], sdf2['distance_to_melbourne_km'])
sdf_1a

postcode,suburb,all_RAI,1-2_Bed_RAI,school_per_capita_score,groceries_per_capita_score,healthcare_per_capita_score,distance_score,all_RAI_score,1-2_Bed_RAI_score,3+_Bed_RAI_score
3020,sunshine-north,225.3061224489796,224.81333661821463,5.0,7.0,0.0,8,10,1,2
3108,doncaster,162.35294117647058,219.45609945609945,1.0,5.0,0.0,8,7,1,3
3550,white-hills,245.3333333333333,276.3982683982684,4.0,7.0,0.0,1,10,1,1
3350,alfredton,175.23809523809524,233.2525252525253,1.0,3.0,0.0,1,9,1,2
3020,sunshine,225.3061224489796,224.81333661821463,5.0,7.0,0.0,8,10,1,2
3155,boronia,184.0,213.7677337677338,5.0,8.0,1.0,5,9,1,3
3175,dandenong,210.28571428571428,265.77777777777777,4.0,7.0,0.0,4,10,1,2
3186,brighton,187.11864406779665,220.24242424242425,7.0,8.0,3.0,8,9,1,2
3043,tullamarine,240.0,300.0418118466899,5.0,7.0,0.0,8,10,1,1
3450,castlemaine,262.85714285714283,369.9913419913421,8.0,8.0,7.0,1,10,1,1


### Liveability without weightages 

In [124]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

df_all = sdf_1a.select(
    col("postcode"),
    col("suburb"),
    col("all_RAI_score"),
    col("school_per_capita_score"),
    col("groceries_per_capita_score"),
    col("healthcare_per_capita_score"),
    col("distance_score"),
    (2*col("all_RAI_score") + 2*col("school_per_capita_score")+  2*col("groceries_per_capita_score")
    +  2*col("healthcare_per_capita_score")+  2*col("distance_score")).alias("liveablity_score_all")
)
df_all = df_all.drop(df_all['all_RAI_score'],df_all['school_per_capita_score'], df_all['groceries_per_capita_score'],
        df_all['healthcare_per_capita_score'],df_all['distance_score'])

df_1_2B = sdf_1a.select(
    col("postcode"),
    col("suburb"),
    col("1-2_Bed_RAI_score"),
    col("school_per_capita_score"),
    col("groceries_per_capita_score"),
    col("healthcare_per_capita_score"),
    col("distance_score"),
    (4*col("1-2_Bed_RAI_score") +  1*col("school_per_capita_score")+  2*col("groceries_per_capita_score")
    +  1*col("healthcare_per_capita_score")+  2*col("distance_score")).alias("liveablity_score_1_2Bedder")
)

df_1_2B = df_1_2B.drop(df_1_2B['1-2_Bed_RAI_score'],df_1_2B['school_per_capita_score'], df_1_2B['groceries_per_capita_score'],
        df_1_2B['healthcare_per_capita_score'],df_1_2B['distance_score'])

df_1_2B

postcode,suburb,liveablity_score_1_2Bedder
3020,sunshine-north,39.0
3108,doncaster,31.0
3550,white-hills,24.0
3350,alfredton,13.0
3020,sunshine,39.0
3155,boronia,36.0
3175,dandenong,30.0
3186,brighton,46.0
3043,tullamarine,39.0
3450,castlemaine,37.0


In [126]:

df_3B = sdf_2a.select(
    col("postcode"),
    col("suburb"),
    col("3+_Bed_RAI_score"),
    col("school_per_capita_score"),
    col("groceries_per_capita_score"),
    col("healthcare_per_capita_score"),
    col("distance_score"),
    (4*col("3+_Bed_RAI_score") +  4*col("school_per_capita_score")+ 2*col("groceries_per_capita_score")
    +  1*col("healthcare_per_capita_score")+ 1*col("distance_score")).alias("liveablity_score_3+Bedder")
)

df_3B = df_3B.drop(df_3B['3+_Bed_RAI_score'],df_3B['school_per_capita_score'], df_3B['groceries_per_capita_score'],
        df_3B['healthcare_per_capita_score'],df_3B['distance_score'])

In [121]:
#df_all = df_all.join(df_1_2B, on='postcode', how='outer').join(df_3B,on='postcode',how='outer')
#df_all


postcode,suburb,liveablity_score_all,liveablity_score_1_2Bedder,liveablity_score_3+Bedder
3000,melbourne,54.0,41.0,48.0
3002,east-melbourne,86.0,60.0,
3006,southbank,56.0,44.0,
3008,docklands,34.0,,
3011,footscray,74.0,49.0,72.0
3011,footscray,74.0,49.0,72.0
3011,footscray,74.0,49.0,72.0
3011,footscray,74.0,49.0,72.0
3011,seddon,74.0,49.0,72.0
3011,seddon,74.0,49.0,72.0


### Most Liveable Suburbs Across All Property Types

In [127]:
from pyspark.sql.functions import col, desc
sorted_df_all = df_all.orderBy(desc("liveablity_score_all"))
sorted_df_all.show()

+--------+---------------+--------------------+
|postcode|         suburb|liveablity_score_all|
+--------+---------------+--------------------+
|    3002| east-melbourne|                86.0|
|    3047|   broadmeadows|                76.0|
|    3168|        clayton|                76.0|
|    3168|   notting-hill|                76.0|
|    3039|   moonee-ponds|                76.0|
|    3011|      footscray|                74.0|
|    3144|        kooyong|                74.0|
|    3011|         seddon|                74.0|
|    3171|     springvale|                74.0|
|    3205|south-melbourne|                72.0|
|    3300|       hamilton|                70.0|
|    3181|        windsor|                70.0|
|    3181|   prahran-east|                70.0|
|    3181|        prahran|                70.0|
|    3186|       brighton|                70.0|
|    3450|    castlemaine|                68.0|
|    3016|   williamstown|                68.0|
|    3585|      swan-hill|              

### Most Liveable Suburbs for 1 - 2 Bedders

In [129]:
sorted_df_1_2B = df_1_2B.orderBy(desc('liveablity_score_1_2Bedder'))
sorted_df_1_2B.show()

+--------+---------------+--------------------------+
|postcode|         suburb|liveablity_score_1_2Bedder|
+--------+---------------+--------------------------+
|    3002| east-melbourne|                      60.0|
|    3205|south-melbourne|                      53.0|
|    3121|       richmond|                      52.0|
|    3121|        burnley|                      52.0|
|    3121|       cremorne|                      52.0|
|    3181|        windsor|                      51.0|
|    3181|   prahran-east|                      51.0|
|    3181|        prahran|                      51.0|
|    3031|     flemington|                      50.0|
|    3031|     kensington|                      50.0|
|    3011|      footscray|                      49.0|
|    3144|        kooyong|                      49.0|
|    3039|   moonee-ponds|                      49.0|
|    3011|         seddon|                      49.0|
|    3168|   notting-hill|                      48.0|
|    3168|        clayton|  

#### Most Liveable Suburb for 3 Bedders

In [130]:
sorted_df_3B = df_3B.orderBy(desc('liveablity_score_3+Bedder'))
sorted_df_3B.show()

+--------+---------------+-------------------------+
|postcode|         suburb|liveablity_score_3+Bedder|
+--------+---------------+-------------------------+
|    3168|        clayton|                     73.0|
|    3168|   notting-hill|                     73.0|
|    3011|      footscray|                     72.0|
|    3011|         seddon|                     72.0|
|    3144|        kooyong|                     69.0|
|    3205|south-melbourne|                     68.0|
|    3039|   moonee-ponds|                     65.0|
|    3181|        windsor|                     64.0|
|    3181|        prahran|                     64.0|
|    3121|        burnley|                     64.0|
|    3121|       richmond|                     64.0|
|    3194|        mentone|                     64.0|
|    3047|   broadmeadows|                     64.0|
|    3047|         jacana|                     64.0|
|    3047|         dallas|                     64.0|
|    3036|         keilor|                    