# What are the most liveable and affordable suburbs according to your chosen metrics?

In [1]:
# import library
# read model data csv
import pandas as pd

#spark session
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/10/04 15:59:32 WARN Utils: Your hostname, QuzihandeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.96.115 instead (on interface en0)
22/10/04 15:59:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 15:59:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Preprocessing

In [2]:
# properties data
data = pd.read_csv("../data/curated/model-data/model-data.csv").drop(columns = ['Unnamed: 0'])

# aggregated external data by postcode
external = pd.read_csv("../data/curated/model-data/external-features.csv").drop(columns = ['Unnamed: 0'])

# additional data needed
public_service = pd.read_csv("../data/raw/external-data/public_service.csv").drop(columns = ['Unnamed: 0'])
emergency_service = pd.read_csv("../data/raw/external-data/emergency_service.csv").drop(columns = ['Unnamed: 0'])
hospital = pd.read_csv("../data/raw/external-data/hospital.csv").drop(columns = ['Unnamed: 0'])

In [3]:
selected_external_columns = ['postcode', 'public-service-duration', 'care-facility-duration', 'shopping-center-duration',
                             'train-station-duration', 'hospital-duration', 'CBD-duration','emergency-service-duration',
                             'property-count', 'elector-count', 'crime-rate', 'income',
                             'pri-1-10', 'pri-11-50', 'pri-51-100','pri-101-150', 'pri-150+',
                             'sec-1-10', 'sec-11-50', 'sec-51-100','sec-101-150',
                             'sec-150+']

In [4]:
# filter data
data = data.loc[(data['price'] <= 1500) & (data['bedrooms'] <= 3)& (data['parking'] <= 5)]
data = data.loc[(data['price'] <= 2000) & (data['bedrooms'] <= 5)& (data['parking'] <= 5)]

In [5]:
# add rent proportion
data['rent_proportion'] = (data['price'] / data['bedrooms'])/data['income']

In [6]:
# add rent porpotion by postcode
data_mean = data.groupby(['postcode']).mean()
external  = external.merge(data_mean.rent_proportion, left_on="postcode", right_on="postcode", how='outer')

In [7]:
# add external feautre count by postcode
public_service_count = public_service.groupby('postcode').count().geometry.rename('public_service_count')
hospital_count = hospital.groupby('postcode').count().geometry.rename('hospital_count')
emergency_service_count = emergency_service.groupby('postcode').count().geometry.rename('emergency_service_count')

In [8]:
# merge data
external  = external.merge(hospital_count, left_on="postcode", right_on="postcode", how='left')
external  = external.merge(public_service_count, left_on="postcode", right_on="postcode", how='left')
external  = external.merge(emergency_service_count, left_on="postcode", right_on="postcode", how='left')

In [9]:
# check it the property is affordable
data['greater_than_30_percent_count'] = data['rent_proportion'] >= 0.3
affordable_percentage = data.groupby("postcode")["greater_than_30_percent_count"]\
                            .mean()\
                            .rename('unaffordable_percentage')
                            
external  = external.merge(affordable_percentage, left_on="postcode", right_on="postcode", how='left')

In [10]:
# selected columns we interested in based on livability index matrics
livable_columns = ['postcode', 'hospital_count', 'public_service_count', 'emergency_service_count',
                   'CBD-duration', 'train-station-duration', 'rent_proportion', 'income']
external_selected = external[livable_columns]

In [11]:
# replace nan with 0
external_selected['hospital_count'] = external_selected['hospital_count'].fillna(0)
external_selected['public_service_count'] = external_selected['public_service_count'].fillna(0)
external_selected['emergency_service_count'] = external_selected['emergency_service_count'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  external_selected['hospital_count'] = external_selected['hospital_count'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  external_selected['public_service_count'] = external_selected['public_service_count'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  external_selected['emerge

In [12]:
external_selected.sort_values(['rent_proportion'],ascending=False).head(90)

Unnamed: 0,postcode,hospital_count,public_service_count,emergency_service_count,CBD-duration,train-station-duration,rent_proportion,income
69,3053,2.0,3.0,2.0,3.893333,16.598000,0.477835,633.0
108,3004,4.0,3.0,0.0,6.203333,16.598000,0.392007,994.0
110,3000,2.0,10.0,1.0,0.000000,1.701333,0.369239,994.0
124,3940,1.0,0.0,1.0,75.448333,16.598000,0.307927,641.0
176,3008,1.0,2.0,1.0,5.325000,3.836000,0.307030,1219.5
...,...,...,...,...,...,...,...,...
87,3039,1.0,1.0,2.0,12.945000,1.714333,0.159088,1611.0
81,3184,1.0,0.0,0.0,17.041667,16.598000,0.158534,1715.0
130,3204,2.0,1.0,0.0,26.365000,0.985667,0.157826,1538.0
144,3191,2.0,1.0,1.0,30.338333,1.746667,0.157612,1836.0


In [13]:
lst1 = list(external_selected[external_selected['hospital_count'] >= 1]['postcode'])
lst2 = list(external_selected[external_selected['public_service_count']>= 1]['postcode'])
lst7 = list(external_selected[external_selected['emergency_service_count']>= 1]['postcode'])

lst3 = list(external_selected[external_selected['train-station-duration'] <= 20]['postcode'])

lst4_2 = list(external_selected[external_selected['rent_proportion']<= 0.17]['postcode'])
lst4_1 = list(external_selected[external_selected['rent_proportion']<= 0.19]['postcode'])

lst5 = list(external_selected[external_selected['income'] >= 1300]['postcode'])
lst6 = list(external_selected[external_selected['CBD-duration'] <= 20]['postcode'])
lst7 = list(external_selected[external_selected['emergency_service_count']>= 1]['postcode'])

## Top 5 liveable and affordable suburbs

In [14]:
# top 5: 'rent_proportion']<= 0.17
list(set(lst1) & set(lst2) & set(lst3) & set(lst4_2) & set(lst5) & set(lst6) & set(lst7))

[3040, 3079, 3144, 3101, 3039]

## Top 10 liveable and affordable suburbs

In [15]:
# top 10: 'rent_proportion']<= 0.19
list(set(lst1) & set(lst2) & set(lst3)& set(lst4_1) & set(lst5) & set(lst6) & set(lst7))

[3040, 3078, 3079, 3144, 3052, 3056, 3121, 3122, 3101, 3039]