<h1>Affordability</h1>

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '../scripts/')
from helper_functions import convert_census_to_postcode

import geopandas as gpd 
import folium

Obtaining property data

In [2]:
# Load the curated table holding the median rental price of each postcode
median_rent_df = pd.read_csv(
    "../data/curated/median_rental_postcode.csv"
)
median_rent_df.head()

Unnamed: 0,Postcode,Cost
0,3000,520.0
1,3002,642.5
2,3003,520.0
3,3004,550.0
4,3006,551.5


In [3]:
# Load the preprocessed property listings and discard the columns that are not
# needed here: "Unnamed: 0", "Name", "Coordinates", "Property_Type" & "Agency"
property_df = (
    pd.read_csv("../data/curated/properties_processed.csv")
    .drop(columns=["Unnamed: 0", "Name", "Coordinates", "Property_Type", "Agency"])
)
property_df.head()

Unnamed: 0,Cost,Bed,Bath,Parking,Postcode
0,440.0,1,1,0,3000
1,620.0,1,1,0,3000
2,300.0,1,1,0,3000
3,400.0,1,1,0,3000
4,625.0,2,2,1,3000


Calculating the average number of facilities

In [4]:
# Mean number of bedrooms over the listings of each postcode
avg_bed = property_df.groupby("Postcode")["Bed"].mean()
avg_bed.head()

Postcode
3000    1.601399
3002    1.909091
3003    1.796875
3004    1.763441
3006    1.785714
Name: Bed, dtype: float64

In [5]:
# Mean number of bathrooms over the listings of each postcode
avg_bath = property_df.groupby("Postcode")["Bath"].mean()
avg_bath.head()

Postcode
3000    1.300699
3002    1.318182
3003    1.343750
3004    1.494624
3006    1.469388
Name: Bath, dtype: float64

In [6]:
# Mean number of parking spaces over the listings of each postcode
avg_parking = property_df.groupby("Postcode")["Parking"].mean()
avg_parking.head()

Postcode
3000    0.356643
3002    1.000000
3003    0.609375
3004    1.096774
3006    0.647959
Name: Parking, dtype: float64

In [7]:
# Creating dataframe of average facilities per postcode.
# The three groupby results are indexed by postcode (in sorted order), so they are
# combined on that index. Pairing their values positionally with
# property_df["Postcode"].unique() is only correct when the listings happen to be
# sorted by postcode, because unique() returns postcodes in order of first appearance.
facilities_df = (
    pd.concat(
        [
            avg_bed.rename("Average # Beds"),
            avg_bath.rename("Average # Baths"),
            avg_parking.rename("Average # Parking"),
        ],
        axis=1,
    )
    .rename_axis("Postcode")
    .reset_index()  # postcode becomes an ordinary column again
)
facilities_df.head()

Unnamed: 0,Postcode,Average # Beds,Average # Baths,Average # Parking
0,3000,1.601399,1.300699,0.356643
1,3002,1.909091,1.318182,1.0
2,3003,1.796875,1.34375,0.609375
3,3004,1.763441,1.494624,1.096774
4,3006,1.785714,1.469388,0.647959


In [8]:
# Combine the median rental price of each postcode with its average facilities.
# The facilities table is keyed by postcode so it can be joined onto the
# "Postcode" column of the rental table; the result is then keyed by postcode too.
facilities_df = facilities_df.set_index("Postcode")
price_by_facility_df = (
    median_rent_df
    .join(facilities_df, on="Postcode")
    .set_index("Postcode")
)
price_by_facility_df.head()

Unnamed: 0_level_0,Cost,Average # Beds,Average # Baths,Average # Parking
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000,520.0,1.601399,1.300699,0.356643
3002,642.5,1.909091,1.318182,1.0
3003,520.0,1.796875,1.34375,0.609375
3004,550.0,1.763441,1.494624,1.096774
3006,551.5,1.785714,1.469388,0.647959


Obtaining census/income data

In [9]:
# Load the inputs for the census data: the SA2-to-postcode mapping
# (indexed by its "sa2_2021" column) and the census table itself
sa2_postcode_map = (
    pd.read_csv("../data/curated/sa2_postcode_mapping_2021.csv")
    .set_index("sa2_2021")
)
census_df = pd.read_csv("../data/curated/census_data.csv")

In [10]:
# Re-express the census data per postcode with the project helper, passing the
# SA2/postcode mapping and the "mean_no_zero" option
# (the helper lives in ../scripts/helper_functions.py; see it for what the option does)
census_by_postcode_df = convert_census_to_postcode(
    census_df,
    sa2_postcode_map,
    "mean_no_zero",
)
census_by_postcode_df.head()

Unnamed: 0,postcode_2021,tot_population_11,tot_population_16,tot_population_21,avg_med_mortg_rep_11,avg_med_mortg_rep_16,avg_med_mortg_rep_21,avg_med_person_inc_11,avg_med_person_inc_16,avg_med_person_inc_21,avg_med_rent_16,avg_med_rent_11,avg_med_rent_21,avg_med_hh_inc_16,avg_med_hh_inc_11,avg_med_hh_inc_21,tot_avg_hh_size_16,tot_avg_hh_size_11,tot_avg_hh_size_21
0,3000,124551,167166,178424,2213.38,2040.38,2040.19,862.18,5483.82,6467.76,395.76,447.06,418.19,1482.53,1896.76,2159.41,1.88,1.97,1.86
1,3002,68729,82804,89023,2357.78,2173.67,2155.22,1091.8,8969.6,10432.9,398.0,460.33,449.67,1709.4,2415.0,2598.8,1.82,1.91,1.87
2,3003,15496,20633,23083,2200.0,2050.0,2085.0,701.5,716.0,1000.0,395.0,418.5,385.5,1466.0,1493.5,1751.0,2.15,2.15,1.95
3,3004,100879,123254,129273,2331.58,2155.67,2149.75,1066.08,7152.46,8339.46,391.15,446.83,440.75,1688.85,2270.46,2471.46,1.83,1.89,1.84
4,3006,21150,30239,36164,2477.25,2217.75,2079.0,1132.4,16783.0,19507.0,406.8,501.0,461.0,1637.2,2883.2,3088.8,1.8,1.92,1.92


In [11]:
# Keep only the postcode and the 2021 census columns
census_by_postcode_df = census_by_postcode_df[[
    "postcode_2021",
    "tot_population_21",
    "avg_med_mortg_rep_21",
    "avg_med_person_inc_21",
    "avg_med_rent_21",
    "avg_med_hh_inc_21",
    "tot_avg_hh_size_21",
]]

# Of those, affordability only needs the mortgage repayment and personal income columns
census_income_df = census_by_postcode_df[[
    "postcode_2021",
    "avg_med_mortg_rep_21",
    "avg_med_person_inc_21",
]]
census_income_df.head()

Unnamed: 0,postcode_2021,avg_med_mortg_rep_21,avg_med_person_inc_21
0,3000,2040.19,6467.76
1,3002,2155.22,10432.9
2,3003,2085.0,1000.0
3,3004,2149.75,8339.46
4,3006,2079.0,19507.0


In [12]:
# Give the postcode column the same name as in the property tables and use it as the index
census_income_df = (
    census_income_df
    .rename(columns={"postcode_2021": "Postcode"})
    .set_index("Postcode")
)
census_income_df.head()

Unnamed: 0_level_0,avg_med_mortg_rep_21,avg_med_person_inc_21
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
3000,2040.19,6467.76
3002,2155.22,10432.9
3003,2085.0,1000.0
3004,2149.75,8339.46
3006,2079.0,19507.0


In [13]:
# Attach the census income columns to the price & facilities table.
# how="right" keeps every postcode of price_by_facility_df.
df = census_income_df.join(
    price_by_facility_df,
    on="Postcode",
    how="right",
)
df.head()

Unnamed: 0_level_0,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3000,2040.19,6467.76,520.0,1.601399,1.300699,0.356643
3002,2155.22,10432.9,642.5,1.909091,1.318182,1.0
3003,2085.0,1000.0,520.0,1.796875,1.34375,0.609375
3004,2149.75,8339.46,550.0,1.763441,1.494624,1.096774
3006,2079.0,19507.0,551.5,1.785714,1.469388,0.647959


Calculating affordability metric

In [14]:
# An average of exactly 0 bedrooms or 0 parking spaces is replaced by an
# arbitrary small value epsilon
EPSILON = 10 ** -6

for facility_column in ("Average # Beds", "Average # Parking"):
    df.loc[df[facility_column] == 0, facility_column] = EPSILON

In [15]:
# Weight of each facility = its correlation with the cost across postcodes
# (from the previous correlation matrix, all 3 attributes are known to correlate positively with the cost)
bed_weight, bath_weight, parking_weight = (
    df[facility_column].corr(df["Cost"])
    for facility_column in ("Average # Beds", "Average # Baths", "Average # Parking")
)

In [16]:
# Median rental price as a percentage of the median personal income
# (ratio multiplied by 100)
df["(Cost / Income)%"] = df["Cost"].div(df["avg_med_person_inc_21"]).mul(100)
df.head()

Unnamed: 0_level_0,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking,(Cost / Income)%
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3000,2040.19,6467.76,520.0,1.601399,1.300699,0.356643,8.039878
3002,2155.22,10432.9,642.5,1.909091,1.318182,1.0,6.158403
3003,2085.0,1000.0,520.0,1.796875,1.34375,0.609375,52.0
3004,2149.75,8339.46,550.0,1.763441,1.494624,1.096774,6.595151
3006,2079.0,19507.0,551.5,1.785714,1.469388,0.647959,2.82719


In [17]:
# Median mortgage repayment as a percentage of the median personal income
# (ratio multiplied by 100)
df["(Mortage / Income)%"] = df["avg_med_mortg_rep_21"].div(df["avg_med_person_inc_21"]).mul(100)
df.head()

Unnamed: 0_level_0,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking,(Cost / Income)%,(Mortage / Income)%
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3000,2040.19,6467.76,520.0,1.601399,1.300699,0.356643,8.039878,31.543997
3002,2155.22,10432.9,642.5,1.909091,1.318182,1.0,6.158403,20.657919
3003,2085.0,1000.0,520.0,1.796875,1.34375,0.609375,52.0,208.5
3004,2149.75,8339.46,550.0,1.763441,1.494624,1.096774,6.595151,25.778048
3006,2079.0,19507.0,551.5,1.785714,1.469388,0.647959,2.82719,10.657713


In [18]:
# Calculating the non-standardised affordability metric:
# correlation-weighted average facilities plus the two cost-to-income percentages
weighted_facilities = (
    bed_weight * df["Average # Beds"]
    + bath_weight * df["Average # Baths"]
    + parking_weight * df["Average # Parking"]
)
df["Affordability"] = weighted_facilities + df["(Cost / Income)%"] + df["(Mortage / Income)%"]
df.head()

Unnamed: 0_level_0,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking,(Cost / Income)%,(Mortage / Income)%,Affordability
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3000,2040.19,6467.76,520.0,1.601399,1.300699,0.356643,8.039878,31.543997,39.82331
3002,2155.22,10432.9,642.5,1.909091,1.318182,1.0,6.158403,20.657919,27.011048
3003,2085.0,1000.0,520.0,1.796875,1.34375,0.609375,52.0,208.5,260.728967
3004,2149.75,8339.46,550.0,1.763441,1.494624,1.096774,6.595151,25.778048,32.600229
3006,2079.0,19507.0,551.5,1.785714,1.469388,0.647959,2.82719,10.657713,13.738194


In [33]:
# Calculating standardised affordability metric (min-max scaling of "Affordability")
min_affordability = df["Affordability"].min()
max_affordability = df["Affordability"].max()

df["Standardised Affordability"] = (df["Affordability"] - min_affordability) / (max_affordability - min_affordability)

# The metric is built from cost-to-income percentages, so LOWER values mean MORE
# affordable: after scaling, the smallest "Affordability" maps to 0 and the largest to 1.
# The scale is deliberately not inverted; the later cells sort ascending to pick the
# most affordable postcodes.

# Printing the top 10 most affordable postcodes
df.sort_values(by = "Affordability").head(10)

Unnamed: 0,Postcode,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking,(Cost / Income)%,(Mortage / Income)%,Affordability,Standardised Affordability
4,3006,2079.0,19507.0,551.5,1.785714,1.469388,0.647959,2.82719,10.657713,13.738194,0.0
1,3002,2155.22,10432.9,642.5,1.909091,1.318182,1.0,6.158403,20.657919,27.011048,0.03164
3,3004,2149.75,8339.46,550.0,1.763441,1.494624,1.096774,6.595151,25.778048,32.600229,0.044963
0,3000,2040.19,6467.76,520.0,1.601399,1.300699,0.356643,8.039878,31.543997,39.82331,0.062182
96,3141,2100.0,1398.33,530.0,1.82,1.265,0.865,37.902355,150.179142,188.275199,0.416061
175,3400,1203.0,766.0,320.0,2.594595,1.27027,1.081081,41.775457,157.049608,198.997589,0.441621
50,3066,2167.0,1338.0,500.0,1.74359,1.179487,0.820513,37.369208,161.958146,199.506549,0.442834
51,3067,2167.0,1354.0,545.0,1.907407,1.240741,0.814815,40.251108,160.044313,200.486695,0.44517
125,3181,2066.0,1266.33,500.0,1.982955,1.232955,0.789773,39.484179,163.148626,202.823508,0.450741
128,3184,2099.5,1257.25,470.0,1.939024,1.170732,0.97561,37.383178,166.99145,204.539171,0.454831


In [32]:
# The 10 postcodes with the highest (non-standardised) affordability value
df.sort_values("Affordability", ascending=False).head(10)

Unnamed: 0,Postcode,avg_med_mortg_rep_21,avg_med_person_inc_21,Cost,Average # Beds,Average # Baths,Average # Parking,(Cost / Income)%,(Mortage / Income)%,Affordability,Standardised Affordability
104,3150,2500.0,711.33,580.0,3.411765,1.894118,1.835294,81.537402,351.454318,433.237221,0.0
183,3564,1205.4,647.0,1495.165,2.727273,1.409091,1.681818,231.091963,186.306028,417.556447,0.03738
72,3104,2796.5,820.0,620.0,3.22449,1.77551,1.795918,75.609756,341.036585,416.870867,0.039014
75,3107,2425.0,732.67,625.0,3.470588,2.0,1.647059,85.304434,330.981206,416.566707,0.039739
74,3106,2600.0,767.0,557.5,3.409091,2.090909,1.863636,72.685789,338.983051,411.954587,0.050733
84,3126,3084.0,937.67,775.0,3.148148,1.703704,1.814815,82.651679,328.900359,411.760496,0.051196
58,3075,1746.33,527.0,385.0,2.944444,1.361111,1.694444,73.055028,331.371917,404.572255,0.068331
47,3060,1690.0,515.5,392.5,2.625,1.225,1.825,76.13967,327.837051,404.086417,0.06949
57,3074,1746.33,527.0,380.0,2.906977,1.302326,1.465116,72.106262,331.371917,403.627497,0.070584
87,3129,2796.5,820.0,500.0,2.894737,1.473684,1.447368,60.97561,341.036585,402.199662,0.073987


In [21]:
# Outputting affordability csv.
# The postcode index is turned into a column on a temporary frame instead of
# overwriting `df`, so re-running this cell does not keep adding index columns to `df`.
affordability_df = df.reset_index()[["Postcode", "Standardised Affordability"]]

# NOTE: with the default index=True the row index is also written, as an unnamed first column
affordability_df.to_csv("../data/curated/affordability.csv")

Graphing standardised affordability

In [22]:
# Creating geoJSON file of postcode and geometry coordinates
sf = gpd.read_file("../data/raw/POA_2021_AUST_GDA2020_SHP/POA_2021_AUST_GDA2020.shp")
postcodes = pd.read_csv("../data/raw/external/postcode.csv", names = ["POA_CODE21", "Name", "Area"])
postcodes["POA_CODE21"] = postcodes["POA_CODE21"].astype(int)

# Converting the geometry shape to latitude and longitude
# TAKEN FROM TUTE 2 NOTEBOOK
sf["geometry"] = sf["geometry"].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

# Keep the 3000-3999 postcode areas only. The codes are still strings here and
# between() includes both bounds, so the upper bound is "3999" ("4000" would also
# let postcode 4000 through).
# .copy() makes the filtered frame independent, so the cast below assigns to a real
# frame rather than to a slice of the original (avoids SettingWithCopyWarning).
sf = sf[sf["POA_CODE21"].between("3000", "3999")].copy()
sf["POA_CODE21"] = sf["POA_CODE21"].astype(int)

# Attach the shapes to the postcode table (inner join: only postcodes present in both)
gdf = gpd.GeoDataFrame(
    pd.merge(postcodes, sf, on = "POA_CODE21", how = "inner")
)

# One feature per postcode, carrying only the code and its geometry
geoJSON = gdf[["POA_CODE21", "geometry"]].drop_duplicates("POA_CODE21").to_json()

In [23]:
# (y, x) since we want (lat, long)
gdf["centroid"] = gdf["geometry"].apply(lambda x: (x.centroid.y, x.centroid.x))

# NOTE(review): "Stamen Terrain" is requested by name; confirm the installed folium
# version still provides this tile set.
m = folium.Map(location = [-37.8136, 144.9631], tiles = "Stamen Terrain", zoom_start = 10)

# Shade every postcode shape by its standardised affordability.
# The data is matched on affordability_df["Postcode"] against the POA_CODE21 property
# of each geoJSON feature, so the dataframe column does not need to be renamed.
c = folium.Choropleth(
            geo_data = geoJSON, # geoJSON 
            name = "choropleth", # name of plot
            data = affordability_df, # data source
            columns = ["Postcode", "Standardised Affordability"], # the columns required
            key_on = "properties.POA_CODE21", # this is from the geoJSON's properties
            fill_color = "YlOrRd", # color scheme
            nan_fill_color = "grey", # shapes without an affordability value
            legend_name = "Affordability"
        )

c.add_to(m)
m.save("../plots/affordability_heatmap")

In [24]:
# Preparing the map of the top 10 most affordable postcodes:
# the 10 rows with the lowest standardised affordability value
most_affordable_df = affordability_df.sort_values("Standardised Affordability").head(10)

# Restrict the postcode table to those 10 postcodes
postcodes["POA_CODE21"] = postcodes["POA_CODE21"].astype(int)
postcodes = postcodes.loc[
    postcodes["POA_CODE21"].isin(most_affordable_df["Postcode"].to_list())
]

In [25]:
# Shapes of the postcodes currently held in `postcodes`
# (inner join: only postcodes that also have a shape)
gdf = gpd.GeoDataFrame(
    pd.merge(postcodes, sf, on = "POA_CODE21", how = "inner")
)

# One feature per postcode, carrying only the code and its geometry
geoJSON = gdf[["POA_CODE21", "geometry"]].drop_duplicates("POA_CODE21").to_json()

m = folium.Map(location = [-37.8136, 144.9631], tiles = "Stamen Terrain", zoom_start = 10)

# No data is bound to this layer, so it only draws the outlines of the selected postcodes
m.add_child(folium.Choropleth(geo_data = geoJSON, name = "choropleth"))
m.save("../plots/most_affordable_heatmap")