In [23]:
import pandas as pd
from pyrosm import OSM, get_data
import ssl
from scipy.stats import percentileofscore
import numpy as np
import geopandas as gpd
import folium

ssl._create_default_https_context = ssl._create_unverified_context
property_df = pd.read_csv(
    "/Users/matthewrush/Documents/generic-real-estate-consulting-project-group-32/data/curated/pre_processed_data.csv",
    usecols=[
        "url",
        "postcode",
        "school_duration",
        "school_distance",
        "park_duration",
        "park_distance",
        "shop_duration",
        "shop_distance",
    ],
)
postcode_df = pd.read_csv(
    "../data/raw/postcode.csv",
    usecols=["postcode", "locality"],
)

api_df = pd.read_csv(
    "../data/curated/api_data.csv",
    usecols=[
        "url",
        "train_duration",
        "train_distance",
        "stop_duration",
        "stop_distance",
    ],
)
historical_df = pd.read_csv(
    "../data/curated/historical_sales.csv",
    usecols=["postcode", "n_sold", "median_rent"],
)


In [24]:
# Group into suburbs
property_df = pd.merge(property_df, api_df)
property_df = property_df.groupby("postcode").mean().reset_index()


In [25]:
property_df.head()


Unnamed: 0,postcode,school_duration,school_distance,park_duration,park_distance,shop_duration,shop_distance,train_duration,train_distance,stop_duration,stop_distance
0,3000,1.699359,732.714568,1.307514,541.248468,1.536441,626.146973,1.717971,737.872172,0.762552,307.934291
1,3002,2.056704,883.086667,0.766037,288.96,3.425991,1400.605405,2.03763,867.002222,1.319222,533.097778
2,3003,1.603516,683.605479,0.893009,330.647222,2.842005,1215.64375,1.88473,754.954054,0.678131,260.956757
3,3004,1.439902,598.624706,1.265038,508.938636,3.569911,1799.292,3.365966,1646.661364,1.357235,584.276136
4,3006,2.381386,947.127723,0.921103,323.292647,2.123244,810.188024,3.064376,1419.9867,1.216735,444.431282


In [26]:
def median_rent(df, postcode):
    """Calculate median rent of each suburb based on domain rent data"""
    df = df[df["postcode"] == postcode]
    n_sold = df.n_sold.values.tolist()
    median = df.median_rent.values.tolist()

    weighted_ave = np.nansum([x * y for x, y in zip(n_sold, median)]) / np.nansum(
        n_sold
    )
    return weighted_ave


In [27]:
property_df["median_rent"] = property_df.apply(
    lambda x: median_rent(historical_df, x["postcode"]), axis=1
)
property_df = property_df.where(pd.notnull(property_df["median_rent"]), 0)

property_df = property_df[property_df["median_rent"] != 0]


  weighted_ave = np.nansum([x * y for x, y in zip(n_sold, median)]) / np.nansum(


In [32]:
def score(
    property_df,
    school_duration,
    school_distance,
    park_duration,
    park_distance,
    shop_duration,
    shop_distance,
    train_duration,
    train_distance,
    stop_duration,
    stop_distance,
    median_rent,
    affordable=False,
):
    """Calculate liveability score"""

    if school_duration == 0:
        school_duration_score = 100
    else:
        school_duration_score = percentileofscore(
            property_df.school_duration.dropna(), school_duration
        )
    if school_distance == 0:
        school_distance_score = 100
    else:
        school_distance_score = percentileofscore(
            property_df.school_distance.dropna(), school_distance
        )
    if park_duration == 0:
        park_duration_score = 100
    else:
        park_duration_score = percentileofscore(
            property_df.park_duration.dropna(), park_duration
        )
    if park_distance == 0:
        park_distace_score = 100
    else:
        park_distace_score = percentileofscore(property_df.park_distance.dropna(), park_distance)
    if shop_duration == 0:
        shop_duration_score = 100
    else:
        shop_duration_score = percentileofscore(
            property_df.shop_duration.dropna(), shop_duration
        )
    if shop_distance == 0:
        shop_distance_score = 100
    else:
        shop_distance_score = percentileofscore(
            property_df.shop_distance.dropna(), shop_distance
        )

    if train_duration == 0:
        train_duration_score = 100
    else:
        train_duration_score = percentileofscore(
            property_df.train_duration.dropna(), train_duration
        )
    if train_distance == 0:
        train_distance_score = 100
    else:
        train_distance_score = percentileofscore(
            property_df.train_distance.dropna(), train_distance
        )
    if stop_duration == 0:
        stop_duration_score = 100
    else:
        stop_duration_score = percentileofscore(
            property_df.stop_duration.dropna(), stop_duration
        )
    if stop_distance == 0:
        stop_distance_score = 100
    else:
        stop_distance_score = percentileofscore(
            property_df.stop_distance.dropna(), stop_distance
        )
    if affordable:
        median_rent_score = 4.2 * percentileofscore(
            property_df.median_rent.dropna(), median_rent
        )
    else:
        median_rent_score = 0
    score = (
        school_distance_score
        + park_duration_score
        + park_distace_score
        + shop_duration_score
        + shop_distance_score
        + shop_distance_score
        + train_duration_score
        + train_distance_score
        + stop_duration_score
        + stop_distance_score
        + median_rent_score
    )
    return score


In [33]:
#Calculates the scores based only on liveability

property_df["score"] = property_df.apply(
    lambda x: score(
        property_df,
        x["school_duration"],
        x["school_distance"],
        x["park_duration"],
        x["park_distance"],
        x["shop_duration"],
        x["shop_distance"],
        x["train_duration"],
        x["train_distance"],
        x["stop_duration"],
        x["stop_distance"],
        x["median_rent"],
    ),
    axis=1,
)

property_df = property_df.sort_values("score")
# property_df.to_csv("../data/curated/q3_live.csv")


In [34]:
property_df

Unnamed: 0,postcode,school_duration,school_distance,park_duration,park_distance,shop_duration,shop_distance,train_duration,train_distance,stop_duration,stop_distance,median_rent,score
46,3056,1.359491,676.466667,0.798727,384.364583,2.648714,1380.658268,1.760833,929.433803,0.523928,250.825175,529.145234,95.354432
2,3003,1.603516,683.605479,0.893009,330.647222,2.842005,1215.643750,1.884730,754.954054,0.678131,260.956757,453.409091,97.745593
113,3141,1.380418,661.163347,1.078079,449.739357,1.272043,581.782692,1.922553,875.685714,0.728816,299.569048,555.925926,104.490972
48,3058,1.406087,642.445217,1.176102,481.346610,2.296914,1110.974257,2.478305,1221.613445,0.582641,240.805932,495.157431,129.896982
115,3143,1.721623,830.121053,1.169188,437.061538,2.671869,1377.345455,1.562778,670.125641,0.540256,193.048718,624.411392,134.567666
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,3677,0.400000,200.400000,,,,,5.918333,2836.500000,0.286667,115.300000,390.606061,
225,3690,2.255667,1224.860000,,,,,9.121333,4896.100000,0.975667,467.840000,403.056962,
251,3844,2.477963,1145.866667,96.972593,126488.722222,,,4.300000,2375.733333,1.078889,362.988889,370.133333,
263,3936,,,,,,,,,,,494.963470,


In [35]:
property_df[["postcode", "median_rent"]].head()


Unnamed: 0,postcode,median_rent
46,3056,529.145234
2,3003,453.409091
113,3141,555.925926
48,3058,495.157431
115,3143,624.411392


In [36]:
property_df.score.median()

476.68981685032816

In [37]:
property_df.median_rent.median()

430.67174570669084

In [38]:
#Scores on both affordability and liveability
property_df["affordability_score"] = property_df.apply(
    lambda x: score(
        property_df,
        x["school_duration"],
        x["school_distance"],
        x["park_duration"],
        x["park_distance"],
        x["shop_duration"],
        x["shop_distance"],
        x["train_duration"],
        x["train_distance"],
        x["stop_duration"],
        x["stop_distance"],
        x["median_rent"],
        affordable=True,
    ),
    axis=1,
)

property_df = property_df.sort_values("affordability_score")
property_df[["postcode", "median_rent"]]
# property_df.to_csv("../data/curated/q3_live_afford.csv")


Unnamed: 0,postcode,median_rent
2,3003,453.409091
0,3000,429.475164
59,3072,445.799476
6,3011,433.937785
43,3053,419.141104
...,...,...
224,3677,390.606061
225,3690,403.056962
251,3844,370.133333
263,3936,494.963470


In [41]:
property_df.affordability_score.median()

694.4319901952657

In [40]:
# Visualisation
shape = gpd.read_file(
    "../data/raw/abs_data/zone_data/SA2_2021_AUST_GDA2020.shp")
shape = shape.loc[shape.STE_NAME21 == "Victoria"]
shape = shape.loc[shape.geometry != None]
shape["SA2_CODE21"] = pd.to_numeric(shape["SA2_CODE21"], errors="ignore")


In [16]:
#For purely liveable suburbs
map_data = pd.DataFrame(shape['SA2_NAME21'])
map_data['liveable'] = 0

map_data.loc[
    map_data["SA2_NAME21"].isin(
        [
            "Brunswick - South",
            "West Melbourne - Residential",
            "South Yarra - North",
            "Northcote",
            "Armadale",
        ]
    ),
    "liveable",
] = 1

In [17]:
gdf = gpd.GeoDataFrame(
    shape
)

geoJSON = gdf[['SA2_NAME21', 'geometry']].set_index('SA2_NAME21').to_json()
# geoJSON = gdf[['LocationID', 'geometry', 'zone_x']].drop_duplicates(
#     'LocationID').set_index('zone_x').to_json()

_map = folium.Map(location=[-37.840935,144.946457],
                  tiles="cartodbpositron", zoom_start=11)
# refer to the folium documentations on how to plot aggregated data.
_map.add_child(folium.Choropleth(
    geo_data=geoJSON,
    name="choropleth",
    data=map_data,
    columns=["SA2_NAME21", "liveable"],
    key_on="feature.id",
    fill_opacity=0.7,
    line_opacity=.1,
    fill_color='Blues',))
_map

_map.save("../plots/q3_liveable.html")


In [18]:
# For liveable and affordable suburbs
map_data = pd.DataFrame(shape["SA2_NAME21"])
map_data["liveable"] = 0

map_data.loc[
    map_data["SA2_NAME21"].isin(
        [
            "Melbourne CBD - West",
            "Melbourne CBD - East",
            "West Melbourne - Residential",
            "Noble Park - East",
            "Footscray",
            "St Albans - South",
        ]
    ),
    "liveable",
] = 1


In [19]:
gdf = gpd.GeoDataFrame(
    shape
)

# geoJSON = gdf[['LocationID', 'geometry', 'zone_x']].drop_duplicates(
#     'LocationID').set_index('zone_x').to_json()

_map = folium.Map(location=[-37.840935,144.946457],
                  tiles="cartodbpositron", zoom_start=11)
# refer to the folium documentations on how to plot aggregated data.
_map.add_child(folium.Choropleth(
    geo_data=geoJSON,
    name="choropleth",
    data=map_data,
    columns=["SA2_NAME21", "liveable"],
    key_on="feature.id",
    fill_opacity=0.7,
    line_opacity=.1,
    fill_color='Blues',))
_map

_map.save("../plots/q3_liveable_affordable.html")
