## Assign Suburbs to each property 
This notebook assigns each property to a Statistical Area Level 2 (SA2) region, used here as a proxy for its suburb, based on the property's longitude and latitude.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
import geopandas as gpd

In [2]:
# Load the property data.
property_csv = "../data/raw/full_property_data.csv"
df = pd.read_csv(property_csv)

# Load the SA2 boundary shapefile ("sf" stands for shape file) and preview it.
sa2_shapefile = "../data/raw/shapefiles/Statistical_area_level2/SA2_2021_AUST_GDA2020.shp"
australia_sf = gpd.read_file(sa2_shapefile)
australia_sf.head(3)

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21899 -35.36738, 149.21800 -35.3..."
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."


In [3]:
# Separate out the SA2 regions that are in Victoria and drop every row that
# contains a null in any column.
# .dropna() is chained onto the filter so a new frame is produced; calling
# dropna(inplace=True) on the filtered slice is what raised pandas'
# SettingWithCopyWarning.
vic_sf = australia_sf[australia_sf["STE_NAME21"] == "Victoria"].dropna()
vic_sf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vic_sf.dropna(inplace=True)


Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
644,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,52.7109,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,12.3787,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,51.5855,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,34.1607,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,104.7274,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."


In [4]:
def convert_to_point(row):
    """Convert a row's coordinates into a point that is recognised by Shapely.

    Parameters
    ----------
    row : mapping-like
        Must provide "latitude" and "longitude" entries that ``float()`` accepts.

    Returns
    -------
    Point
        Built as ``Point(longitude, latitude)``, i.e. longitude is the first
        (x) coordinate and latitude the second (y).
    """
    # Latitude is read first, then longitude; the order is swapped for Point.
    lat, lon = (float(row[key]) for key in ("latitude", "longitude"))
    return Point(lon, lat)

def find_zone(point):
    """Return the SA2 code of the first Victorian region that contains ``point``.

    Walks the module-level ``vic_sf`` frame in row order and tests the point
    against each region's geometry with ``point.within(...)``.

    Parameters
    ----------
    point : shapely Point (or any object exposing ``within(geometry)``)
        The location to look up.

    Returns
    -------
    The ``SA2_CODE21`` value of the first matching region, or ``None`` when the
    point lies within none of them.
    """
    # Iterate the two needed columns directly. This avoids building a
    # reset-index copy of vic_sf on every call and the per-row Series that
    # iterrows() creates, which matters because this runs once per property.
    for code, region in zip(vic_sf["SA2_CODE21"], vic_sf["geometry"]):
        if point.within(region):
            return code
    return None

In [5]:
# Apply our functions to locate the relevant SA2 region for each property.
# convert_to_point needs both coordinate columns, so it is applied row-wise.
df["point"] = df.apply(convert_to_point, axis=1)
# find_zone only needs the point, so apply it to that column directly rather
# than building a full row Series per property just to read one field.
df["LocID"] = df["point"].apply(find_zone)

  arr = construct_1d_object_array_from_listlike(values)


In [6]:
# Remove the columns we no longer need, then remove any rows that still do not
# have a location (LocID) associated with them.
# The result is rebound to df instead of mutating in place, and
# errors="ignore" skips columns that are already gone, so re-running this cell
# does not raise a KeyError. subset is given as a list because older pandas
# versions do not accept a bare string there.
unwanted_columns = ["latitude", "longitude", "point", "Unnamed: 0"]
df = (
    df.drop(columns=unwanted_columns, errors="ignore")
    .dropna(subset=["LocID"])
)


In [7]:
# Save our data. index=True is the pandas default; it is spelled out here
# because it writes the row index as an extra leading column in the CSV.
filename = "../data/raw/full_property_zones.csv"
df.to_csv(filename, index=True)

In [8]:
# Check our counts: df.count() reports the number of non-null values per column.
# Rows without a LocID were dropped above, so the LocID count equals the number
# of rows; any column showing a lower count still contains missing values.
df.count()

index            11002
name             11002
cost_text        11002
Bed              11002
Bath             11002
Park             11002
property_type    11002
desc_head        10982
LocID            11002
dtype: int64