# SA2 districts for features of interest
- Add the 2021 SA2 code to each property record and each feature-of-interest record, based on its longitude/latitude

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import os
import glob
import re

In [3]:

def add_sa2(data, sf, name):
    """Attach the 2021 SA2 code to each row of ``data`` and export the result as CSV.

    Every (longitude, latitude) pair is tested against the polygons in ``sf``;
    the code of the first polygon that contains the point is stored in a new
    ``SA2_CODE`` column. Rows with missing coordinates, and rows that fall
    inside no polygon, are dropped. The result is written to
    ``../../data/curated/property_all_with_SA2/<name>_property_with_SA2.csv``.

    NOTE: this notebook defines ``add_sa2`` a second time further down with a
    different output path; whichever definition was executed last is the one
    that gets called.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``longitude`` and ``latitude`` columns convertible to float.
    sf : geopandas.GeoDataFrame
        Must contain ``SA2_CODE21`` and ``geometry`` columns; every geometry
        must provide a ``contains`` method.
    name : str
        Prefix used for the output file name.

    Returns
    -------
    pandas.DataFrame
        The rows that were matched to an SA2 area, with ``SA2_CODE`` as int.
    """
    # dropna + astype produce a fresh frame, so adding a column below never
    # writes into a view of the caller's DataFrame.
    data = (
        data.dropna(subset=['longitude', 'latitude'])
        .astype({'longitude': 'float', 'latitude': 'float'})
    )

    print("Estimated processing size:", len(sf) * len(data))

    # Extract the (code, polygon) pairs once instead of rebuilding every
    # shapefile row for every point.
    areas = list(zip(sf["SA2_CODE21"], sf["geometry"]))

    # Allocate SA2 code based on the coordinates. Codes are collected by
    # position, so duplicated index labels cannot overwrite one another.
    codes = []
    for lon, lat in zip(data["longitude"], data["latitude"]):
        point = Point(lon, lat)
        # First containing polygon wins; NaN marks "no polygon found".
        codes.append(next((code for code, geo in areas if geo.contains(point)), np.nan))
    data['SA2_CODE'] = np.array(codes, dtype=float)

    # Show data loss
    matched = data.dropna(subset=['SA2_CODE'])
    len_data = len(data)
    len_result = len(matched)
    print("Original size:", len_data, "=> Result size:", len_result)
    print("Loss:", len_data - len_result)

    # Convert SA2 code to integer (astype returns a new frame, so no chained assignment)
    data = matched.astype({'SA2_CODE': int})

    # Export as csv
    data.to_csv("../../data/curated/property_all_with_SA2/"+name+"_property_with_SA2.csv")

    return data

In [4]:
# Read SA2-Geolocation data (shape file)
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files
sf = gpd.read_file("../../data/raw/Geo/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp")

# Slice the geolocation for Victoria (state code '2'), keeping only the SA2 code
# and its boundary, and discard areas that have no geometry.
COL_SF = ["SA2_CODE21", "geometry"]
sf = sf.loc[sf["STE_CODE21"] == '2', COL_SF].dropna(subset=['geometry'])

# Reproject the whole frame to longitude/latitude so it lines up with the
# coordinates in the property / facility data. to_crs returns a new frame, which
# keeps the CRS metadata in sync and makes the column assignment below safe
# (no write into a filtered slice).
sf = sf.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf["SA2_CODE21"] = sf["SA2_CODE21"].astype(int)
sf

Unnamed: 0,SA2_CODE21,geometry
644,201011001,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,201011006,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,201011007,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
...,...,...
1161,217031476,"MULTIPOLYGON (((143.40263 -38.78152, 143.40252..."
1162,217041477,"POLYGON ((142.41438 -38.09303, 142.41400 -38.0..."
1163,217041478,"MULTIPOLYGON (((142.00870 -38.41715, 142.00876..."
1164,217041479,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."


In [1]:
# Run this cell only after History_prepro.ipynb has been run: the input files and
# their directory are deleted later in this notebook to save space.
path = "../../data/curated/property_all_no_outlier/*.csv"

# All outlier-free property CSV files, in sorted order
property_all_lst = sorted(glob.glob(path))
property_all_lst


NameError: name 'glob' is not defined

In [41]:
# Pattern for the label embedded in each property file name (printed as e.g. 2006)
regex = r'\d+\w\d+'

for csv_path in property_all_lst:
    # Search the file name only, so digits in a parent directory can never be
    # mistaken for the file's label.
    year = re.findall(regex, os.path.basename(csv_path))[0]
    print(year)
    # Read Rent Data
    data = pd.read_csv(csv_path)
    add_sa2(data, sf, year)

2006
Estimated processing size: 267264
Original size: 512 => Result size: 512
Loss: 0
2007
Estimated processing size: 2418426
Original size: 4633 => Result size: 4632
Loss: 1
2008
Estimated processing size: 2816712
Original size: 5396 => Result size: 5376
Loss: 20
2009
Estimated processing size: 2650716
Original size: 5078 => Result size: 5058
Loss: 20
2010
Estimated processing size: 3201426
Original size: 6133 => Result size: 6081
Loss: 52
2011
Estimated processing size: 4530960
Original size: 8680 => Result size: 8558
Loss: 122
2012
Estimated processing size: 5698674
Original size: 10917 => Result size: 10830
Loss: 87
2013
Estimated processing size: 6039018
Original size: 11569 => Result size: 11466
Loss: 103
2014
Estimated processing size: 6544836
Original size: 12538 => Result size: 12489
Loss: 49
2015
Estimated processing size: 6748416
Original size: 12928 => Result size: 12782
Loss: 146
2016
Estimated processing size: 7968852
Original size: 15266 => Result size: 15119
Loss: 147
2

In [None]:
import shutil

# Remove the outlier-free property files to save space.
# The guard keeps this cell re-runnable: shutil.rmtree raises if the directory
# has already been deleted by an earlier run.
no_outlier_path = '../../data/curated/property_all_no_outlier'
if os.path.isdir(no_outlier_path):
    shutil.rmtree(no_outlier_path)

### Adding SA2 for Facilities (Parks, Primary and Secondary Schools, Train Stations, Hospitals, Markets, Police Stations, Shopping Centres)

In [9]:
# Root folder; each facility type has its own sub-folder of CSV files
parent_path = '../../data/curated/features_of_interst/'
child_path = ['park', 'primary', 'secondary', 'train_station','hospital','market','police','shopping']

# One glob pattern per facility sub-folder
pathlst = [f"{parent_path}{child}/*.csv" for child in child_path]

# Every CSV matched by those patterns, in sorted order
lst = sorted(fname for pattern in pathlst for fname in glob.glob(pattern))

# Keep only the files whose path does not contain 'before'
facility_all_lst = [fname for fname in lst if 'before' not in fname]

['../../data/curated/features_of_interst/park/park_2013.csv',
 '../../data/curated/features_of_interst/park/park_2014.csv',
 '../../data/curated/features_of_interst/park/park_2015.csv',
 '../../data/curated/features_of_interst/park/park_2016.csv',
 '../../data/curated/features_of_interst/park/park_2017.csv',
 '../../data/curated/features_of_interst/park/park_2018.csv',
 '../../data/curated/features_of_interst/park/park_2019.csv',
 '../../data/curated/features_of_interst/park/park_2020.csv',
 '../../data/curated/features_of_interst/park/park_2021.csv',
 '../../data/curated/features_of_interst/primary/primary_2013.csv',
 '../../data/curated/features_of_interst/primary/primary_2014.csv',
 '../../data/curated/features_of_interst/primary/primary_2015.csv',
 '../../data/curated/features_of_interst/primary/primary_2016.csv',
 '../../data/curated/features_of_interst/primary/primary_2017.csv',
 '../../data/curated/features_of_interst/primary/primary_2018.csv',
 '../../data/curated/features_of_i

In [4]:

def add_sa2(data, sf, name):
    """Attach the 2021 SA2 code to each row of ``data`` and export the result as CSV.

    Every (longitude, latitude) pair is tested against the polygons in ``sf``;
    the code of the first polygon that contains the point is stored in a new
    ``SA2_CODE`` column. Rows with missing coordinates, and rows that fall
    inside no polygon, are dropped. The result is written to
    ``../../data/curated/features_of_interst/<name>_with_SA2.csv``.

    NOTE: this redefines the ``add_sa2`` declared earlier in the notebook; the
    two differ in their output path.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``longitude`` and ``latitude`` columns convertible to float.
    sf : geopandas.GeoDataFrame
        Must contain ``SA2_CODE21`` and ``geometry`` columns; every geometry
        must provide a ``contains`` method.
    name : str
        Prefix used for the output file name.

    Returns
    -------
    pandas.DataFrame
        The rows that were matched to an SA2 area, with ``SA2_CODE`` as int.
    """
    # dropna + astype produce a fresh frame, so adding a column below never
    # writes into a view of the caller's DataFrame.
    data = (
        data.dropna(subset=['longitude', 'latitude'])
        .astype({'longitude': 'float', 'latitude': 'float'})
    )

    print("Estimated processing size:", len(sf) * len(data))

    # Extract the (code, polygon) pairs once instead of rebuilding every
    # shapefile row for every point.
    areas = list(zip(sf["SA2_CODE21"], sf["geometry"]))

    # Allocate SA2 code based on the coordinates. Codes are collected by
    # position, so duplicated index labels cannot overwrite one another.
    codes = []
    for lon, lat in zip(data["longitude"], data["latitude"]):
        point = Point(lon, lat)
        # First containing polygon wins; NaN marks "no polygon found".
        codes.append(next((code for code, geo in areas if geo.contains(point)), np.nan))
    data['SA2_CODE'] = np.array(codes, dtype=float)

    # Show data loss
    matched = data.dropna(subset=['SA2_CODE'])
    len_data = len(data)
    len_result = len(matched)
    print("Original size:", len_data, "=> Result size:", len_result)
    print("Loss:", len_data - len_result)

    # Convert SA2 code to integer (astype returns a new frame, so no chained assignment)
    data = matched.astype({'SA2_CODE': int})

    # Export as csv
    data.to_csv("../../data/curated/features_of_interst/"+name+"_with_SA2.csv")

    return data

In [16]:
# "<place type>_<year>" label embedded in each facility file name, e.g. park_2013
regex = r'\w+_\d+'

for csv_path in facility_all_lst:
    # Search the file name only, so underscores/digits in a parent directory
    # can never be mistaken for the file's label.
    name = re.findall(regex, os.path.basename(csv_path))[0]
    # Split on the last underscore rather than slicing a fixed 5 characters,
    # so the place type is right whatever the length of the year part.
    place_type, year = name.rsplit('_', 1)
    print(name)
    # Read facility data and normalise the coordinate column names expected by add_sa2
    data = pd.read_csv(csv_path)
    data = data.rename(columns={'Longitude': 'longitude', 'Latitude':'latitude'})
    data['year'] = year
    data['place_type'] = place_type
    add_sa2(data, sf, name)

primary_2013
Estimated processing size: 998586
Original size: 1913 => Result size: 1913
Loss: 0
primary_2014
Estimated processing size: 1299258
Original size: 2489 => Result size: 2489
Loss: 0
primary_2015
Estimated processing size: 1300824
Original size: 2492 => Result size: 2492
Loss: 0
primary_2016
Estimated processing size: 1301868
Original size: 2494 => Result size: 2494
Loss: 0
primary_2017
Estimated processing size: 1306566
Original size: 2503 => Result size: 2503
Loss: 0
primary_2018
Estimated processing size: 1308654
Original size: 2507 => Result size: 2507
Loss: 0
primary_2019
Estimated processing size: 1313352
Original size: 2516 => Result size: 2516
Loss: 0
primary_2020
Estimated processing size: 1319094
Original size: 2527 => Result size: 2527
Loss: 0
primary_2021
Estimated processing size: 1329534
Original size: 2547 => Result size: 2547
Loss: 0
secondary_2013
Estimated processing size: 447354
Original size: 857 => Result size: 857
Loss: 0
secondary_2014
Estimated process

In [40]:
added_sa2_path1 = '../../data/curated/features_of_interst/*.csv'
combined_path = "../../data/curated/features_of_interst/place_all_with_sa2.csv"

# Per-facility files to merge. The merged output lives in the same folder and
# matches the same glob, so it is excluded explicitly: otherwise a re-run would
# read it back in and then delete it in the cleanup loop below.
# Sorting makes the row order of the merged file deterministic.
file_lst2 = sorted(
    fname for fname in glob.glob(added_sa2_path1)
    if os.path.basename(fname) != os.path.basename(combined_path)
)

if file_lst2:
    facility_all_with_sa2 = pd.concat([pd.read_csv(f) for f in file_lst2])
    facility_all_with_sa2.to_csv(combined_path, index=False, encoding='utf-8-sig')

    # Remove the per-facility files only after the merged file has been written
    for fname in file_lst2:
        os.remove(fname)
else:
    # pd.concat raises on an empty list; nothing to merge means nothing to do
    print("No per-facility SA2 files to merge; leaving", combined_path, "untouched")

In [None]:
# Disabled cleanup: the code below is wrapped in a string literal, so it is never executed.
# NOTE(review): pathlst holds glob patterns ending in "/*.csv" rather than directory
# paths, so shutil.rmtree(path) would presumably fail if this were re-enabled as
# written — confirm the intended target before enabling.
'''import shutil
for path in pathlst:
shutil.rmtree(path)'''

In [None]:
# import folium
# import numpy as np
# 
# # make geometry as JSON type
# geoJSON = sf['geometry'].to_json()

In [None]:
# # Map whole SA2 area
# _map = folium.Map(location=[-37, 144], tiles="Stamen Terrain", zoom_start=10)
# 
# _map.add_child(folium.Choropleth(
#     geo_data=geoJSON,
#     name='SA2 Area',
# ))
# 
# _map.save('../../plots/SA2_Map.html')
# _map

In [None]:
# # Point rent data in the map
# 
# data_s = data.dropna(subset=['latitude', 'longitude'])
# 
# id_data = data_s['id']
# latitude_data = data_s['latitude']
# longitude_data = data_s['longitude']
# 
# # plot points out of geolocation
# for id, lati, long in zip(id_data, latitude_data, longitude_data):
#     _map.add_child(
#         folium.Marker(location=[lati, long], popup=str(id))
#     )
# 
# _map.save('../../plots/rentalData_in_SA2Location.html')
# _map