In [4]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import os
import glob
import re

In [14]:
def add_sa2(data, sf, name):
    """add 2016 SA2 CODE based on the geo location"""
    
    data = data.dropna(subset=['longitude_ori', 'latitude_ori'])
    #if 
    data['SA2_CODE_2016'] = np.nan
    data = data.astype({'longitude_ori': 'float', 'latitude_ori': 'float'})

    print("Estimated processing size:", len(sf) * len(data))

    # Allocate SA2 code based on the coordinates from rent data
    for index, row in data.iterrows():
        for index_area, row_area in sf.iterrows():
            geo = row_area["geometry"]
            fit = False         
            fit = geo.contains(Point(row["longitude_ori"], row["latitude_ori"]))

            if fit:
                data.loc[index,'SA2_CODE_2016'] = row_area["SA2_MAIN16"]
                # print(row['id'], ":", row_area["SA2_CODE21"])
                break

    # Show data loss
    len_data = len(data)
    len_result = len(data.dropna(subset=['SA2_CODE_2016']))
    print("Original size:", len_data, "=> Result size:", len_result)
    print("Loss:", len_data - len_result)

    # Convert SA2 code as Interger
    data = data.dropna(subset=['SA2_CODE_2016'])
    data['SA2_CODE'] = data["SA2_CODE"].astype(int)

    # Export as csv
    data.to_csv("../../data/distanceWithSA2/"+name+"_property_with_SA22016.csv")

In [15]:
# Read SA2-Geolocation data (shape file)
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files
sf = gpd.read_file("../../data/raw/Geo/1270055001_mb_2016_vic_shape 2/MB_2016_VIC.shp")
# Slice the geolocation for Victoria
COL_SF = ["SA2_MAIN16", "geometry"]
sf = sf.loc[sf["STE_CODE16"] == '2']
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf = sf[COL_SF]
sf = sf.dropna(subset=['geometry'])
sf["SA2_MAIN16"] = sf["SA2_MAIN16"].astype(int)
sf

Unnamed: 0,SA2_MAIN16,geometry
1,204031069,"POLYGON ((147.14253 -36.69219, 147.14202 -36.6..."
2,204031069,"POLYGON ((146.95935 -36.72779, 146.95966 -36.7..."
3,204031069,"POLYGON ((146.96137 -36.72686, 146.96271 -36.7..."
4,204031069,"POLYGON ((146.96105 -36.72821, 146.96051 -36.7..."
5,204031069,"POLYGON ((146.95908 -36.72687, 146.95925 -36.7..."
...,...,...
85002,213051365,"POLYGON ((144.66607 -37.83603, 144.66389 -37.8..."
85003,213051365,"POLYGON ((144.66607 -37.83603, 144.66666 -37.8..."
85004,213051365,"POLYGON ((144.66762 -37.83618, 144.66778 -37.8..."
85005,213051366,"POLYGON ((144.73143 -37.86130, 144.73142 -37.8..."


In [16]:
# this part should be run after running History_prepro.ipynb, files and directory will be deleted after for space saving
path = "../../data/distance/2013_min_distance.csv"
property_all_lst = []
for fname in glob.glob(path):
    property_all_lst.append(fname)
property_all_lst = sorted(property_all_lst)
property_all_lst


['../../data/distance/2013_min_distance.csv']

In [17]:
for path in property_all_lst:
    
    regex = r'\d+\w\d+'
    year = re.findall(regex, path)[0]
    print(year)
    # Read Rent Data
    data = pd.read_csv(path)
    #print(data.longitude.sum())
    add_sa2(data, sf, year)

2013
Estimated processing size: 2949878212


KeyboardInterrupt: 