In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import os

In [28]:
# Directory that receives the per-year "<year>_property_with_SA2.csv" exports.
new_path = '../../data/curated/min_distance_sa2/'
# exist_ok=True creates the directory (and any missing parents) when absent and
# is a no-op when it already exists, so there is no gap between a separate
# existence check and the creation in which another process could create it.
os.makedirs(new_path, exist_ok=True)

In [31]:
def add_sa2(data, sf, name):
    """Attach the 2016 SA2 code to each property by point-in-polygon lookup and export to CSV.

    Rows missing either coordinate are dropped first. Every remaining point is
    tested against the polygons of ``sf`` in row order and receives the code of
    the first polygon that contains it. Rows that fall inside no polygon are
    reported as loss and dropped. The result is written to
    ``new_path + name + "_property_with_SA2.csv"``, where ``new_path`` is the
    module-level output directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``longitude_ori`` and ``latitude_ori`` columns that can be
        cast to float. The caller's frame is not modified.
    sf : geopandas.GeoDataFrame
        Must contain a ``geometry`` column (objects exposing ``contains``) and
        an ``SA2_MAIN16`` column of integer-castable codes.
    name : str
        Prefix of the output file name.

    Returns
    -------
    pandas.DataFrame
        The matched rows with an integer ``SA2_CODE_2016`` column, i.e. the
        same frame that is written to disk.
    """
    # Explicit copy: the column assignment further down must act on an
    # independent frame, not on a slice derived from the caller's frame.
    data = data.dropna(subset=['longitude_ori', 'latitude_ori']).copy()
    data = data.astype({'longitude_ori': 'float', 'latitude_ori': 'float'})

    print("Estimated processing size:", len(sf) * len(data))

    # Extract the polygons and their codes once. Calling sf.iterrows() inside
    # the per-property loop would rebuild a Series for every polygon on every
    # property, which is loop-invariant work.
    areas = list(zip(sf["geometry"], sf["SA2_MAIN16"]))

    # Allocate the SA2 code from the property coordinates: first containing
    # polygon wins, NaN when no polygon contains the point.
    codes = []
    for lon, lat in zip(data['longitude_ori'], data['latitude_ori']):
        point = Point(lon, lat)
        codes.append(
            next((code for geo, code in areas if geo.contains(point)), np.nan)
        )
    # Float dtype so unmatched rows can hold NaN until they are dropped below.
    data['SA2_CODE_2016'] = pd.Series(codes, index=data.index, dtype='float64')

    # Show data loss
    len_data = len(data)
    data = data.dropna(subset=['SA2_CODE_2016'])
    len_result = len(data)
    print("Original size:", len_data, "=> Result size:", len_result)
    print("Loss:", len_data - len_result)

    # Convert the SA2 code to integer (astype returns a new frame, so no
    # assignment onto the dropna result is needed).
    data = data.astype({'SA2_CODE_2016': int})

    # Export as csv
    data.to_csv(new_path+name+"_property_with_SA2.csv")
    return data

In [32]:
# Read SA2-Geolocation data (shape file)
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files
# NOTE(review): the URL above names ASGS Edition 3 (Jul 2021 - Jun 2026) while
# the file read below is the 2016 SA2 boundary set - confirm the download source.
sf = gpd.read_file("../../data/raw/Geo/1270055001_sa2_2016_aust_shape/SA2_2016_AUST.shp")
# Slice the geolocation for Victoria
COL_SF = ["SA2_MAIN16", "geometry"]
# .copy() makes the filtered rows an independent frame, so the column
# assignment on the next line is not a write onto a slice of the full table.
sf = sf.loc[sf["STE_CODE16"] == '2'].copy()
# Re-project the boundaries to WGS84 longitude/latitude.
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
# Keep only the code and geometry columns, drop rows without a geometry, and
# copy again before the dtype assignment below.
sf = sf[COL_SF].dropna(subset=['geometry']).copy()
sf["SA2_MAIN16"] = sf["SA2_MAIN16"].astype(int)
sf

Unnamed: 0,SA2_MAIN16,geometry
578,201011001,"POLYGON ((143.70477 -37.51934, 143.70483 -37.5..."
579,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
580,201011003,"POLYGON ((143.85014 -37.54246, 143.85012 -37.5..."
581,201011004,"POLYGON ((143.82821 -37.57557, 143.82840 -37.5..."
582,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
...,...,...
1035,217031476,"MULTIPOLYGON (((143.40263 -38.78152, 143.40252..."
1036,217041477,"POLYGON ((142.41438 -38.09303, 142.41400 -38.0..."
1037,217041478,"MULTIPOLYGON (((142.00870 -38.41715, 142.00876..."
1038,217041479,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."


In [33]:
# Location and common suffix of the per-year input files ("<year>_min_distance.csv").
path = '../../data/curated/min_distance/'
filename = '_min_distance.csv'

# Process one file per year, 2013 through 2022 inclusive; add_sa2 writes the
# matching "<year>_property_with_SA2.csv" for each.
for y in range(2013, 2023):
    year_label = str(y)
    rent = pd.read_csv(f"{path}{year_label}{filename}")
    add_sa2(rent, sf, year_label)

Estimated processing size: 5053818
Original size: 10939 => Result size: 10939
Loss: 0
Estimated processing size: 5584656
Original size: 12088 => Result size: 12088
Loss: 0
Estimated processing size: 5770380
Original size: 12490 => Result size: 12490
Loss: 0
Estimated processing size: 6814962
Original size: 14751 => Result size: 14751
Loss: 0
Estimated processing size: 7718172
Original size: 16706 => Result size: 16706
Loss: 0
Estimated processing size: 8784930
Original size: 19015 => Result size: 19015
Loss: 0
Estimated processing size: 9633162
Original size: 20851 => Result size: 20851
Loss: 0
Estimated processing size: 9423876
Original size: 20398 => Result size: 20398
Loss: 0
Estimated processing size: 11524590
Original size: 24945 => Result size: 24945
Loss: 0
Estimated processing size: 31626672
Original size: 68456 => Result size: 68456
Loss: 0
