In [1]:
import pandas as pd
import geopandas as gpd

In [37]:
import numpy as np
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

def add_sa2(data, sf, name):
    """Tag each property record with the SA2 area containing it and export a CSV.

    Rows without a longitude or latitude are dropped. Every remaining
    coordinate is tested against the polygons of ``sf`` in order, and the
    ``SA2_CODE21`` of the first polygon that contains the point is stored in
    a new ``SA2_CODE`` column. Rows that fall inside no polygon are reported
    as loss and dropped. The result is written to
    ``../../data/curated/property_all_with_SA2/<name>_property_with_SA2.csv``.

    Args:
        data: DataFrame with ``longitude`` and ``latitude`` columns whose
            values are convertible to float. The caller's frame is not
            modified.
        sf: Frame with a ``geometry`` column (objects exposing
            ``contains(point)``) and an ``SA2_CODE21`` column.
            NOTE(review): the coordinates in ``data`` are assumed to be in
            the same CRS as ``sf`` -- confirm against the caller.
        name: Prefix of the output file name (the notebook passes the year).

    Returns:
        The DataFrame that was exported, with ``SA2_CODE`` cast to int.
    """
    import os

    # .copy() gives an independent frame, so adding a column below cannot
    # trigger pandas' SettingWithCopyWarning.
    data = data.dropna(subset=['longitude', 'latitude']).copy()
    data = data.astype({'longitude': 'float', 'latitude': 'float'})

    print("Estimated processing size:", len(sf) * len(data))

    # Materialise the (polygon, code) pairs once instead of re-iterating the
    # frame with iterrows() for every property.
    areas = list(zip(sf["geometry"], sf["SA2_CODE21"]))

    # Allocate SA2 code based on the coordinates from rent data
    sa2_codes = []
    for longitude, latitude in zip(data["longitude"], data["latitude"]):
        # Build the point once per property rather than once per polygon.
        point = Point(longitude, latitude)
        matched_code = np.nan
        for geo, area_code in areas:
            if geo.contains(point):
                matched_code = area_code
                break
        sa2_codes.append(matched_code)
    data['SA2_CODE'] = sa2_codes

    # Show data loss
    len_data = len(data)
    data = data.dropna(subset=['SA2_CODE'])
    len_result = len(data)
    print("Original size:", len_data, "=> Result size:", len_result)
    print("Loss:", len_data - len_result)

    # Convert SA2 code to integer (astype returns a new frame)
    data = data.astype({'SA2_CODE': int})

    # Export as csv, creating the target directory if it does not exist yet
    out_dir = "../../data/curated/property_all_with_SA2/"
    os.makedirs(out_dir, exist_ok=True)
    data.to_csv(out_dir+name+"_property_with_SA2.csv")

    return data

In [24]:
# Read SA2-Geolocation data (shape file)
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files
sf = gpd.read_file("../../data/raw/Geo/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp")

# Slice the geolocation for Victoria (STE_CODE21 == '2').
# .copy() makes the slice an independent frame so the column assignment
# below does not raise SettingWithCopyWarning.
COL_SF = ["SA2_CODE21", "geometry"]
sf = sf.loc[sf["STE_CODE21"] == '2'].copy()

# Reproject the boundaries to WGS84 longitude/latitude.
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

# Keep only the columns used downstream and drop areas without a geometry;
# copy again before assigning the converted code column.
sf = sf[COL_SF]
sf = sf.dropna(subset=['geometry']).copy()
sf["SA2_CODE21"] = sf["SA2_CODE21"].astype(int)

In [38]:
import glob
import re
# Collect every outlier-filtered property CSV, in sorted (lexicographic) order.
path = "../../data/curated/property_all_no_outlier/*.csv"
property_all_lst = sorted(glob.glob(path))
property_all_lst


['../../data/curated/property_all_no_outlier/2006_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2007_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2008_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2009_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2010_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2011_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2012_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2013_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2014_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2015_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2016_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2017_property_no_outlier.csv',
 '../../data/curated/property_all_no_outlier/2018_property_no_outlier.csv',
 '../../data

In [41]:
# Tag every listed property file with SA2 codes; add_sa2 exports one CSV per file.
# The label is the first "digits, one word character, digits" run found in
# the path (e.g. "2006" for the 2006 file).
regex = r'\d+\w\d+'
for path in property_all_lst:
    year = re.findall(regex, path)[0]
    print(year)

    # Read Rent Data
    data = pd.read_csv(path)
    add_sa2(data, sf, year)
    
    

2006
Estimated processing size: 267264
Original size: 512 => Result size: 512
Loss: 0
2007
Estimated processing size: 2418426
Original size: 4633 => Result size: 4632
Loss: 1
2008
Estimated processing size: 2816712
Original size: 5396 => Result size: 5376
Loss: 20
2009
Estimated processing size: 2650716
Original size: 5078 => Result size: 5058
Loss: 20
2010
Estimated processing size: 3201426
Original size: 6133 => Result size: 6081
Loss: 52
2011
Estimated processing size: 4530960
Original size: 8680 => Result size: 8558
Loss: 122
2012
Estimated processing size: 5698674
Original size: 10917 => Result size: 10830
Loss: 87
2013
Estimated processing size: 6039018
Original size: 11569 => Result size: 11466
Loss: 103
2014
Estimated processing size: 6544836
Original size: 12538 => Result size: 12489
Loss: 49
2015
Estimated processing size: 6748416
Original size: 12928 => Result size: 12782
Loss: 146
2016
Estimated processing size: 7968852
Original size: 15266 => Result size: 15119
Loss: 147
2

In [None]:
import shutil
# Delete the intermediate outlier-filtered CSV directory.
# WARNING: destructive -- this is the directory the file-listing cell globs
# from, so those files must be regenerated before the notebook is re-run.
no_outlier_path = '../../data/curated/property_all_no_outlier'
try:
    shutil.rmtree(no_outlier_path)
except FileNotFoundError:
    # Directory already removed (e.g. this cell was run before): nothing to do.
    pass

In [None]:
# import folium
# import numpy as np
# 
# # Convert the geometry to a GeoJSON string
# geoJSON = sf['geometry'].to_json()

In [None]:
# # Map whole SA2 area
# _map = folium.Map(location=[-37, 144], tiles="Stamen Terrain", zoom_start=10)
# 
# _map.add_child(folium.Choropleth(
#     geo_data=geoJSON,
#     name='SA2 Area',
# ))
# 
# _map.save('../../plots/SA2_Map.html')
# _map

In [None]:
# # Plot the rent data points on the map
# 
# data_s = data.dropna(subset=['latitude', 'longitude'])
# 
# id_data = data_s['id']
# latitude_data = data_s['latitude']
# longitude_data = data_s['longitude']
# 
# # plot points out of geolocation
# for id, lati, long in zip(id_data, latitude_data, longitude_data):
#     _map.add_child(
#         folium.Marker(location=[lati, long], popup=str(id))
#     )
# 
# _map.save('../../plots/rentalData_in_SA2Location.html')
# _map