In [85]:
# imports
import geopandas as gpd
import pandas as pd

In [86]:
# Load the raw inputs: the property listings (with their zone/LocID) and the
# SA2-level income and population table.
income_and_pop = pd.read_csv(filepath_or_buffer="../data/raw/income_and_pop.csv")
property_df = pd.read_csv(filepath_or_buffer="../data/raw/full_property_zones.csv")

In [87]:
# Left join (not a full outer join): we only care about rows present in the
# property data, so every property is kept and ends up either
#   - with income/population data attached, or
#   - without income/population data (those columns are left as NaN).
# Income/population rows that match no property are intentionally dropped.
df = pd.merge(
    left=property_df,
    right=income_and_pop,
    left_on='LocID',
    right_on='SA2_CODE_2021',
    how='left',
)

In [88]:
# Build a GeoDataFrame of the SA2 regions from the income/population table.
# (geopandas is imported as `gpd` in the imports cell at the top of the notebook.)
#
# Steps, expressed as a single expression so the cell is idempotent and never
# mutates `income_and_pop`:
#   1. drop rows that have no geometry (they cannot be parsed),
#   2. replace the WKT strings with shapely geometries,
#   3. wrap the result in a GeoDataFrame with "geometry" as the active geometry column.
#
# NOTE(review): no CRS is set here; confirm the coordinate reference system of the
# WKT polygons before doing any distance calculations.
df_centroid = gpd.GeoDataFrame(
    income_and_pop
    .dropna(subset=["geometry"])
    .assign(geometry=lambda sa2: gpd.GeoSeries.from_wkt(sa2["geometry"])),
    geometry="geometry",
)

In [89]:
# get centroids
def _lat_lon_of_centroid(shape):
    """Return the centroid of a shapely geometry as a (y, x) tuple, i.e. (latitude, longitude)."""
    centre = shape.centroid
    return (centre.y, centre.x)


df_centroid['centroid'] = df_centroid['geometry'].apply(_lat_lon_of_centroid)
df_centroid.head(1)

Unnamed: 0,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,201011001,865,370,2218,1952,13320,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",Alfredton,"(-37.54173636281507, 143.749330252453)"


We need the coordinates of each property so that a property without a matching suburb can be linked to the nearest one, based on the distance to each suburb's centroid.

In [90]:
# Load the full property listings; unlike the zones file, this one carries the
# latitude/longitude of every listing.
property_df2 = pd.read_csv(filepath_or_buffer="../data/raw/full_property_data.csv")
property_df2.head(n=1)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,index,name,cost_text,Bed,Bath,Park,property_type,desc_head
0,0,-37.865018,144.974682,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar


This is the dataset we need: it contains the `latitude` and `longitude` of each property.

In [91]:
# Attach each listing's coordinates to property_df, matching on the listing URL
# stored in the 'index' column.
#
# - drop_duplicates: keep one coordinate row per listing so the left join cannot
#   multiply rows in property_df.
# - drop(..., errors='ignore'): remove coordinates added by a previous run of this
#   cell, so re-running it does not create longitude_x / longitude_y columns.
property_coords = (
    property_df2[['index', 'longitude', 'latitude']]
    .drop_duplicates(subset='index')
)
property_df = (
    property_df
    .drop(columns=['longitude', 'latitude'], errors='ignore')
    .merge(property_coords, on='index', how='left')
)

In [92]:
# Left-join the SA2 attributes (now including geometry and centroid) onto every
# property, matching the property's LocID against SA2_CODE_2021. Properties
# with no matching SA2 row keep NaN in all of the SA2 columns.
df = pd.merge(
    left=property_df,
    right=df_centroid,
    left_on='LocID',
    right_on='SA2_CODE_2021',
    how='left',
)
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514,...,-37.865018,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,"(-37.861344917872316, 144.9762676334905)"
1,1,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,4500.0,3,2,2,Apartment / Unit / Flat,arguabl captiv penthous,206041503,...,-37.814172,206041503.0,884.0,375.0,2248.0,1511.0,10533.0,"POLYGON ((144.96534 -37.81137, 144.96576 -37.8...",Melbourne CBD - East,"(-37.81278701743976, 144.96859287374974)"
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,,,


In [93]:
# keep only the properties whose SA2 join found no match (missing SA2 code)
no_sa2_match = df["SA2_CODE_2021"].isna()
df_missing = df.loc[no_sa2_match].copy()

# build a POINT(longitude latitude) for each property and use it as the geometry
property_points = gpd.points_from_xy(x=df_missing["longitude"], y=df_missing["latitude"])
df_missing = gpd.GeoDataFrame(df_missing, geometry=property_points)
df_missing.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,POINT (144.95690 -37.83012),,


In [None]:
# Now that we have the location coordinate of each property, impute all the missing values from the closest centroid:
# 1: compute the distance between each point in df_missing["geometry"] and every value in df_centroid["centroid"]
#    (note: the points are built as (longitude, latitude) while the centroid tuples are (latitude, longitude))
# 2: for each property, pick the centroid with the smallest distance
# 3: once found, impute the SA2 code of that centroid, along with the rest of its values in df_centroid, into df
# 4: voila