In [68]:
# imports
import geopandas as gpd
import pandas as pd

In [69]:
# read datasets
property_df = pd.read_csv("../data/raw/full_property_zones.csv")
income_and_pop = pd.read_csv("../data/raw/income_and_pop.csv")
australia_sf = gpd.read_file("../data/raw/shapefiles/Statistical_area_level2/SA2_2021_AUST_GDA2020.shp")
# seperate suburbs that are in Victoria
vic_sf = australia_sf[australia_sf['STE_NAME21'] == 'Victoria']

# drop the null location ID values
vic_sf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vic_sf.dropna(inplace=True)


In [70]:
df = property_df.merge(income_and_pop, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# shouldn't we do a left join instead of full outer join?
# we only care about the values in the property data, so the possible outcomes should be
# property data with income/pop data, or
# proprty data without income/pop data
# (we don't care about income/pop data without property data)

property_df.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514


In [71]:


df_centroid = income_and_pop.copy()

# replace object with shapely geometries
df_centroid = df_centroid.dropna(subset=["geometry"])
df_centroid['geometry'] = gpd.GeoSeries.from_wkt(df_centroid['geometry'])

df_centroid = gpd.GeoDataFrame(df_centroid)

In [72]:
# get centroids
df_centroid['centroid'] = df_centroid['geometry'].apply(lambda x: (x.centroid.y, x.centroid.x))
df_centroid.head(1)

Unnamed: 0,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,201011001,865,370,2218,1952,13320,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",Alfredton,"(-37.54173636281507, 143.749330252453)"


Need the coordinate data for each property in order for them to be linked to the nearest suburb based on its centroid.

In [73]:
property_df2 = pd.read_csv("../data/raw/full_property_data.csv")
property_df2.head(1)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,index,name,cost_text,Bed,Bath,Park,property_type,desc_head
0,0,-37.865018,144.974682,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar


This one!

In [74]:
property_df = pd.merge(property_df, property_df2[['index', 'longitude', 'latitude']], on='index', how='left')

In [75]:
df = property_df.merge(df_centroid, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# check what columns we no longer need; remove them
#unwanted_columns = ["Unnamed: 0"]
#df.drop(columns=unwanted_columns, inplace=True)

df.head(3)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514,...,-37.865018,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,"(-37.861344917872316, 144.9762676334905)"
1,1,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,4500.0,3,2,2,Apartment / Unit / Flat,arguabl captiv penthous,206041503,...,-37.814172,206041503.0,884.0,375.0,2248.0,1511.0,10533.0,"POLYGON ((144.96534 -37.81137, 144.96576 -37.8...",Melbourne CBD - East,"(-37.81278701743976, 144.96859287374974)"
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,,,


In [76]:
df_missing = df.copy()  # dataframe with missing SA2 code only
df_missing = df_missing[df_missing["SA2_CODE_2021"].isna()]

# create POINT() geometry for the longitude and latitude of each property
df_missing = gpd.GeoDataFrame(
    df_missing, geometry=gpd.points_from_xy(df_missing.longitude, df_missing.latitude))
df_missing.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,POINT (144.95690 -37.83012),,


In [77]:
# now that we have the location coordinate of each property, impute all the missing values with the closest centroid
# 1: find the difference between df_missing["geometry"] and every single value in df_centroid["centroid"]
# 2: we want to get the smallest difference
# 3: once found, the SA2 code of that centroid, along with the rest of the values in df_centroid, will be imputed to df
# 4: voila

In [78]:
from shapely.geometry import Point, Polygon
# this function converts two floats to a point that is recognised by Shapely
def convert_to_point(row):
    x = float(row["latitude"])
    y = float(row["longitude"])
    return Point(y, x)

In [79]:
df_centroid['point'] = df.apply (lambda row: convert_to_point(row), axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [80]:
s = gpd.GeoSeries([*df_centroid["point"]])
s

0      POINT (144.97468 -37.86502)
1      POINT (144.97400 -37.81417)
2      POINT (144.95690 -37.83012)
3      POINT (145.05209 -37.83108)
4      POINT (144.93824 -37.82240)
                  ...             
466    POINT (144.93458 -37.74056)
467    POINT (145.08085 -37.92607)
468    POINT (145.01910 -37.90366)
469    POINT (144.95291 -37.77635)
470    POINT (145.07589 -37.82312)
Length: 471, dtype: geometry

In [81]:
df_missing["geometry"].head(3)

2    POINT (144.95690 -37.83012)
5    POINT (145.03114 -37.85349)
6    POINT (145.03689 -37.81553)
Name: geometry, dtype: geometry

In [82]:
# this does not work.
s.distance(df_missing['geometry'].head(1)).dropna()

  warn("The indices of the two GeoSeries are different.")


2    0.0
dtype: float64

In [83]:
dist_mtrx = s.geometry.apply(lambda g: df_missing['geometry'].distance(g))

In [84]:
dist_mtrx.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,461,462,463,464,465,466,467,468,469,470
2,0.039168,0.023380,0.000000,0.095195,0.020200,0.077829,0.081308,0.077338,0.092545,0.072266,...,0.120668,0.059464,0.072447,0.038703,0.056613,0.092297,0.156747,0.096312,0.053913,0.119193
5,0.057625,0.069356,0.077829,0.030676,0.097969,0.000000,0.038393,0.051929,0.076780,0.061439,...,0.076372,0.072515,0.025769,0.059729,0.043052,0.148583,0.087974,0.051595,0.109864,0.054080
6,0.079494,0.062902,0.081308,0.021748,0.098894,0.038393,0.000000,0.016099,0.112686,0.094777,...,0.114698,0.046424,0.014410,0.047149,0.025386,0.126839,0.118965,0.089908,0.092670,0.039732
8,0.053907,0.103994,0.092545,0.107178,0.107181,0.076780,0.112686,0.122472,0.000000,0.021299,...,0.038932,0.129286,0.098372,0.107061,0.104625,0.184604,0.093003,0.033663,0.145203,0.128551
9,0.033187,0.082753,0.072266,0.092083,0.088092,0.061439,0.094777,0.103285,0.021299,0.000000,...,0.051103,0.108302,0.080367,0.085867,0.084484,0.163899,0.098824,0.033273,0.124279,0.115120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10925,0.081771,0.068417,0.086074,0.015606,0.104073,0.036264,0.006370,0.021962,0.112074,0.094909,...,0.111969,0.052786,0.015657,0.052943,0.030978,0.133198,0.113847,0.087798,0.098987,0.033894
10926,0.081771,0.068417,0.086074,0.015606,0.104073,0.036264,0.006370,0.021962,0.112074,0.094909,...,0.111969,0.052786,0.015657,0.052943,0.030978,0.133198,0.113847,0.087798,0.098987,0.033894
10935,0.026694,0.046077,0.047623,0.057874,0.067733,0.032553,0.053684,0.059732,0.064867,0.044411,...,0.080966,0.064782,0.039838,0.043844,0.040108,0.129508,0.109933,0.054517,0.089351,0.082927
10977,0.035329,0.032441,0.040433,0.055769,0.060332,0.038043,0.045762,0.048066,0.080253,0.059408,...,0.096548,0.049075,0.034013,0.028030,0.026697,0.115019,0.122278,0.069971,0.075096,0.080483


In [106]:
row_mins_df = pd.DataFrame(dist_mtrx.idxmin(axis=0))
row_mins_df

Unnamed: 0,0
2,2
5,5
6,6
8,8
9,9
...,...
10925,6
10926,6
10935,256
10977,239


In [88]:
df_missing # to do map min index to relevant row and then perform a join.

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,POINT (144.95690 -37.83012),,
5,5,https://www.domain.com.au/7-haverbrack-avenue-...,7 Haverbrack Avenue Malvern VIC 3144,3500.0,2,2,2,House,famili exclus resort style ground,208041194,...,-37.853487,,,,,,,POINT (145.03114 -37.85349),,
6,6,https://www.domain.com.au/603-862-glenferrie-r...,603/862 Glenferrie Road Hawthorn VIC 3122,3500.0,3,3,2,Apartment / Unit / Flat,hawthorn best penthous amaz citi view,207011519,...,-37.815528,,,,,,,POINT (145.03689 -37.81553),,
8,8,https://www.domain.com.au/9-keith-court-bright...,9 Keith Court Brighton VIC 3186,3500.0,4,2,0,House,applic,208011169,...,-37.917184,,,,,,,POINT (144.98827 -37.91718),,
9,9,https://www.domain.com.au/1-20-kent-avenue-bri...,1/20 Kent Avenue Brighton VIC 3186,3500.0,3,1,1,House,premier locat could next home,208011169,...,-37.895941,,,,,,,POINT (144.98673 -37.89594),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10925,10926,https://www.domain.com.au/3-37-elphin-grove-ha...,3/37 Elphin Grove Hawthorn VIC 3122,175.0,3,1,0,Semi-Detached,bedsit prime locat,207011519,...,-37.818968,,,,,,,POINT (145.04225 -37.81897),,
10926,10927,https://www.domain.com.au/10-37-elphin-grove-h...,10/37 Elphin Grove Hawthorn VIC 3122,175.0,0,1,0,Semi-Detached,studio apart ideal locat,207011519,...,-37.818968,,,,,,,POINT (145.04225 -37.81897),,
10935,10936,https://www.domain.com.au/1-60-the-avenue-prah...,1/60 The Avenue Prahran VIC 3181,170.0,1,1,0,Apartment / Unit / Flat,room hous south twin,206061136,...,-37.853143,,,,,,,POINT (144.99859 -37.85314),,
10977,10978,https://www.domain.com.au/670-chapel-street-so...,670 Chapel Street South Yarra VIC 3141,70.0,4,2,2,Carspace,vogu plaza car space,206061515,...,-37.837373,,,,,,,POINT (144.99668 -37.83737),,


In [111]:

# this function converts two floats to a point that is recognised by Shapely
def assign_nearest_centroid(row):
    print(row['index'])
    row_num = row['Unnamed: 0']
    print("row num", row_num)
    matching_row = row_mins_df.loc[row_num]
    print("match:", matching_row)
    return df_centroid['SA2_CODE_2021'].loc[row_num]


In [112]:
df_missing.reset_index()
df_missing["SA2_CODE_2021"] = df_missing.apply (lambda row: assign_nearest_centroid(row), axis=1)

https://www.domain.com.au/901-902-85-market-street-south-melbourne-vic-3205-14089455
row num 2
match: 0    2
Name: 2, dtype: int64
https://www.domain.com.au/7-haverbrack-avenue-malvern-vic-3144-16041473
row num 5
match: 0    5
Name: 5, dtype: int64
https://www.domain.com.au/603-862-glenferrie-road-hawthorn-vic-3122-15954871
row num 6
match: 0    6
Name: 6, dtype: int64
https://www.domain.com.au/9-keith-court-brighton-vic-3186-16058214
row num 8
match: 0    8
Name: 8, dtype: int64
https://www.domain.com.au/1-20-kent-avenue-brighton-vic-3186-16069407
row num 9
match: 0    9
Name: 9, dtype: int64
https://www.domain.com.au/9-9-11-adamson-street-brighton-vic-3186-16071478
row num 17
match: 0    17
Name: 17, dtype: int64
https://www.domain.com.au/5-wellington-street-brighton-vic-3186-15950472
row num 19
match: 0    19
Name: 19, dtype: int64
https://www.domain.com.au/251-350-st-kilda-road-melbourne-vic-3000-15951316
row num 21
match: 0    21
Name: 21, dtype: int64
https://www.domain.com.au/42

KeyError: 478