In [156]:
# imports
import geopandas as gpd
import pandas as pd

In [157]:
# read datasets
property_df = pd.read_csv("../data/raw/full_property_zones.csv")
income_and_pop = pd.read_csv("../data/raw/income_and_pop.csv")
australia_sf = gpd.read_file("../data/raw/shapefiles/Statistical_area_level2/SA2_2021_AUST_GDA2020.shp")
# seperate suburbs that are in Victoria
vic_sf = australia_sf[australia_sf['STE_NAME21'] == 'Victoria']

# drop the null location ID values
vic_sf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vic_sf.dropna(inplace=True)


In [158]:
df = property_df.merge(income_and_pop, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# shouldn't we do a left join instead of full outer join?
# we only care about the values in the property data, so the possible outcomes should be
# property data with income/pop data, or
# proprty data without income/pop data
# (we don't care about income/pop data without property data)

property_df.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514


In [159]:
df_centroid = income_and_pop.copy()
df_centroid["geometry"]

# replace object with shapely geometries
df_centroid = df_centroid.dropna(subset=["geometry"])
df_centroid['geometry'] = gpd.GeoSeries.from_wkt(df_centroid['geometry'])

df_centroid = gpd.GeoDataFrame(df_centroid)

In [160]:
# get centroids
df_centroid['centroid'] = df_centroid['geometry'].apply(lambda x: (x.centroid.y, x.centroid.x))
df_centroid["centroid"]

0         (-37.54173636281507, 143.749330252453)
1        (-37.5561439450457, 143.83665489612585)
2        (-37.643854141582494, 143.880777903821)
3       (-37.58222851797997, 143.77847784283048)
4       (-37.62024909240558, 143.74623319717654)
                         ...                    
466       (-38.64208964017761, 143.553904461219)
467      (-38.09471476925597, 142.7111928080641)
468      (-38.15666281521874, 142.1487545818805)
469    (-38.344636082354285, 142.49576449196164)
470     (-38.38986160903671, 142.57425654255778)
Name: centroid, Length: 471, dtype: object

Need the coordinate data for each property in order for them to be linked to the nearest suburb based on its centroid.

In [161]:
property_df2 = pd.read_csv("../data/raw/full_property_data.csv")
property_df2.head(1)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,index,name,cost_text,Bed,Bath,Park,property_type,desc_head
0,0,-37.865018,144.974682,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar


This one!

In [162]:
property_df = pd.merge(property_df, property_df2[['index', 'longitude', 'latitude']], on='index', how='left')

In [184]:
df = property_df.merge(df_centroid, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# check what columns we no longer need; remove them
#unwanted_columns = ["Unnamed: 0"]
#df.drop(columns=unwanted_columns, inplace=True)

df.tail(3)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
10999,11000,https://www.domain.com.au/carpark5118-70-south...,Carpark5118/70 Southbank Boulevard Southbank V...,50.0,1,1,0,House,car park space avail leas,206041509,...,-37.823165,206041509.0,1152.0,408.0,2385.0,1880.0,16090.0,"POLYGON ((144.96589 -37.82180, 144.96622 -37.8...",Southbank - East,POINT (144.9661201794769 -37.826356946991034)
11000,11001,https://www.domain.com.au/133-fitzroy-street-s...,133 Fitzroy Street St Kilda VIC 3182,50.0,1,1,0,Carspace,secur car space,206051514,...,-37.859571,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,POINT (144.9762676334905 -37.861344917872316)
11001,11002,https://www.domain.com.au/32-st-edmonds-rd-pra...,32 St Edmonds Rd Prahran VIC 3181,40.0,1,1,0,Apartment / Unit / Flat,secur street undercov car park avail central,206061136,...,-37.850119,,,,,,,,,


In [164]:
df_missing = df.copy()  # dataframe with missing SA2 code only
df_missing = df_missing[df_missing["SA2_CODE_2021"].isna()]

# create POINT() geometry for the longitude and latitude of each property
df_missing = gpd.GeoDataFrame(
    df_missing, geometry=gpd.points_from_xy(df_missing.longitude, df_missing.latitude))
df_missing.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,,,,,,,POINT (144.95690 -37.83012),,


In [165]:
# now that we have the location coordinate of each property, impute all the missing values with the closest centroid
# 1: find the difference between df_missing["geometry"] and every single value in df_centroid["centroid"]
# 2: we want to get the smallest difference
# 3: once found, the SA2 code of that centroid, along with the rest of the values in df_centroid, will be imputed to df
# 4: voila
df_missing.shape

(1549, 21)

In [166]:
from shapely.geometry import Point, Polygon
# this function converts two floats to a point that is recognised by Shapely
def convert_to_point(row):
    x = float(row["latitude"])
    y = float(row["longitude"])
    return Point(y, x)

In [167]:
df_missing['point'] = df_missing.apply (lambda row: convert_to_point(row), axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [168]:
# get the point series
missing = gpd.GeoSeries([*df_missing["point"]])
missing

0       POINT (144.95690 -37.83012)
1       POINT (145.03114 -37.85349)
2       POINT (145.03689 -37.81553)
3       POINT (144.98827 -37.91718)
4       POINT (144.98673 -37.89594)
                   ...             
1544    POINT (145.04225 -37.81897)
1545    POINT (145.04225 -37.81897)
1546    POINT (144.99859 -37.85314)
1547    POINT (144.99668 -37.83737)
1548    POINT (144.99200 -37.85012)
Length: 1549, dtype: geometry

In [169]:
for i in range(df_centroid.shape[0]):
    df_centroid.iloc[i, -1] = Point(df_centroid["centroid"][i][1], df_centroid["centroid"][i][0])
df_centroid["centroid"]
#df_centroid.iloc[0, -1] = Point(df_centroid["centroid"][0][1], df_centroid["centroid"][0][0])
#df_centroid

  applied = getattr(b, f)(**kwargs)


0         POINT (143.749330252453 -37.54173636281507)
1        POINT (143.83665489612585 -37.5561439450457)
2        POINT (143.880777903821 -37.643854141582494)
3       POINT (143.77847784283048 -37.58222851797997)
4       POINT (143.74623319717654 -37.62024909240558)
                            ...                      
466       POINT (143.553904461219 -38.64208964017761)
467      POINT (142.7111928080641 -38.09471476925597)
468      POINT (142.1487545818805 -38.15666281521874)
469    POINT (142.49576449196164 -38.344636082354285)
470     POINT (142.57425654255778 -38.38986160903671)
Name: centroid, Length: 471, dtype: object

In [170]:
# get the point series
centroid = gpd.GeoSeries([*df_centroid["centroid"]])
centroid

0      POINT (143.74933 -37.54174)
1      POINT (143.83665 -37.55614)
2      POINT (143.88078 -37.64385)
3      POINT (143.77848 -37.58223)
4      POINT (143.74623 -37.62025)
                  ...             
466    POINT (143.55390 -38.64209)
467    POINT (142.71119 -38.09471)
468    POINT (142.14875 -38.15666)
469    POINT (142.49576 -38.34464)
470    POINT (142.57426 -38.38986)
Length: 471, dtype: geometry

In [171]:
df_missing["geometry"]

2        POINT (144.95690 -37.83012)
5        POINT (145.03114 -37.85349)
6        POINT (145.03689 -37.81553)
8        POINT (144.98827 -37.91718)
9        POINT (144.98673 -37.89594)
                    ...             
10925    POINT (145.04225 -37.81897)
10926    POINT (145.04225 -37.81897)
10935    POINT (144.99859 -37.85314)
10977    POINT (144.99668 -37.83737)
11001    POINT (144.99200 -37.85012)
Name: geometry, Length: 1549, dtype: geometry

In [172]:
# distance matrix between the centroids and the property coordinates
dist_mtrx = missing.geometry.apply(lambda g: centroid.distance(g))
dist_mtrx.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1539,1540,1541,1542,1543,1544,1545,1546,1547,1548
0,1.241530,1.319177,1.316350,1.294578,1.287096,1.304470,1.294889,1.255339,1.302560,1.282561,...,1.278616,1.265568,1.280365,1.278337,1.279938,1.322312,1.322312,1.287487,1.281906,1.280365
1,1.153265,1.230939,1.227945,1.206883,1.199222,1.216728,1.207176,1.167062,1.214871,1.194232,...,1.190419,1.177179,1.192161,1.190127,1.191666,1.233914,1.233914,1.199292,1.193628,1.192161
2,1.092127,1.169309,1.168791,1.140722,1.134317,1.150862,1.141131,1.106049,1.148664,1.133875,...,1.128393,1.117838,1.130206,1.128251,1.130525,1.174602,1.174602,1.137236,1.132558,1.130206
3,1.204216,1.281697,1.279858,1.255305,1.248313,1.265315,1.255661,1.218080,1.263272,1.245574,...,1.240964,1.228979,1.242742,1.240745,1.242638,1.285758,1.285758,1.249827,1.244635,1.242742
4,1.228726,1.305906,1.305348,1.277038,1.270762,1.287208,1.277459,1.242648,1.284972,1.270467,...,1.264986,1.254398,1.266799,1.264846,1.267125,1.311167,1.311167,1.273828,1.269158,1.266799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,1.621021,1.674551,1.697780,1.607138,1.615464,1.617748,1.608655,1.632600,1.612522,1.659772,...,1.639708,1.657020,1.641750,1.641054,1.649808,1.700797,1.700797,1.646072,1.652021,1.641750
467,2.261246,2.332456,2.342397,2.283987,2.284201,2.295014,2.284956,2.274982,2.291215,2.304171,...,2.291861,2.292997,2.293887,2.292382,2.297697,2.347313,2.347313,2.300118,2.299930,2.293887
468,2.827072,2.898287,2.908215,2.849596,2.849925,2.860630,2.850576,2.840812,2.856801,2.870001,...,2.857698,2.858785,2.859725,2.858221,2.863535,2.913138,2.913138,2.865953,2.865768,2.859725
469,2.514347,2.582511,2.595628,2.528893,2.531053,2.539972,2.530008,2.527795,2.535774,2.556937,...,2.542692,2.547494,2.544749,2.543409,2.549582,2.600179,2.600179,2.550628,2.551842,2.544749


In [179]:
# get the index of the minimum element for each column
row_mins_df = pd.DataFrame(dist_mtrx.idxmin(axis=1))
row_mins_df = row_mins_df.rename(columns={row_mins_df.columns[0]: "min_centroid_index"})
row_mins_df
#row_mins_df["min_centroid_index"].unique()

Unnamed: 0,min_centroid_index
0,127
1,162
2,142
3,158
4,164
...,...
1544,142
1545,142
1546,130
1547,135


In [191]:
df_missing.index[0] # to do map min index to relevant row and then perform a join.
#import numpy as np
#df["min_centroid_index"] = np.nan
for i in range(df_missing.shape[0]):
    df_i = df_missing.index[i]
    df_missing.iat[i, 0] = i
#    df.iat[df_i, -1] = row_mins_df.iloc[i, 0]
#df.head(3)
df_missing

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid,point
2,0,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,,,,,,,POINT (144.95690 -37.83012),,,POINT (144.95690 -37.83012)
5,1,https://www.domain.com.au/7-haverbrack-avenue-...,7 Haverbrack Avenue Malvern VIC 3144,3500.0,2,2,2,House,famili exclus resort style ground,208041194,...,,,,,,,POINT (145.03114 -37.85349),,,POINT (145.03114 -37.85349)
6,2,https://www.domain.com.au/603-862-glenferrie-r...,603/862 Glenferrie Road Hawthorn VIC 3122,3500.0,3,3,2,Apartment / Unit / Flat,hawthorn best penthous amaz citi view,207011519,...,,,,,,,POINT (145.03689 -37.81553),,,POINT (145.03689 -37.81553)
8,3,https://www.domain.com.au/9-keith-court-bright...,9 Keith Court Brighton VIC 3186,3500.0,4,2,0,House,applic,208011169,...,,,,,,,POINT (144.98827 -37.91718),,,POINT (144.98827 -37.91718)
9,4,https://www.domain.com.au/1-20-kent-avenue-bri...,1/20 Kent Avenue Brighton VIC 3186,3500.0,3,1,1,House,premier locat could next home,208011169,...,,,,,,,POINT (144.98673 -37.89594),,,POINT (144.98673 -37.89594)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10925,1544,https://www.domain.com.au/3-37-elphin-grove-ha...,3/37 Elphin Grove Hawthorn VIC 3122,175.0,3,1,0,Semi-Detached,bedsit prime locat,207011519,...,,,,,,,POINT (145.04225 -37.81897),,,POINT (145.04225 -37.81897)
10926,1545,https://www.domain.com.au/10-37-elphin-grove-h...,10/37 Elphin Grove Hawthorn VIC 3122,175.0,0,1,0,Semi-Detached,studio apart ideal locat,207011519,...,,,,,,,POINT (145.04225 -37.81897),,,POINT (145.04225 -37.81897)
10935,1546,https://www.domain.com.au/1-60-the-avenue-prah...,1/60 The Avenue Prahran VIC 3181,170.0,1,1,0,Apartment / Unit / Flat,room hous south twin,206061136,...,,,,,,,POINT (144.99859 -37.85314),,,POINT (144.99859 -37.85314)
10977,1547,https://www.domain.com.au/670-chapel-street-so...,670 Chapel Street South Yarra VIC 3141,70.0,4,2,2,Carspace,vogu plaza car space,206061515,...,,,,,,,POINT (144.99668 -37.83737),,,POINT (144.99668 -37.83737)


In [200]:
# this function converts two floats to a point that is recognised by Shapely
#def assign_nearest_centroid(row):
#    print(row['index'])
#    row_num = row['Unnamed: 0']
#    print("row num", row_num)
#    matching_row = row_mins_df.loc[row_num]
#    print("match:", matching_row)
#    return df_centroid['SA2_CODE_2021'].loc[matching_row]

#df_missing["SA2_CODE_2021"] = df_missing.apply (lambda row: assign_nearest_centroid(row), axis=1)

In [214]:
for i in range(df_missing.shape[0]):
    #print("row index", df_missing.index[i])
    #print("row num", i)
    matching_row = row_mins_df.loc[i]
    #print("match:", matching_row)
    #print(df_centroid['SA2_CODE_2021'].loc[matching_row])
    #print("")
    
    row_index = df_missing.index[i]
    col_index = df.columns.get_loc("SA2_CODE_2021")
    df.iloc[row_index, col_index] = df_centroid['SA2_CODE_2021'].loc[matching_row]

# might need to check the values if it's correct
# only the SA2 code is joined, still need to join the rest
df

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514,...,-37.865018,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,POINT (144.9762676334905 -37.861344917872316)
1,1,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,4500.0,3,2,2,Apartment / Unit / Flat,arguabl captiv penthous,206041503,...,-37.814172,206041503.0,884.0,375.0,2248.0,1511.0,10533.0,"POLYGON ((144.96534 -37.81137, 144.96576 -37.8...",Melbourne CBD - East,POINT (144.96859287374974 -37.81278701743976)
2,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,-37.830116,206041508.0,,,,,,,,
3,3,https://www.domain.com.au/hawthorn-east-vic-31...,Hawthorn East VIC 3123,3750.0,4,2,3,House,luxuri finest month leas avail unfurnish fulli...,207011152,...,-37.831081,207011152.0,1228.0,411.0,3203.0,2228.0,14052.0,"POLYGON ((145.04435 -37.82616, 145.04453 -37.8...",Hawthorn East,POINT (145.04991149713413 -37.8312698416898)
4,4,https://www.domain.com.au/50-south-wharf-drive...,50 South Wharf Drive Docklands VIC 3008,3750.0,2,2,1,Townhouse,luxuri style space,206041118,...,-37.822397,206041118.0,1182.0,411.0,2462.0,1956.0,15634.0,"POLYGON ((144.94525 -37.81208, 144.94545 -37.8...",Docklands,POINT (144.94328587893943 -37.81794762815656)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10997,10998,https://www.domain.com.au/cp831-888-collins-st...,CP831/888 Collins Street Docklands VIC 3008,50.0,2,1,1,Apartment / Unit / Flat,street dockland car space rent,206041118,...,-37.820739,206041118.0,1182.0,411.0,2462.0,1956.0,15634.0,"POLYGON ((144.94525 -37.81208, 144.94545 -37.8...",Docklands,POINT (144.94328587893943 -37.81794762815656)
10998,10999,https://www.domain.com.au/cp-l5c15-135-a-becke...,CP L5C15/135 A'Beckett Street Melbourne VIC 3000,50.0,1,1,1,Apartment / Unit / Flat,street car space rent,206041504,...,-37.809678,206041504.0,638.0,361.0,1520.0,1089.0,17891.0,"POLYGON ((144.95569 -37.80771, 144.95599 -37.8...",Melbourne CBD - North,POINT (144.96088380246076 -37.80919110515081)
10999,11000,https://www.domain.com.au/carpark5118-70-south...,Carpark5118/70 Southbank Boulevard Southbank V...,50.0,1,1,0,House,car park space avail leas,206041509,...,-37.823165,206041509.0,1152.0,408.0,2385.0,1880.0,16090.0,"POLYGON ((144.96589 -37.82180, 144.96622 -37.8...",Southbank - East,POINT (144.9661201794769 -37.826356946991034)
11000,11001,https://www.domain.com.au/133-fitzroy-street-s...,133 Fitzroy Street St Kilda VIC 3182,50.0,1,1,0,Carspace,secur car space,206051514,...,-37.859571,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,POINT (144.9762676334905 -37.861344917872316)
