In [27]:
# imports
import pandas as pd

In [47]:
# read datasets
property_df = pd.read_csv("../data/raw/full_property_zones.csv")
income_and_pop = pd.read_csv("../data/raw/income_and_pop.csv")
australia_sf = gpd.read_file("../data/raw/shapefiles/Statistical_area_level2/SA2_2021_AUST_GDA2020.shp")
# seperate suburbs that are in Victoria
vic_sf = australia_sf[australia_sf['STE_NAME21'] == 'Victoria']

# drop the null location ID values
vic_sf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vic_sf.dropna(inplace=True)


In [38]:
df = property_df.merge(income_and_pop, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# shouldn't we do a left join instead of full outer join?
# we only care about the values in the property data, so the possible outcomes should be
# property data with income/pop data, or
# proprty data without income/pop data
# (we don't care about income/pop data without property data)

property_df.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514


In [39]:
import geopandas as gpd

df_centroid = income_and_pop.copy()

# replace object with shapely geometries
df_centroid = df_centroid.dropna(subset=["geometry"])
df_centroid['geometry'] = gpd.GeoSeries.from_wkt(df_centroid['geometry'])

df_centroid = gpd.GeoDataFrame(df_centroid)

In [40]:
# get centroids
df_centroid['centroid'] = df_centroid['geometry'].apply(lambda x: (x.centroid.y, x.centroid.x))
df_centroid.head(1)

Unnamed: 0,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,201011001,865,370,2218,1952,13320,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",Alfredton,"(-37.54173636281507, 143.749330252453)"


Need the coordinate data for each property in order for them to be linked to the nearest suburb based on its centroid.

In [41]:
property_df2 = pd.read_csv("../data/raw/full_property_data.csv")
property_df2.head(1)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,index,name,cost_text,Bed,Bath,Park,property_type,desc_head
0,0,-37.865018,144.974682,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar


This one!

In [42]:
property_df = pd.merge(property_df, property_df2[['index', 'longitude', 'latitude']], on='index', how='left')

In [43]:
df = property_df.merge(df_centroid, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# check what columns we no longer need; remove them
unwanted_columns = ["Unnamed: 0"]
df.drop(columns=unwanted_columns, inplace=True)

df.head(3)

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514,144.974682,-37.865018,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,"(-37.861344917872316, 144.9762676334905)"
1,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,4500.0,3,2,2,Apartment / Unit / Flat,arguabl captiv penthous,206041503,144.974005,-37.814172,206041503.0,884.0,375.0,2248.0,1511.0,10533.0,"POLYGON ((144.96534 -37.81137, 144.96576 -37.8...",Melbourne CBD - East,"(-37.81278701743976, 144.96859287374974)"
2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,144.956904,-37.830116,,,,,,,,,


In [44]:
df_missing = df.copy()  # dataframe with missing SA2 code only
df_missing = df_missing[df_missing["SA2_CODE_2021"].isna()]

# create POINT() geometry for the longitude and latitude of each property
df_missing = gpd.GeoDataFrame(
    df_missing, geometry=gpd.points_from_xy(df_missing.longitude, df_missing.latitude))
df_missing.head(1)

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,144.956904,-37.830116,,,,,,,POINT (144.95690 -37.83012),,


In [None]:
# now that we have the location coordinate of each property, impute all the missing values with the closest centroid
# 1: find the difference between df_missing["geometry"] and every single value in df_centroid["centroid"]
# 2: we want to get the smallest difference
# 3: once found, the SA2 code of that centroid, along with the rest of the values in df_centroid, will be imputed to df
# 4: voila

In [55]:
from shapely.geometry import Point, Polygon
# this function converts two floats to a point that is recognised by Shapely
def convert_to_point(row):
    x = float(row["latitude"])
    y = float(row["longitude"])
    return Point(y, x)

In [58]:
df_centroid['point'] = df.apply (lambda row: convert_to_point(row), axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [70]:
s = gpd.GeoSeries([*df_centroid["point"]])
s

0      POINT (144.97468 -37.86502)
1      POINT (144.97400 -37.81417)
2      POINT (144.95690 -37.83012)
3      POINT (145.05209 -37.83108)
4      POINT (144.93824 -37.82240)
                  ...             
466    POINT (144.93458 -37.74056)
467    POINT (145.08085 -37.92607)
468    POINT (145.01910 -37.90366)
469    POINT (144.95291 -37.77635)
470    POINT (145.07589 -37.82312)
Length: 471, dtype: geometry

In [65]:
df_missing["geometry"].head(3)

2    POINT (144.95690 -37.83012)
5    POINT (145.03114 -37.85349)
6    POINT (145.03689 -37.81553)
Name: geometry, dtype: geometry

In [72]:
# this does not work.
s.distance(df_missing['geometry'].head(1)).dropna()

  warn("The indices of the two GeoSeries are different.")


2    0.0
dtype: float64

In [76]:
dist_mtrx = s.geometry.apply(lambda g: df_missing['geometry'].distance(g))

In [77]:
dist_mtrx.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,461,462,463,464,465,466,467,468,469,470
2,0.039168,0.023380,0.000000,0.095195,0.020200,0.077829,0.081308,0.077338,0.092545,0.072266,...,0.120668,0.059464,0.072447,0.038703,0.056613,0.092297,0.156747,0.096312,0.053913,0.119193
5,0.057625,0.069356,0.077829,0.030676,0.097969,0.000000,0.038393,0.051929,0.076780,0.061439,...,0.076372,0.072515,0.025769,0.059729,0.043052,0.148583,0.087974,0.051595,0.109864,0.054080
6,0.079494,0.062902,0.081308,0.021748,0.098894,0.038393,0.000000,0.016099,0.112686,0.094777,...,0.114698,0.046424,0.014410,0.047149,0.025386,0.126839,0.118965,0.089908,0.092670,0.039732
8,0.053907,0.103994,0.092545,0.107178,0.107181,0.076780,0.112686,0.122472,0.000000,0.021299,...,0.038932,0.129286,0.098372,0.107061,0.104625,0.184604,0.093003,0.033663,0.145203,0.128551
9,0.033187,0.082753,0.072266,0.092083,0.088092,0.061439,0.094777,0.103285,0.021299,0.000000,...,0.051103,0.108302,0.080367,0.085867,0.084484,0.163899,0.098824,0.033273,0.124279,0.115120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10925,0.081771,0.068417,0.086074,0.015606,0.104073,0.036264,0.006370,0.021962,0.112074,0.094909,...,0.111969,0.052786,0.015657,0.052943,0.030978,0.133198,0.113847,0.087798,0.098987,0.033894
10926,0.081771,0.068417,0.086074,0.015606,0.104073,0.036264,0.006370,0.021962,0.112074,0.094909,...,0.111969,0.052786,0.015657,0.052943,0.030978,0.133198,0.113847,0.087798,0.098987,0.033894
10935,0.026694,0.046077,0.047623,0.057874,0.067733,0.032553,0.053684,0.059732,0.064867,0.044411,...,0.080966,0.064782,0.039838,0.043844,0.040108,0.129508,0.109933,0.054517,0.089351,0.082927
10977,0.035329,0.032441,0.040433,0.055769,0.060332,0.038043,0.045762,0.048066,0.080253,0.059408,...,0.096548,0.049075,0.034013,0.028030,0.026697,0.115019,0.122278,0.069971,0.075096,0.080483


In [80]:
dist_mtrx.idxmin(axis=0)

2          2
5          5
6          6
8          8
9          9
        ... 
10925      6
10926      6
10935    256
10977    239
11001     81
Length: 1549, dtype: int64

In [None]:
df_centroid[row]['SA2_CODE_2021']