In [1]:
# imports
import geopandas as gpd
import pandas as pd

In [2]:
# read datasets
property_df = pd.read_csv("../data/raw/full_property_zones.csv")
income_and_pop = pd.read_csv("../data/raw/income_and_pop.csv")
australia_sf = gpd.read_file("../data/raw/shapefiles/Statistical_area_level2/SA2_2021_AUST_GDA2020.shp")
# seperate suburbs that are in Victoria
vic_sf = australia_sf[australia_sf['STE_NAME21'] == 'Victoria']

# drop the null location ID values
vic_sf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vic_sf.dropna(inplace=True)


In [3]:
df = property_df.merge(income_and_pop, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# shouldn't we do a left join instead of full outer join?
# we only care about the values in the property data, so the possible outcomes should be
# property data with income/pop data, or
# proprty data without income/pop data
# (we don't care about income/pop data without property data)

property_df.head(1)

Unnamed: 0.1,Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID
0,0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514


In [4]:
df_centroid = income_and_pop.copy()
df_centroid["geometry"]

# replace object with shapely geometries
df_centroid = df_centroid.dropna(subset=["geometry"])
df_centroid['geometry'] = gpd.GeoSeries.from_wkt(df_centroid['geometry'])

df_centroid = gpd.GeoDataFrame(df_centroid)

In [5]:
# get centroids
df_centroid['centroid'] = df_centroid['geometry'].apply(lambda x: (x.centroid.y, x.centroid.x))
df_centroid["centroid"]

0         (-37.54173636281507, 143.749330252453)
1        (-37.5561439450457, 143.83665489612585)
2        (-37.643854141582494, 143.880777903821)
3       (-37.58222851797997, 143.77847784283048)
4       (-37.62024909240558, 143.74623319717654)
                         ...                    
466       (-38.64208964017761, 143.553904461219)
467      (-38.09471476925597, 142.7111928080641)
468      (-38.15666281521874, 142.1487545818805)
469    (-38.344636082354285, 142.49576449196164)
470     (-38.38986160903671, 142.57425654255778)
Name: centroid, Length: 471, dtype: object

Need the coordinate data for each property in order for them to be linked to the nearest suburb based on its centroid.

In [6]:
property_df2 = pd.read_csv("../data/raw/full_property_data.csv")
property_df2.head(1)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,index,name,cost_text,Bed,Bath,Park,property_type,desc_head
0,0,-37.865018,144.974682,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar


This one!

In [7]:
property_df = pd.merge(property_df, property_df2[['index', 'longitude', 'latitude']], on='index', how='left')

In [8]:
df = property_df.merge(df_centroid, how='left', left_on='LocID', right_on='SA2_CODE_2021')

# check what columns we no longer need; remove them
unwanted_columns = ["Unnamed: 0"]
df.drop(columns=unwanted_columns, inplace=True)

df.tail(3)

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
10999,https://www.domain.com.au/carpark5118-70-south...,Carpark5118/70 Southbank Boulevard Southbank V...,50.0,1,1,0,House,car park space avail leas,206041509,144.966112,-37.823165,206041509.0,1152.0,408.0,2385.0,1880.0,16090.0,"POLYGON ((144.96589 -37.82180, 144.96622 -37.8...",Southbank - East,"(-37.826356946991034, 144.9661201794769)"
11000,https://www.domain.com.au/133-fitzroy-street-s...,133 Fitzroy Street St Kilda VIC 3182,50.0,1,1,0,Carspace,secur car space,206051514,144.97822,-37.859571,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,"POLYGON ((144.97031 -37.86077, 144.97018 -37.8...",St Kilda - West,"(-37.861344917872316, 144.9762676334905)"
11001,https://www.domain.com.au/32-st-edmonds-rd-pra...,32 St Edmonds Rd Prahran VIC 3181,40.0,1,1,0,Apartment / Unit / Flat,secur street undercov car park avail central,206061136,144.992002,-37.850119,,,,,,,,,


In [9]:
df_missing = df.copy()  # dataframe with missing SA2 code only
df_missing = df_missing[df_missing["SA2_CODE_2021"].isna()]

# create POINT() geometry for the longitude and latitude of each property
df_missing = gpd.GeoDataFrame(
    df_missing, geometry=gpd.points_from_xy(df_missing.longitude, df_missing.latitude))
df_missing.head(1)

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21,centroid
2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,144.956904,-37.830116,,,,,,,POINT (144.95690 -37.83012),,


In [10]:
# now that we have the location coordinate of each property, impute all the missing values with the closest centroid
# 1: find the difference between df_missing["geometry"] and every single value in df_centroid["centroid"]
# 2: we want to get the smallest difference
# 3: once found, the SA2 code of that centroid, along with the rest of the values in df_centroid, will be imputed to df
# 4: voila
df_missing.shape

(1549, 20)

In [11]:
# function taken from https://stackoverflow.com/a/56661833

from math import radians, cos, sin, asin, sqrt, inf

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [12]:
from shapely.geometry import Point, Polygon
# this function converts two floats to a point that is recognised by Shapely
def convert_to_point_lat_long(row):
    x = float(row["latitude"])
    y = float(row["longitude"])
    return (x, y)

In [13]:
df_missing['point'] = df_missing.apply (lambda row: convert_to_point_lat_long(row), axis=1)

In [14]:

distances = []

for point2 in df_missing["point"]:
    min_dist_idx = None
    min_dist = inf
    for i in range(len(df_centroid["centroid"])):
        point1 = df_centroid["centroid"][i]
        dist = haversine(point1[1], point1[0], point2[1], point2[0])
        if dist < min_dist:
            min_dist_idx = i
            min_dist = dist
    distances.append((min_dist_idx, min_dist))

In [15]:
distances

[(127, 0.656151124051563),
 (162, 2.378527810006939),
 (142, 1.1373565767090004),
 (158, 3.2204874537684),
 (164, 2.056297207444774),
 (158, 2.8190253325665307),
 (158, 3.3073569205510567),
 (128, 0.675272558432267),
 (158, 2.5422792768880242),
 (135, 1.3766006049450303),
 (130, 2.325931637245142),
 (141, 2.483334254782791),
 (130, 1.6873848383317893),
 (176, 1.6870563069762385),
 (164, 1.4025547815785397),
 (130, 2.578974885145557),
 (164, 1.6457053133970099),
 (164, 2.5266478766080813),
 (141, 2.2085481221885246),
 (141, 1.980091426636389),
 (162, 1.962175330361762),
 (164, 1.2137433522772736),
 (158, 1.4356182164632765),
 (164, 1.968545322851024),
 (158, 2.6530964161612305),
 (123, 1.1580813506384968),
 (135, 2.795774442983055),
 (176, 1.5009649723769762),
 (127, 0.6836056958221235),
 (160, 2.0530527300916015),
 (164, 2.088247123890884),
 (164, 1.6146018885618914),
 (158, 2.313138058788807),
 (141, 2.4063999387109294),
 (164, 2.0654574852673657),
 (135, 2.760361238249615),
 (132, 1.

In [16]:
df_missing.reset_index(inplace=True)

In [17]:

for i in range(len(distances)):
    d = distances[i]
    idx = d[0]
    sa_code = df_centroid["SA2_CODE_2021"][idx]
    df_missing.at[i, "SA2_CODE_2021"] = sa_code

In [18]:
df.dropna(subset="SA2_CODE_2021", inplace=True)

# check what columns we no longer need; remove them
unwanted_columns = ["Median_tot_prsnl_inc_weekly", "Median_rent_weekly", "Median_tot_fam_inc_weekly",
                    "Median_tot_hhd_inc_weekly", "total_population", "geometry", "centroid"]
df_missing.drop(columns=unwanted_columns, inplace=True)
unwanted_columns = ["geometry", "centroid"]
df.drop(columns=unwanted_columns, inplace=True)

df_missing = df_missing.merge(income_and_pop, how='inner', on='SA2_CODE_2021')
df_missing

  result.crs = self.crs
  object.__getattribute__(self, name)
  existing = getattr(self, name)
  return object.__getattribute__(self, name)


Unnamed: 0,level_0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,...,SA2_CODE_2021,SA2_NAME21_x,point,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,geometry,SA2_NAME21_y
0,2,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,...,206041508.0,,"(-37.8301164, 144.9569041)",1227,430,2681,2054,6447,POLYGON ((144.96478082572233 -37.8196989975616...,Southbank (West) - South Wharf
1,58,https://www.domain.com.au/85-market-street-sou...,85 Market Street South Melbourne VIC 3205,2000.0,2,3,0,Apartment / Unit / Flat,call self contain execut bedroom bathroom apar...,206051512,...,206041508.0,,"(-37.8302556, 144.9579151)",1227,430,2681,2054,6447,POLYGON ((144.96478082572233 -37.8196989975616...,Southbank (West) - South Wharf
2,94,https://www.domain.com.au/18-156-bay-street-po...,18/156 Bay Street Port Melbourne VIC 3207,1650.0,3,2,2,Apartment / Unit / Flat,villag penthous live,206051130,...,206041508.0,,"(-37.840077, 144.941296)",1227,430,2681,2054,6447,POLYGON ((144.96478082572233 -37.8196989975616...,Southbank (West) - South Wharf
3,103,https://www.domain.com.au/301-130-bay-street-p...,301/130 Bay Street Port Melbourne VIC 3207,1600.0,3,2,1,Apartment / Unit / Flat,sensat rooftop terrac,206051130,...,206041508.0,,"(-37.8406272, 144.9407724)",1227,430,2681,2054,6447,POLYGON ((144.96478082572233 -37.8196989975616...,Southbank (West) - South Wharf
4,120,https://www.domain.com.au/2702-89-gladstone-st...,2702/89 Gladstone Street South Melbourne VIC 3205,1500.0,2,2,1,Apartment / Unit / Flat,inclus execut bay view resid free park,206051511,...,206041508.0,,"(-37.8301549, 144.9500578)",1227,430,2681,2054,6447,POLYGON ((144.96478082572233 -37.8196989975616...,Southbank (West) - South Wharf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,8334,https://www.domain.com.au/10-33-carmichael-str...,10/33 Carmichael Street Ivanhoe East VIC 3079,390.0,2,1,1,Apartment / Unit / Flat,spaciou sun fill great view,209011201,...,209011200.0,,"(-37.7698392, 145.0580159)",1080,440,3041,2280,10950,POLYGON ((145.02852468101142 -37.7613463015652...,Ivanhoe
1545,1988,https://www.domain.com.au/83-clifton-street-ba...,83 Clifton Street Balwyn North VIC 3104,650.0,4,2,2,House,must see famili home,207011148,...,207021157.0,,"(-37.7930306, 145.1038125)",721,450,1957,1595,22318,POLYGON ((145.10435596639266 -37.7749230168646...,Doncaster
1546,4744,https://www.domain.com.au/3-20-epsom-road-asco...,3/20 Epsom Road Ascot Vale VIC 3032,495.0,3,1,0,Townhouse,signatur style conveni,206041120,...,206031113.0,,"(-37.7832302, 144.9168832)",1080,370,3024,2192,13275,POLYGON ((144.89943124240546 -37.7703933627976...,Ascot Vale
1547,6443,https://www.domain.com.au/3-52-56-epsom-road-a...,3/52-56 Epsom Road Ascot Vale VIC 3032,440.0,2,1,1,Apartment / Unit / Flat,fantast ground floor apart,206041120,...,206031113.0,,"(-37.7821112, 144.9158821)",1080,370,3024,2192,13275,POLYGON ((144.89943124240546 -37.7703933627976...,Ascot Vale


In [19]:
df_missing.drop(columns=["SA2_NAME21_x", "level_0", "point", "geometry"], inplace=True)

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,SA2_NAME21
0,https://www.domain.com.au/901-902-85-market-st...,901 & 902/85 Market Street South Melbourne VIC...,3750.0,2,2,1,Apartment / Unit / Flat,call exclus penthous short stay minimum week stay,206051512,144.956904,-37.830116,206041508.0,1227,430,2681,2054,6447,Southbank (West) - South Wharf
1,https://www.domain.com.au/85-market-street-sou...,85 Market Street South Melbourne VIC 3205,2000.0,2,3,0,Apartment / Unit / Flat,call self contain execut bedroom bathroom apar...,206051512,144.957915,-37.830256,206041508.0,1227,430,2681,2054,6447,Southbank (West) - South Wharf
2,https://www.domain.com.au/18-156-bay-street-po...,18/156 Bay Street Port Melbourne VIC 3207,1650.0,3,2,2,Apartment / Unit / Flat,villag penthous live,206051130,144.941296,-37.840077,206041508.0,1227,430,2681,2054,6447,Southbank (West) - South Wharf
3,https://www.domain.com.au/301-130-bay-street-p...,301/130 Bay Street Port Melbourne VIC 3207,1600.0,3,2,1,Apartment / Unit / Flat,sensat rooftop terrac,206051130,144.940772,-37.840627,206041508.0,1227,430,2681,2054,6447,Southbank (West) - South Wharf
4,https://www.domain.com.au/2702-89-gladstone-st...,2702/89 Gladstone Street South Melbourne VIC 3205,1500.0,2,2,1,Apartment / Unit / Flat,inclus execut bay view resid free park,206051511,144.950058,-37.830155,206041508.0,1227,430,2681,2054,6447,Southbank (West) - South Wharf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,https://www.domain.com.au/10-33-carmichael-str...,10/33 Carmichael Street Ivanhoe East VIC 3079,390.0,2,1,1,Apartment / Unit / Flat,spaciou sun fill great view,209011201,145.058016,-37.769839,209011200.0,1080,440,3041,2280,10950,Ivanhoe
1545,https://www.domain.com.au/83-clifton-street-ba...,83 Clifton Street Balwyn North VIC 3104,650.0,4,2,2,House,must see famili home,207011148,145.103813,-37.793031,207021157.0,721,450,1957,1595,22318,Doncaster
1546,https://www.domain.com.au/3-20-epsom-road-asco...,3/20 Epsom Road Ascot Vale VIC 3032,495.0,3,1,0,Townhouse,signatur style conveni,206041120,144.916883,-37.783230,206031113.0,1080,370,3024,2192,13275,Ascot Vale
1547,https://www.domain.com.au/3-52-56-epsom-road-a...,3/52-56 Epsom Road Ascot Vale VIC 3032,440.0,2,1,1,Apartment / Unit / Flat,fantast ground floor apart,206041120,144.915882,-37.782111,206031113.0,1080,370,3024,2192,13275,Ascot Vale


In [22]:
df_missing.rename(columns={"SA2_NAME21_y": "SA2_NAME21"}, inplace=True)

In [23]:
df2 = pd.concat([df, df_missing])
df2

Unnamed: 0,index,name,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,SA2_CODE_2021,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,SA2_NAME21
0,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,1,1,1,Apartment / Unit / Flat,noth els compar,206051514,144.974682,-37.865018,206051514.0,1236.0,392.0,2787.0,1849.0,14408.0,St Kilda - West
1,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,4500.0,3,2,2,Apartment / Unit / Flat,arguabl captiv penthous,206041503,144.974005,-37.814172,206041503.0,884.0,375.0,2248.0,1511.0,10533.0,Melbourne CBD - East
3,https://www.domain.com.au/hawthorn-east-vic-31...,Hawthorn East VIC 3123,3750.0,4,2,3,House,luxuri finest month leas avail unfurnish fulli...,207011152,145.052094,-37.831081,207011152.0,1228.0,411.0,3203.0,2228.0,14052.0,Hawthorn East
4,https://www.domain.com.au/50-south-wharf-drive...,50 South Wharf Drive Docklands VIC 3008,3750.0,2,2,1,Townhouse,luxuri style space,206041118,144.938237,-37.822397,206041118.0,1182.0,411.0,2462.0,1956.0,15634.0,Docklands
7,https://www.domain.com.au/1-molesworth-street-...,1 Molesworth Street Kew VIC 3101,3500.0,2,1,1,House,opul famili masterpiec central,207011522,145.028797,-37.801612,207011522.0,1133.0,456.0,3259.0,2459.0,11684.0,Kew - West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,https://www.domain.com.au/10-33-carmichael-str...,10/33 Carmichael Street Ivanhoe East VIC 3079,390.0,2,1,1,Apartment / Unit / Flat,spaciou sun fill great view,209011201,145.058016,-37.769839,209011200.0,1080.0,440.0,3041.0,2280.0,10950.0,Ivanhoe
1545,https://www.domain.com.au/83-clifton-street-ba...,83 Clifton Street Balwyn North VIC 3104,650.0,4,2,2,House,must see famili home,207011148,145.103813,-37.793031,207021157.0,721.0,450.0,1957.0,1595.0,22318.0,Doncaster
1546,https://www.domain.com.au/3-20-epsom-road-asco...,3/20 Epsom Road Ascot Vale VIC 3032,495.0,3,1,0,Townhouse,signatur style conveni,206041120,144.916883,-37.783230,206031113.0,1080.0,370.0,3024.0,2192.0,13275.0,Ascot Vale
1547,https://www.domain.com.au/3-52-56-epsom-road-a...,3/52-56 Epsom Road Ascot Vale VIC 3032,440.0,2,1,1,Apartment / Unit / Flat,fantast ground floor apart,206041120,144.915882,-37.782111,206031113.0,1080.0,370.0,3024.0,2192.0,13275.0,Ascot Vale


In [24]:
# save our data
filename = "../data/raw/property_and_income.csv"
df2.to_csv(filename)