Feature engineering

In [185]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import geopy
import collections
import geopandas as gpd
from scipy.spatial.distance import cdist



In [74]:
# read in the scraped and pre-processed past listing data
files_dire = '../data/curated/'
df = pd.read_csv(f'{files_dire}/processed_listing.csv').iloc[: , 1:]
# sort the dataframe so the same property are followed with sorted list date
df = df.sort_values(['address', 'list_date'], ascending=[True, False]).reset_index().iloc[:,1:]
df.head()

Unnamed: 0,bed,bath,car,type,address,suburb,code,url,loc_address,lat,lon,weekly_rent,list_date
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01
1,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,349.0,2022-07-01
2,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,375.0,2022-04-01
3,3,1,2,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,430.0,2021-02-01
4,3,1,2,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,450.0,2021-02-01


In [75]:
# original size of the dataframe
len(df)

303640

In [76]:
# convert the date to a date format as it cannot be auto recognised by Pandas
df['list_date'] = df['list_date'].apply(pd.to_datetime)
# remove the data that has not been listed in 2021 or 2022
addresses = df.loc[df['list_date'].dt.year == 2022]['address'].tolist()
addresses = df.loc[df['list_date'].dt.year == 2021]['address'].tolist() + addresses
addresses = list(set(addresses))

In [None]:
len(addresses)

In [77]:
# size of the reduced dataframe
df = df.loc[df['address'].isin(addresses)]
len(df)

198426

In [78]:
# store the number of entries for the loop below
n_rows = len(df)

In [79]:
# print the number of unique properties to have a rough idea
len(addresses)

52904

In [80]:
# copy the first row of the dataframe to df2
# df2 will only store each property per row to replace the prvious dataframe
df2 = df.iloc[0:1,:]
# listing history will be stored in a new column as a list with the format of [date difference in years, price difference]
df2 = df2.assign(list_history = '')
df2.head()

Unnamed: 0,bed,bath,car,type,address,suburb,code,url,loc_address,lat,lon,weekly_rent,list_date,list_history
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01,


In [81]:
# initialise for loop variables
n_property = 0
past_listing = []
list_date = df.iloc[0]['list_date']
list_price = df.iloc[0]['weekly_rent']

In [82]:
# run through all entries in the initial dataframe
for i in tqdm(range(1, n_rows)):
    # if the following row mataches with the previous one
    # calculate the date and price difference
    if df.iloc[i]['address'] == df2.iloc[n_property]['address']:
        days = (df.iloc[i]['list_date'] - list_date)/ np.timedelta64(1, 'Y')
        list_date = df.iloc[i]['list_date']
        price = list_price - df.iloc[i]['weekly_rent']
        list_price = df.iloc[i]['weekly_rent']
        if days != 0:
            temp = [days, price]
            past_listing.append(temp)
    # if the address is different (different property)
    # the past listings are stored
    else:
        df2.at[n_property, 'list_history'] = past_listing
        n_property += 1
        df2.loc[n_property] = df.iloc[i]
        past_listing = []
# manually store the last record
df2.at[n_property, 'list_history'] = past_listing

100%|██████████| 198425/198425 [03:53<00:00, 851.35it/s]


In [83]:
# a new column to store the number of times that the property has been listed for lease
df2['list_count'] = df2['list_history'].str.len()

In [84]:
# rename the column for legibility
df2 = df2.rename(columns={'code': 'postcode'}, errors="coerce")
# this file is stored to share with other group members
# df2.to_csv('../data/curated/processed_listing_combined.csv')

In [85]:
df2.head(20)

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,lon,weekly_rent,list_date,list_history,list_count
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01,"[[-0.08487511721664373, 176.0], [-0.2491495376...",2
1,3,1,2,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,430.0,2021-02-01,"[[-1.1608725709631273, -75.00000000000006]]",1
2,3,1,1,House,"0 LANCASTER DRIVE, MANGALORE",Mangalore,3663,https://www.oldlistings.com.au/real-estate/VIC...,"Lancaster Drive, Mangalore, Shire of Strathbog...",-36.891826,145.183443,300.0,2022-02-01,"[[-7.000828216869614, 200.00000000000006], [-1...",2
3,2,1,1,AUF,"06/34 AUBURN RD, HAWTHORN",Hawthorn,3122,https://www.oldlistings.com.au/real-estate/VIC...,"Auburn Road, Hawthorn, Melbourne, City of Boro...",-37.829572,145.043714,325.0,2022-05-01,"[[2.2505595597445534, -90.0]]",1
4,1,1,1,AUF,"07/39 DARLING STREET, SOUTH YARRA",South Yarra,3141,https://www.oldlistings.com.au/real-estate/VIC...,"Darling Street, South Yarra, Melbourne, City o...",-37.835333,144.992038,290.0,2021-07-01,"[[4.914543077544371, 30.0], [-0.75292442692183...",3
5,2,1,1,AUF,"08/26 ROTHERWOOD STREET, RICHMOND",Richmond,3121,https://www.oldlistings.com.au/real-estate/VIC...,"Rotherwood Street, Richmond, Melbourne, City o...",-37.819132,144.991919,420.0,2021-09-01,"[[1.8343976946823002, -140.0], [-0.08487511721...",2
6,2,1,1,,"1 / 1 KELLY STREET, WERRIBEE, WERRIBEE",Werribee,3030,https://www.oldlistings.com.au/real-estate/VIC...,"Kelly Street, Werribee, Melbourne, City of Wyn...",-37.899365,144.665314,270.0,2022-05-01,"[[0.7556623339288281, 165.0]]",1
7,2,1,1,,"1 / 1 KINGFISHER COURT, KINGS PARK",Kings Park,3021,https://www.oldlistings.com.au/real-estate/VIC...,"Kingfisher Court, Kings Park, Melbourne, City ...",-37.726443,144.765013,320.0,2022-01-01,"[[-0.9226746613551271, -55.0], [-0.07939930320...",4
8,2,1,1,,"1 / 10 BETTINA ST, CLAYTON",Clayton,3168,https://www.oldlistings.com.au/real-estate/VIC...,"Bettina Street, Clayton, Melbourne, City of Mo...",-37.919015,145.137066,379.0,2022-02-01,"[[7.167840544295913, -120.0], [-0.334024654852...",3
9,3,2,2,,"1 / 10 FRANCIS STREET, BELMONT",Belmont,3216,https://www.oldlistings.com.au/real-estate/VIC...,"Francis Street, Belmont, City of Greater Geelo...",-38.181797,144.347595,380.0,2022-02-01,"[[5.333442849613613, -30.0], [-0.1670123274262...",6


In [86]:
# compare with the previous output to make sure we have roughly the expected amount of output
len(df2)

52904

In [87]:
# pivot to the number of past listings
col_list = (df2['list_count'].values.tolist())
col_list.sort()
counter = collections.Counter(col_list)
print(counter)
# it means there are 3xxxx properties that have been listed once in the past, etc

Counter({1: 19138, 2: 12393, 3: 7849, 4: 5017, 5: 3084, 6: 1993, 7: 1241, 8: 722, 9: 436, 0: 422, 10: 243, 11: 167, 12: 69, 13: 44, 14: 27, 15: 16, 16: 9, 17: 8, 19: 5, 20: 5, 18: 4, 21: 2, 29: 2, 34: 2, 23: 1, 24: 1, 26: 1, 27: 1, 30: 1, 42: 1})


### Embed SA2 area information to the existing dataframe

In [88]:
# read in the postcode information
postcode_df = pd.read_csv('../data/raw/abs/australian_postcodes.csv')
# only retain information that is relavent for faster running time
postcode_df = postcode_df.loc[postcode_df['state'] == 'VIC']
# rename columns to match dataframe so it can perform a 'vlookup' properly
postcode_df['locality'] = postcode_df['locality'].str.title()
postcode_df = postcode_df[['postcode', 'locality', 'SA2_NAME_2016', 'SA2_MAINCODE_2016', 'lgaregion']]
postcode_df = postcode_df.rename(columns={'locality': 'suburb'}, errors="raise")
postcode_df.head()

Unnamed: 0,postcode,suburb,SA2_NAME_2016,SA2_MAINCODE_2016,lgaregion
6151,3000,Melbourne,Melbourne,206041122.0,Melbourne
6152,3001,Melbourne,Melbourne,206041122.0,Moonee Valley
6153,3002,East Melbourne,East Melbourne,206041119.0,Yarra
6154,3003,West Melbourne,West Melbourne,206041127.0,Melbourne
6155,3004,Melbourne,Southbank,206041126.0,Yarra


In [208]:
# df3 will be the final output dataframe
# the property informations are merged with its SA2 information by matching both postcode and suburb name
df3 = pd.merge(df2, postcode_df, on=['postcode', 'suburb'])
df3['SA2_MAINCODE_2016'] = df3['SA2_MAINCODE_2016'].astype(int)
df3 = df3.rename(columns={'SA2_MAINCODE_2016': 'SA2'}, errors="raise")
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,lon,weekly_rent,list_date,list_history,list_count,SA2_NAME_2016,SA2,lgaregion
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01,"[[-0.08487511721664373, 176.0], [-0.2491495376...",2,Camberwell,207011149,Whitehorse
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,145.080403,850.0,2022-08-01,"[[5.746866807668877, -400.0], [-0.084875117216...",3,Camberwell,207011149,Whitehorse
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,145.095567,580.0,2021-07-01,"[[5.5031930840469006, -75.0], [-2.083547232318...",5,Camberwell,207011149,Whitehorse
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,145.072722,400.0,2022-06-01,"[[-0.9171988473411501, -40.0], [-1.42097373662...",3,Camberwell,207011149,Whitehorse
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,145.067238,435.0,2021-02-01,"[[-2.162946535520921, 35.0], [-0.0848751172166...",9,Camberwell,207011149,Whitehorse


In [209]:
len(df3)

52844

In [210]:
# metro_melb is a list of all LGAs of metropolitan melbourne
# that is defined https://liveinmelbourne.vic.gov.au/discover/melbourne-victoria/metropolitan-melbourne
metro_melb = ['Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey', 'Darebin', 'Frankston', 'Glen Eira', 'Greater Dandenong', 'Hobsons Bay', 'Hume', 'Kingston', 'Knox', 'Manningham', 'Maribyrnong', 'Maroondah', 'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland', 'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington', 'Whitehorse', 'Whittlesea', 'Wyndham', 'Yarra', 'Yarra Ranges']

In [211]:
# only retain entries that are in metro melbourne as they are more related with the study
df3 = df3.loc[df3['lgaregion'].isin(metro_melb)]

In [212]:
# check the number of unique properties again
len(df3)

42736

### Embed income data

In [213]:
# the csv file was exported from MS Excel as Python's ability to read csv with weird formating is limited
income_df = pd.read_csv(f'{files_dire}/abs/income_distribution.csv')
# only retain information that will be used
income_df = income_df[['SA2', 'Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']]
income_df = income_df.replace(',','', regex=True)
cols = ['Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']
income_df[cols] = income_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
income_df = income_df.rename(columns={'Earners': 'income_earner', 'Median age of earners': 'income_median_age', 'Median': 'income_median', 'Mean': 'income_mean', 'Top 10%': 'income_top_10_pct'}, errors="raise")
# information will be 'vlookup'ed by SA2 code, the data contains the number of earners in the SA2, median age of the earner
# median income of the SA2, average income of the SA2 and how much of the population are ranked 10 percent within whole Australia
income_df.head()

Unnamed: 0,SA2,income_earner,income_median_age,income_median,income_mean,income_top_10_pct
0,201011001,7989.0,42.0,53932.0,63668.0,28.7
1,201011002,7595.0,47.0,53688.0,77876.0,38.4
2,201011003,13598.0,43.0,50593.0,60367.0,29.3
3,201011004,12722.0,40.0,45828.0,50778.0,26.1
4,201011005,4249.0,46.0,52377.0,63258.0,29.3


In [214]:
# merge income information
df3 = pd.merge(df3, income_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,list_history,list_count,SA2_NAME_2016,SA2,lgaregion,income_earner,income_median_age,income_median,income_mean,income_top_10_pct
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,...,"[[-0.08487511721664373, 176.0], [-0.2491495376...",2,Camberwell,207011149,Whitehorse,14501.0,47.0,61520.0,101491.0,42.5
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,"[[5.746866807668877, -400.0], [-0.084875117216...",3,Camberwell,207011149,Whitehorse,14501.0,47.0,61520.0,101491.0,42.5
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,"[[5.5031930840469006, -75.0], [-2.083547232318...",5,Camberwell,207011149,Whitehorse,14501.0,47.0,61520.0,101491.0,42.5
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,"[[-0.9171988473411501, -40.0], [-1.42097373662...",3,Camberwell,207011149,Whitehorse,14501.0,47.0,61520.0,101491.0,42.5
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,"[[-2.162946535520921, 35.0], [-0.0848751172166...",9,Camberwell,207011149,Whitehorse,14501.0,47.0,61520.0,101491.0,42.5


In [215]:
len(df3)

42736

### Embed population statistics

In [216]:
# the csv file was exported from MS Excel as Python's ability to read csv with weird formating is limited
population_df = pd.read_csv(f'{files_dire}/abs/population.csv')
population_df = population_df.iloc[: , 8:]
# age are grouped into following categories
# they are sumed to produce these columns
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
population_df['population_children'] = population_df[['0-4', '5–9', '10–14']].sum(axis = 1)
population_df = population_df.drop(['0-4', '5–9', '10–14'], axis=1)
population_df['population_prime_working'] = population_df[['25–29', '30–34', '35–39', '40–44', '45–49', '50–54']].sum(axis = 1)
population_df = population_df.drop(['25–29', '30–34', '35–39', '40–44', '45–49', '50–54'], axis=1)
population_df['population_elderly'] = population_df[['65–69', '70–74', '75–79', '80–84', '85 and over']].sum(axis = 1)
population_df = population_df.drop(['65–69', '70–74', '75–79', '80–84', '85 and over'], axis=1)
population_df['population_early_working'] = population_df.iloc[:,2] + population_df.iloc[:,3]
population_df['population_mature_working'] = population_df.iloc[:,4] + population_df.iloc[:,5]
population_df = population_df.drop(['15–19', '20–24', '55–59', '60–64', 'SA2 name'], axis=1)
population_df = population_df.rename(columns={'Total persons': 'population_total', 'SA2 code': 'SA2'}, errors="raise")
population_df.head()

Unnamed: 0,SA2,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working
0,201011001,16823,4075,6702,2098,2391,1557
1,201011002,12076,1806,4321,2750,1565,1634
2,201011005,7232,1532,2483,1295,944,978
3,201011006,10640,2299,4347,1543,1434,1017
4,201011007,4213,904,1533,594,559,623


In [217]:
df3 = pd.merge(df3, population_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,income_median_age,income_median,income_mean,income_top_10_pct,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,...,47.0,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,47.0,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,47.0,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,47.0,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,47.0,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712


In [218]:
len(df3)

31533

### Embed school locations

In [219]:
school_df = pd.read_csv('../data/raw/schools.csv')
school_df = school_df.drop(['Entity_Type', 'School_No', 'School_Status', 'Address_Line_1', 'Address_Line_2',
         'Address_Town', 'Address_State', 'Address_Postcode',
         'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Town',
         'Postal_State', 'Postal_Postcode', 'Full_Phone_No', 'LGA_ID',
         'LGA_Name'], axis=1)
school_df['geo_coordinate'] = [(x, y) for x,y in zip(school_df['Y'], school_df['X'])]
# only retain public schools
school_df = school_df.loc[school_df['Education_Sector']=='Government']

In [220]:
# this is adapted from stackoverflow
# https://codereview.stackexchange.com/questions/28207/finding-the-closest-point-to-a-list-of-points
def closest_point(point, points):
    # Find closest point from a list of points.
    return points[cdist([point], points).argmin()]
def match_value(df, col1, x, col2):
    # Match value x from col1 row to value in col2.
    return df[df[col1] == x][col2].values[0]

In [221]:
df3['geo_coordinate'] = [(x, y) for x,y in zip(df3['lat'], df3['lon'])]
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,income_median,income_mean,income_top_10_pct,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working,geo_coordinate
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,...,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712,"(-37.8362297, 145.0790033)"
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712,"(-37.8431012, 145.080403)"
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712,"(-37.8504051, 145.0955675)"
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712,"(-37.8467915, 145.0727223)"
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,61520.0,101491.0,42.5,21512,3506,7721,4504,3069,2712,"(-37.8411808, 145.0672378)"


##### primary schools

In [222]:
school_type = ['Primary', 'Pri/Sec']
school_primary_df = school_df.loc[school_df['School_Type'].isin(school_type)]

In [223]:
# primary school
df3['closest_primary_school_loc'] = tqdm([closest_point(x, list(school_primary_df['geo_coordinate'])) for x in df3['geo_coordinate']])
df3['primary_school_name'] = tqdm([match_value(school_primary_df, 'geo_coordinate', x, 'School_Name') for x in df3['closest_primary_school_loc']])

100%|██████████| 31533/31533 [00:00<00:00, 5076925.57it/s]
100%|██████████| 31533/31533 [00:00<00:00, 5121553.13it/s]


##### secondary schools

In [224]:
school_type = ['Secondary', 'Pri/Sec']
school_secondary_df = school_df.loc[school_df['School_Type'].isin(school_type)]

In [225]:
# embed secondary school ranking information
# considering some family may rent a property for its free education
school_ranking = pd.read_html('https://bettereducation.com.au/Results/vcePublicSchoolResults.aspx')[-1]
school_ranking = school_ranking[['Better Education Rank', 'School', 'Unit 3-4 cohort']]
school_ranking = school_ranking.rename(columns={'School': 'School_Name', 'Better Education Rank': 'secondary_school_rank', 'Unit 3-4 cohort': 'secondary_cohort'}, errors="coerce")
school_secondary_df = pd.merge(school_secondary_df, school_ranking, on='School_Name')

In [226]:
df3['closest_secondary_school_loc'] = tqdm([closest_point(x, list(school_secondary_df['geo_coordinate'])) for x in df3['geo_coordinate']])
df3['secondary_school_name'] = tqdm([match_value(school_secondary_df, 'geo_coordinate', x, 'School_Name') for x in df3['closest_secondary_school_loc']])
df3['secondary_school_rank'] = tqdm([match_value(school_secondary_df, 'geo_coordinate', x, 'secondary_school_rank') for x in df3['closest_secondary_school_loc']])
df3['secondary_school_cohort'] = tqdm([match_value(school_secondary_df, 'geo_coordinate', x, 'secondary_cohort') for x in df3['closest_secondary_school_loc']])

100%|██████████| 31533/31533 [00:00<00:00, 5193143.87it/s]
100%|██████████| 31533/31533 [00:00<00:00, 5145062.94it/s]
100%|██████████| 31533/31533 [00:00<00:00, 5127907.41it/s]
100%|██████████| 31533/31533 [00:00<00:00, 4923279.78it/s]


In [227]:
df3['closest_primary_school_loc'] = df3['closest_primary_school_loc'].astype('str')
df3['closest_secondary_school_loc'] = df3['closest_secondary_school_loc'].astype('str')

In [228]:
df3[['pri_lat', 'pri_lon']] = df3['closest_primary_school_loc'].str.split(', ', 1, expand=True)
df3[['sec_lat', 'sec_lon']] = df3['closest_secondary_school_loc'].str.split(',', 1, expand=True)

  df3[['pri_lat', 'pri_lon']] = df3['closest_primary_school_loc'].str.split(', ', 1, expand=True)
  df3[['sec_lat', 'sec_lon']] = df3['closest_secondary_school_loc'].str.split(',', 1, expand=True)


In [229]:
bracket_remove = ['pri_lat', 'pri_lon', 'sec_lat', 'sec_lon']
for i in bracket_remove:
    df3[i] = df3[i].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')

In [230]:
df3 = df3.drop(['closest_primary_school_loc', 'closest_secondary_school_loc'], axis = 1)

In [231]:
df3

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,population_mature_working,geo_coordinate,primary_school_name,secondary_school_name,secondary_school_rank,secondary_school_cohort,pri_lat,pri_lon,sec_lat,sec_lon
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.836230,...,2712,"(-37.8362297, 145.0790033)",Canterbury Primary School,Camberwell High School,40,290,-37.830963,145.083144,-37.830151,145.072821
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,2712,"(-37.8431012, 145.080403)",Hartwell Primary School,Camberwell High School,40,290,-37.848051,145.084466,-37.830151,145.072821
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,2712,"(-37.8504051, 145.0955675)",Wattle Park Primary School,Ashwood High School,17,42,-37.842635,145.097370,-37.864527,145.103328
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,2712,"(-37.8467915, 145.0727223)",Camberwell South Primary School,Camberwell High School,40,290,-37.847905,145.062612,-37.830151,145.072821
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,2712,"(-37.8411808, 145.0672378)",Camberwell South Primary School,Camberwell High School,40,290,-37.847905,145.062612,-37.830151,145.072821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31528,3,2,2,House,"25 CHARLES SMITH DRIVE, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Charles Smith Drive, Wonga Park, Melbourne, Ci...",-37.767758,...,1600,"(-37.7677579, 145.2703888)",Yarra Road Primary School,Mooroolbark College,74,258,-37.766401,145.272227,-37.775083,145.314230
31529,2,1,2,House,"389 YARRA ROAD, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Yarra Road, Wonga Park South, Wonga Park, Melb...",-37.751110,...,1600,"(-37.7511101, 145.2714333)",Yarra Road Primary School,Mooroolbark College,74,258,-37.766401,145.272227,-37.775083,145.314230
31530,2,1,1,AUF,"3/1 WEST END ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"West End Road, Warrandyte, Melbourne, City of ...",-37.743661,...,1600,"(-37.7436613, 145.2039103)",Andersons Creek Primary School,East Doncaster Secondary College,7,342,-37.748220,145.207680,-37.782127,145.158557
31531,3,2,2,House,"48 BEAUTY GULLY ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"Beauty Gully Road, Warrandyte, Melbourne, City...",-37.759570,...,1600,"(-37.7595698, 145.2126606)",Andersons Creek Primary School,Norwood Secondary College,85,255,-37.748220,145.207680,-37.799752,145.237605


In [232]:
df3.dtypes

bed                                   int64
bath                                  int64
car                                   int64
type                                 object
address                              object
suburb                               object
postcode                              int64
url                                  object
loc_address                          object
lat                                 float64
lon                                 float64
weekly_rent                         float64
list_date                    datetime64[ns]
list_history                         object
list_count                            int64
SA2_NAME_2016                        object
SA2                                   int64
lgaregion                            object
income_earner                       float64
income_median_age                   float64
income_median                       float64
income_mean                         float64
income_top_10_pct               

### embed population projection

In [233]:
population_projection_df = pd.read_csv('../data/raw/population_projection.csv')
population_projection_df = population_projection_df.loc[population_projection_df['SEX'] == 'Persons']
population_projection_df = population_projection_df.loc[population_projection_df['YEAR'] == 2027]
population_projection_df = population_projection_df.drop(['YEAR', 'SA2_NAME', 'SEX'], axis=1)
# they are categories as well with the previous population statistics
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
population_projection_df['proj_population_children'] = population_projection_df[['Age0-4', 'Age5-9', 'Age10-14']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age0-4', 'Age5-9', 'Age10-14'], axis=1)
population_projection_df['proj_population_early_working'] = population_projection_df[['Age15-19', 'Age20-24']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age15-19', 'Age20-24'], axis=1)
population_projection_df['proj_population_prime_working'] = population_projection_df[['Age25-29', 'Age30-34', 'Age35-39', 'Age40-44', 'Age45-49', 'Age50-54']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age25-29', 'Age30-34', 'Age35-39', 'Age40-44', 'Age45-49', 'Age50-54'], axis=1)
population_projection_df['proj_population_mature_working'] = population_projection_df[['Age55-59', 'Age60-64']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age55-59', 'Age60-64'], axis=1)
population_projection_df['proj_population_elderly'] = population_projection_df[['Age65-69', 'Age70-74', 'Age75-79', 'Age80-84', 'Age85+']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age65-69', 'Age70-74', 'Age75-79', 'Age80-84', 'Age85+'], axis=1)
population_projection_df = population_projection_df.rename(columns={'Total': 'proj_population_total', 'SA2_CODE': 'SA2'}, errors="raise")
population_projection_df.head()

Unnamed: 0,SA2,proj_population_total,proj_population_children,proj_population_early_working,proj_population_prime_working,proj_population_mature_working,proj_population_elderly
13862,201011001,18611,4389,2496,7275,1807,2644
13865,201011002,12252,1788,1796,3940,1674,3054
13868,201011003,26630,5094,2777,9809,3270,5680
13871,201011004,28423,4849,3734,11347,3056,5437
13874,201011005,8900,1919,1368,2966,1067,1580


In [234]:
df3 = pd.merge(df3, population_projection_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,pri_lat,pri_lon,sec_lat,sec_lon,proj_population_total,proj_population_children,proj_population_early_working,proj_population_prime_working,proj_population_mature_working,proj_population_elderly
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,...,-37.830963,145.083144,-37.830151,145.072821,25643,4215,4019,9588,2937,4884
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,-37.848051,145.084466,-37.830151,145.072821,25643,4215,4019,9588,2937,4884
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,-37.842635,145.09737,-37.864527,145.103328,25643,4215,4019,9588,2937,4884
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,-37.847905,145.062612,-37.830151,145.072821,25643,4215,4019,9588,2937,4884
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,-37.847905,145.062612,-37.830151,145.072821,25643,4215,4019,9588,2937,4884


In [235]:
len(df3)

31533

### hospitals

In [236]:
hospitals_df = pd.read_csv('../data/raw/hospitals.csv')
hospitals_df = hospitals_df.loc[hospitals_df['Emergency Capable'] == 'YES']
hospitals_df['full_address'] = hospitals_df['Location Address'] + ', ' +hospitals_df['Suburb'] + ', Victoria'
hospitals_df = hospitals_df[['Formal Name', 'full_address']]

In [237]:
# get the geocode of each hospital
temp = []
for i in tqdm(hospitals_df['full_address']):
    locator = geopy.Nominatim(user_agent="myGeocoder");
    location = locator.geocode(i,timeout=None);
    if location != None:
        info = [i, location.latitude, location.longitude]
        temp.append(info)

100%|██████████| 39/39 [00:31<00:00,  1.25it/s]


In [238]:
hospital_geo = pd.DataFrame(temp, columns=['full_address', 'lat', 'lon'])
hospitals_df = pd.merge(hospitals_df, hospital_geo, on=['full_address'])
hospitals_df = hospitals_df.drop('full_address', axis=1)
hospitals_df['geo_coordinate'] = [(x, y) for x,y in zip(hospitals_df['lat'], hospitals_df['lon'])]

In [239]:
hospitals_df.head()

Unnamed: 0,Formal Name,lat,lon,geo_coordinate
0,Albury Wodonga Health,-36.13204,146.880004,"(-36.1320403, 146.8800037)"
1,Angliss Hospital,-37.896522,145.310067,"(-37.8965217, 145.3100668)"
2,Austin Health - Austin Hospital,-37.756561,145.058657,"(-37.75656125, 145.05865657809136)"
3,Bairnsdale Regional Health Service,-37.817113,147.611512,"(-37.8171132, 147.6115117)"
4,Bendigo Health,-36.74862,144.285198,"(-36.7486197, 144.2851975)"


In [240]:
df3['closest_ed_loc'] = tqdm([closest_point(x, list(hospitals_df['geo_coordinate'])) for x in df3['geo_coordinate']])
df3['closest_ed_name'] = tqdm([match_value(hospitals_df, 'geo_coordinate', x, 'Formal Name') for x in df3['closest_ed_loc']])

100%|██████████| 31533/31533 [00:00<00:00, 5129896.36it/s]
100%|██████████| 31533/31533 [00:00<00:00, 4744717.06it/s]


In [241]:
df3['closest_ed_loc'] = df3['closest_ed_loc'].astype('str')
df3[['ed_lat', 'ed_lon']] = df3['closest_ed_loc'].str.split(', ', 1, expand=True)

  df3[['ed_lat', 'ed_lon']] = df3['closest_ed_loc'].str.split(', ', 1, expand=True)


In [242]:
bracket_remove = ['ed_lat', 'ed_lon']
for i in bracket_remove:
    df3[i] = df3[i].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')

In [243]:
df3 = df3.drop('closest_ed_loc', axis = 1)

In [244]:
df3

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,sec_lon,proj_population_total,proj_population_children,proj_population_early_working,proj_population_prime_working,proj_population_mature_working,proj_population_elderly,closest_ed_name,ed_lat,ed_lon
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.836230,...,145.072821,25643,4215,4019,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,145.072821,25643,4215,4019,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,145.103328,25643,4215,4019,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,145.072821,25643,4215,4019,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,145.072821,25643,4215,4019,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31528,3,2,2,House,"25 CHARLES SMITH DRIVE, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Charles Smith Drive, Wonga Park, Melbourne, Ci...",-37.767758,...,145.314230,11265,1706,1747,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901
31529,2,1,2,House,"389 YARRA ROAD, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Yarra Road, Wonga Park South, Wonga Park, Melb...",-37.751110,...,145.314230,11265,1706,1747,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901
31530,2,1,1,AUF,"3/1 WEST END ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"West End Road, Warrandyte, Melbourne, City of ...",-37.743661,...,145.158557,11265,1706,1747,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901
31531,3,2,2,House,"48 BEAUTY GULLY ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"Beauty Gully Road, Warrandyte, Melbourne, City...",-37.759570,...,145.237605,11265,1706,1747,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901


#### train station

In [245]:
train_df = gpd.read_file('../data/raw/PTV/PTV_METRO_TRAIN_STATION.shp')
train_df = train_df[['STOP_NAME', 'LATITUDE', 'LONGITUDE', 'ROUTEUSSP']]
train_df['geo_coordinate'] = [(x, y) for x, y in zip(train_df['LATITUDE'], train_df['LONGITUDE'])]
train_df['n_routes'] = train_df['ROUTEUSSP'].str.count(',') + 1
train_df = train_df.drop(['LONGITUDE', 'LATITUDE', 'ROUTEUSSP'], axis=1)
train_df

Unnamed: 0,STOP_NAME,geo_coordinate,n_routes
0,Royal Park Railway Station (Parkville),"(-37.781193, 144.952301)",1
1,Flemington Bridge Railway Station (North Melbo...,"(-37.78814, 144.939323)",1
2,Macaulay Railway Station (North Melbourne),"(-37.794267, 144.936166)",1
3,North Melbourne Railway Station (West Melbourne),"(-37.807419, 144.94257)",6
4,Clifton Hill Railway Station (Clifton Hill),"(-37.788657, 144.995417)",2
...,...,...,...
215,Coburg Railway Station (Coburg),"(-37.742345, 144.963336)",1
216,Moreland Railway Station (Coburg),"(-37.754485, 144.961823)",1
217,Anstey Railway Station (Brunswick),"(-37.761242, 144.960684)",1
218,Brunswick Railway Station (Brunswick),"(-37.767721, 144.959587)",1


In [251]:
df3['closest_train_loc'] = tqdm([closest_point(x, list(train_df['geo_coordinate'])) for x in df3['geo_coordinate']])
df3['train_stop'] = tqdm([match_value(train_df, 'geo_coordinate', x, 'STOP_NAME') for x in df3['closest_train_loc']])
df3['train_n_lines'] = tqdm([match_value(train_df, 'geo_coordinate', x, 'n_routes') for x in df3['closest_train_loc']])

100%|██████████| 31533/31533 [00:00<00:00, 5331303.94it/s]
100%|██████████| 31533/31533 [00:00<00:00, 5459381.99it/s]
100%|██████████| 31533/31533 [00:00<00:00, 4745398.01it/s]


In [252]:
df3['closest_train_loc'] = df3['closest_train_loc'].astype('str')
df3[['train_lat', 'train_lon']] = df3['closest_train_loc'].str.split(', ', 1, expand=True)

  df3[['train_lat', 'train_lon']] = df3['closest_train_loc'].str.split(', ', 1, expand=True)


In [253]:
bracket_remove = ['train_lat', 'train_lon']
for i in bracket_remove:
    df3[i] = df3[i].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')
df3 = df3.drop('closest_train_loc', axis=1)

In [254]:
df3

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,proj_population_prime_working,proj_population_mature_working,proj_population_elderly,closest_ed_name,ed_lat,ed_lon,train_stop,train_lat,train_lon,train_n_lines
0,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.836230,...,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672,Hartwell Railway Station (Camberwell),-37.843985,145.075560,1
1,4,2,3,,"1 NEVIS STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Nevis Street, Camberwell, Melbourne, City of B...",-37.843101,...,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672,Hartwell Railway Station (Camberwell),-37.843985,145.075560,1
2,3,2,2,Townhouse,"1/1017 TOORAK ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Toorak Road, Camberwell, Melbourne, City of Bo...",-37.850405,...,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672,Burwood Railway Station (Glen Iris),-37.851563,145.080511,1
3,2,1,1,AUF,"1/11 EDDY STREET, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Eddy Street, Camberwell, Melbourne, City of Bo...",-37.846792,...,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672,Hartwell Railway Station (Camberwell),-37.843985,145.075560,1
4,2,1,1,AUF,"1/11 ORANGE GROVE, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Orange Grove, Camberwell, Melbourne, City of B...",-37.841181,...,9588,2937,4884,Box Hill Hospital,-37.815458,145.119672,Willison Railway Station (Camberwell),-37.835716,145.070298,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31528,3,2,2,House,"25 CHARLES SMITH DRIVE, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Charles Smith Drive, Wonga Park, Melbourne, Ci...",-37.767758,...,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901,Croydon Railway Station (Croydon),-37.795437,145.280598,1
31529,2,1,2,House,"389 YARRA ROAD, WONGA PARK",Wonga Park,3115,https://www.oldlistings.com.au/real-estate/VIC...,"Yarra Road, Wonga Park South, Wonga Park, Melb...",-37.751110,...,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901,Croydon Railway Station (Croydon),-37.795437,145.280598,1
31530,2,1,1,AUF,"3/1 WEST END ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"West End Road, Warrandyte, Melbourne, City of ...",-37.743661,...,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901,Eltham Railway Station (Eltham),-37.713550,145.147822,1
31531,3,2,2,House,"48 BEAUTY GULLY ROAD, WARRANDYTE",Warrandyte,3113,https://www.oldlistings.com.au/real-estate/VIC...,"Beauty Gully Road, Warrandyte, Melbourne, City...",-37.759570,...,3789,1611,2412,Maroondah Hospital,-37.809273,145.247901,Ringwood Railway Station (Ringwood),-37.815660,145.229474,2


### final check and export

In [255]:
df3 = df3.drop('geo_coordinate', axis = 1)

In [256]:
df3.isna().sum()
# consider we still have a lot of entries
# and we want to produce the best model
# drop all rows with any NaN
# this shall not be applied if the dataset is small

bed                                  0
bath                                 0
car                                  0
type                              5895
address                              0
suburb                               0
postcode                             0
url                                  0
loc_address                          0
lat                                  0
lon                                  0
weekly_rent                          0
list_date                            0
list_history                         0
list_count                           0
SA2_NAME_2016                        0
SA2                                  0
lgaregion                            0
income_earner                        0
income_median_age                    0
income_median                        0
income_mean                          0
income_top_10_pct                  112
population_total                     0
population_children                  0
population_prime_working 

In [257]:
df3 = df3.dropna()

In [258]:
len(df3)

25551

In [270]:
features = ['lat', 'lon', 'pri_lat', 'pri_lon', 'sec_lat', 'sec_lon', 'ed_lat', 'ed_lon', 'train_lat', 'train_lon']

In [264]:
df3 = df3[df3['lat'] < -34]
df3 = df3[df3['lat'] > -40]
df3 = df3[df3['lon'] > 140]
df3 = df3[df3['lon'] < 150]

In [271]:
len(df3)

22284

In [266]:
df3.to_csv('../data/curated/listing_with_features.csv')

In [272]:
for i in features:
    print(i)
    print(df3[i].min())
    print(df3[i].max())

lat
-37.9999943
-37.1097723
lon
144.324648
146.3497923
pri_lat
-38.015915
-37.102798
pri_lon
144.255106
146.25688
sec_lat
-38.109956
-37.062676
sec_lon
144.340792
146.088565
ed_lat
-38.1726891
-36.35411275
ed_lon
144.667923
146.31380413326383
train_lat
-38.080614
-37.579091
train_lon
144.661118
145.486379
