Feature engineering

In [28]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import collections
from scipy.spatial.distance import cdist

In [6]:
# read in the scraped and pre-processed past listing data
files_dire = '../data/curated/'
df = pd.read_csv(f'{files_dire}/processed_listing.csv').iloc[: , 1:]
# sort the dataframe so the same property are followed with sorted list date
df = df.sort_values(['address', 'list_date'], ascending=[True, False])
df.head()

Unnamed: 0,bed,bath,car,type,address,suburb,code,url,loc_address,lat,lon,weekly_rent,list_date
96355,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01
96356,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,349.0,2022-07-01
96357,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,375.0,2022-04-01
181672,3,1,2,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,430.0,2021-02-01
181673,3,1,2,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,450.0,2021-02-01


In [7]:
# convert the date to a date format as it cannot be auto recognised by Pandas
df['list_date'] = df['list_date'].apply(pd.to_datetime)

In [8]:
# store the number of entries for the loop below
n_rows = len(df)

In [9]:
# print the number of unique properties to have a rough idea
len(list(set(df['address'].tolist())))

82556

In [11]:
# copy the first row of the dataframe to df2
# df2 will only store each property per row to replace the prvious dataframe
df2 = df.iloc[0:1,:]
# listing history will be stored in a new column as a list with the format of [date difference in years, price difference]
df2 = df2.assign(list_history = '')
df2.head()

Unnamed: 0,bed,bath,car,type,address,suburb,code,url,loc_address,lat,lon,weekly_rent,list_date,list_history
96355,3,1,2,AUF,". GLYNDON ROAD, CAMBERWELL",Camberwell,3124,https://www.oldlistings.com.au/real-estate/VIC...,"Glyndon Road, Camberwell, Melbourne, City of B...",-37.83623,145.079003,525.0,2022-08-01,


In [14]:
# initialise for loop variables
n_property = 0
past_listing = []
list_date = df.iloc[0]['list_date']
list_price = df.iloc[0]['weekly_rent']

In [15]:
# run through all entries in the initial dataframe
for i in tqdm(range(1, n_rows)):
    # if the following row mataches with the previous one
    # calculate the date and price difference
    if df.iloc[i]['address'] == df2.iloc[n_property]['address']:
        days = (df.iloc[i]['list_date'] - list_date)/ np.timedelta64(1, 'Y')
        list_date = df.iloc[i]['list_date']
        price = list_price - df.iloc[i]['weekly_rent']
        list_price = df.iloc[i]['weekly_rent']
        if days != 0:
            temp = [days, price]
            past_listing.append(temp)
    # if the address is different (different property)
    # the past listings are stored
    else:
        df2.at[n_property, 'list_history'] = past_listing
        n_property += 1
        df2.loc[n_property] = df.iloc[i]
        past_listing = []
df2.at[n_property, 'list_history'] = past_listing

100%|██████████| 303639/303639 [13:48<00:00, 366.65it/s]


In [16]:
# a new column to store the number of times that the property has been listed for lease
df2['list_count'] = df2['list_history'].str.len()

In [17]:
# rename the column for legibility
df2 = df2.rename(columns={'code': 'postcode'}, errors="raise")
df2.head()
# this file is stored to share with other group members
# df2.to_csv('../data/curated/processed_listing_combined.csv')

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,lon,weekly_rent,list_date,list_history,list_count
96355,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,144.807537,370.0,2019-03-01,[],0
0,,,,,,,,,,,,,NaT,"[[-0.08487511721664373, 176.0], [-0.2491495376...",2
1,3.0,1.0,2.0,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147.0,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,430.0,2021-02-01,[],0
2,3.0,1.0,2.0,House,".97 HUNTINGDALE ROAD, ASHWOOD",Ashwood,3147.0,https://www.oldlistings.com.au/real-estate/VIC...,"Huntingdale Road, Ashwood, Melbourne, City of ...",-37.864211,145.112405,450.0,2021-02-01,[],0
3,3.0,1.0,1.0,House,"0 LANCASTER DRIVE, MANGALORE",Mangalore,3663.0,https://www.oldlistings.com.au/real-estate/VIC...,"Lancaster Drive, Mangalore, Shire of Strathbog...",-36.891826,145.183443,300.0,2022-02-01,[],0


In [18]:
# pivot to the number of past listings
col_list = (df2['list_count'].values.tolist())
col_list.sort()
counter = collections.Counter(col_list)
print(counter)
# it means there are 3xxxx properties that have been listed once in the past, etc

Counter({0: 66114, 1: 24246, 2: 15489, 3: 9920, 4: 5973, 5: 3676, 6: 2263, 7: 1292, 8: 777, 9: 393, 10: 246, 11: 150, 12: 66, 13: 45, 14: 23, 15: 13, 16: 11, 17: 6, 19: 6, 20: 5, 18: 3, 29: 3, 21: 2, 22: 2, 23: 2, 25: 1, 26: 1, 28: 1, 30: 1, 31: 1, 33: 1, 34: 1, 41: 1})


### Embed SA2 area information to the existing dataframe

In [19]:
# read in the postcode information
postcode_df = pd.read_csv('../data/raw/abs/australian_postcodes.csv')
# only retain information that is relavent for faster running time
postcode_df = postcode_df.loc[postcode_df['state'] == 'VIC']
# rename columns to match dataframe so it can perform a 'vlookup' properly
postcode_df['locality'] = postcode_df['locality'].str.title()
postcode_df = postcode_df[['postcode', 'locality', 'SA2_NAME_2016', 'SA2_MAINCODE_2016']]
postcode_df = postcode_df.rename(columns={'locality': 'suburb'}, errors="raise")
postcode_df.head()

Unnamed: 0,postcode,suburb,SA2_NAME_2016,SA2_MAINCODE_2016
6151,3000,Melbourne,Melbourne,206041122.0
6152,3001,Melbourne,Melbourne,206041122.0
6153,3002,East Melbourne,East Melbourne,206041119.0
6154,3003,West Melbourne,West Melbourne,206041127.0
6155,3004,Melbourne,Southbank,206041126.0


In [20]:
# df3 will be the final output dataframe
# the property informations are merged with its SA2 information by matching both postcode and suburb name
df3 = pd.merge(df2, postcode_df, on=['postcode', 'suburb'])
df3['SA2_MAINCODE_2016'] = df3['SA2_MAINCODE_2016'].astype(int)
df3 = df3.rename(columns={'SA2_MAINCODE_2016': 'SA2'}, errors="raise")
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,lon,weekly_rent,list_date,list_history,list_count,SA2_NAME_2016,SA2
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,144.807537,370.0,2019-03-01,[],0,Sunshine West,213011338
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,144.815231,380.0,2020-08-01,[],0,Sunshine West,213011338
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,144.815231,380.0,2020-06-01,[],0,Sunshine West,213011338
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,144.790654,400.0,2017-02-01,[],0,Sunshine West,213011338
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,144.790654,390.0,2014-12-01,[],0,Sunshine West,213011338


### Embed income data

In [21]:
# the csv file was exported from MS Excel as Python's ability to read csv with weird formating is limited
income_df = pd.read_csv(f'{files_dire}/abs/income_distribution.csv')
# only retain information that will be used
income_df = income_df[['SA2', 'Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']]
income_df = income_df.replace(',','', regex=True)
cols = ['Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']
income_df[cols] = income_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
income_df = income_df.rename(columns={'Earners': 'income_earner', 'Median age of earners': 'income_median_age', 'Median': 'income_median', 'Mean': 'income_mean', 'Top 10%': 'income_top_10_pct'}, errors="raise")
# information will be 'vlookup'ed by SA2 code, the data contains the number of earners in the SA2, median age of the earner
# median income of the SA2, average income of the SA2 and how much of the population are ranked 10 percent within whole Australia
income_df.head()

Unnamed: 0,SA2,income_earner,income_median_age,income_median,income_mean,income_top_10_pct
0,201011001,7989.0,42.0,53932.0,63668.0,28.7
1,201011002,7595.0,47.0,53688.0,77876.0,38.4
2,201011003,13598.0,43.0,50593.0,60367.0,29.3
3,201011004,12722.0,40.0,45828.0,50778.0,26.1
4,201011005,4249.0,46.0,52377.0,63258.0,29.3


In [22]:
# merge income information
df3 = pd.merge(df3, income_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,list_date,list_history,list_count,SA2_NAME_2016,SA2,income_earner,income_median_age,income_median,income_mean,income_top_10_pct
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,...,2019-03-01,[],0,Sunshine West,213011338,9833.0,37.0,44582.0,49129.0,25.9
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,2020-08-01,[],0,Sunshine West,213011338,9833.0,37.0,44582.0,49129.0,25.9
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,2020-06-01,[],0,Sunshine West,213011338,9833.0,37.0,44582.0,49129.0,25.9
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,2017-02-01,[],0,Sunshine West,213011338,9833.0,37.0,44582.0,49129.0,25.9
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,2014-12-01,[],0,Sunshine West,213011338,9833.0,37.0,44582.0,49129.0,25.9


### Embed population statistics

In [23]:
# the csv file was exported from MS Excel as Python's ability to read csv with weird formating is limited
population_df = pd.read_csv(f'{files_dire}/abs/population.csv')
population_df = population_df.iloc[: , 8:]
# age are grouped into following categories
# they are sumed to produce these columns
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
population_df['population_children'] = population_df[['0-4', '5–9', '10–14']].sum(axis = 1)
population_df = population_df.drop(['0-4', '5–9', '10–14'], axis=1)
population_df['population_prime_working'] = population_df[['25–29', '30–34', '35–39', '40–44', '45–49', '50–54']].sum(axis = 1)
population_df = population_df.drop(['25–29', '30–34', '35–39', '40–44', '45–49', '50–54'], axis=1)
population_df['population_elderly'] = population_df[['65–69', '70–74', '75–79', '80–84', '85 and over']].sum(axis = 1)
population_df = population_df.drop(['65–69', '70–74', '75–79', '80–84', '85 and over'], axis=1)
population_df['population_early_working'] = population_df.iloc[:,2] + population_df.iloc[:,3]
population_df['population_mature_working'] = population_df.iloc[:,4] + population_df.iloc[:,5]
population_df = population_df.drop(['15–19', '20–24', '55–59', '60–64', 'SA2 name'], axis=1)
population_df = population_df.rename(columns={'Total persons': 'population_total', 'SA2 code': 'SA2'}, errors="raise")
population_df.head()

Unnamed: 0,SA2,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working
0,201011001,16823,4075,6702,2098,2391,1557
1,201011002,12076,1806,4321,2750,1565,1634
2,201011005,7232,1532,2483,1295,944,978
3,201011006,10640,2299,4347,1543,1434,1017
4,201011007,4213,904,1533,594,559,623


In [24]:
df3 = pd.merge(df3, population_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,income_median_age,income_median,income_mean,income_top_10_pct,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,...,37.0,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,37.0,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,37.0,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,37.0,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,37.0,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974


### Embed school locations

In [29]:
school_df = pd.read_csv('../data/raw/schools.csv')
school_df = school_df.drop(['Entity_Type', 'School_No', 'School_Status', 'Address_Line_1', 'Address_Line_2',
         'Address_Town', 'Address_State', 'Address_Postcode',
         'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Town',
         'Postal_State', 'Postal_Postcode', 'Full_Phone_No', 'LGA_ID',
         'LGA_Name'], axis=1)
school_type = ['Primary', 'Secondary', 'Pri/Sec']
school_df = school_df.loc[school_df['School_Type'].isin(school_type)]
school_df['geo_coordinate'] = [(x, y) for x,y in zip(school_df['Y'], school_df['X'])]
school_df.head()

Unnamed: 0,Education_Sector,School_Name,School_Type,X,Y,geo_coordinate
0,Government,Alberton Primary School,Primary,146.666601,-38.617713,"(-38.617713, 146.666601)"
1,Government,Allansford and District Primary School,Primary,142.590393,-38.386281,"(-38.386281, 142.590393)"
2,Government,Avoca Primary School,Primary,143.475649,-37.084502,"(-37.084502, 143.475649)"
3,Government,Avenel Primary School,Primary,145.234722,-36.901368,"(-36.901368, 145.234722)"
4,Government,Warrandyte Primary School,Primary,145.21398,-37.742675,"(-37.742675, 145.21398)"


In [30]:
df3['geo_coordinate'] = [(x, y) for x,y in zip(df3['lat'], df3['lon'])]
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,income_median,income_mean,income_top_10_pct,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working,geo_coordinate
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,...,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974,"(-37.7838225, 144.8075373)"
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974,"(-37.7880126, 144.8152311)"
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974,"(-37.7880126, 144.8152311)"
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974,"(-37.7975155, 144.7906544)"
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,44582.0,49129.0,25.9,18779,3161,8047,3343,2254,1974,"(-37.7975155, 144.7906544)"


In [31]:
# this is adapted from stackoverflow
# https://codereview.stackexchange.com/questions/28207/finding-the-closest-point-to-a-list-of-points
def closest_point(point, points):
    # Find closest point from a list of points.
    return points[cdist([point], points).argmin()]
def match_value(df, col1, x, col2):
    # Match value x from col1 row to value in col2.
    return df[df[col1] == x][col2].values[0]

In [32]:
df3['closest_school_loc'] = tqdm([closest_point(x, list(school_df['geo_coordinate'])) for x in df3['geo_coordinate']])
df3['school_name'] = tqdm([match_value(school_df, 'geo_coordinate', x, 'School_Name') for x in df3['closest_school_loc']])

100%|██████████| 98185/98185 [00:00<00:00, 5392401.97it/s]
100%|██████████| 98185/98185 [00:00<00:00, 5436394.26it/s]


In [33]:
df3

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,income_top_10_pct,population_total,population_children,population_prime_working,population_elderly,population_early_working,population_mature_working,geo_coordinate,closest_school_loc,school_name
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,...,25.9,18779,3161,8047,3343,2254,1974,"(-37.7838225, 144.8075373)","(-37.787534, 144.807516)",St Paul's School
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,25.9,18779,3161,8047,3343,2254,1974,"(-37.7880126, 144.8152311)","(-37.79222, 144.818159)",Sunshine Heights Primary School
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,25.9,18779,3161,8047,3343,2254,1974,"(-37.7880126, 144.8152311)","(-37.79222, 144.818159)",Sunshine Heights Primary School
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,25.9,18779,3161,8047,3343,2254,1974,"(-37.7975155, 144.7906544)","(-37.788396, 144.798996)",Ardeer South Primary School
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,25.9,18779,3161,8047,3343,2254,1974,"(-37.7975155, 144.7906544)","(-37.788396, 144.798996)",Ardeer South Primary School
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98180,4.0,2.0,2.0,House,"6 RUSSELL STREET, TEESDALE",Teesdale,3328.0,https://www.oldlistings.com.au/real-estate/VIC...,"Russell Street, Teesdale, Golden Plains Shire,...",-38.025992,...,29.4,8065,1784,3116,1144,859,1162,"(-38.0259922, 144.0595107)","(-38.02923075, 144.0545907)",Teesdale Primary School
98181,4.0,2.0,2.0,House,"70 TEESDALE-LETHBRIDGE ROAD, TEESDALE",Teesdale,3328.0,https://www.oldlistings.com.au/real-estate/VIC...,"Teesdale-Lethbridge Road, Teesdale, Golden Pla...",-38.005393,...,29.4,8065,1784,3116,1144,859,1162,"(-38.0053932, 144.072826)","(-38.02923075, 144.0545907)",Teesdale Primary School
98182,3.0,1.0,4.0,House,"919 BANNOCKBURN -SHELFORD ROAD, TEESDALE",Teesdale,3328.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bannockburn-Shelford Road, Teesdale, Golden Pl...",-38.021889,...,29.4,8065,1784,3116,1144,859,1162,"(-38.0218894, 144.027219)","(-38.02923075, 144.0545907)",Teesdale Primary School
98183,4.0,1.0,4.0,House,"715 TAYLOR ROAD, MEREDITH",Meredith,3333.0,https://www.oldlistings.com.au/real-estate/VIC...,"Taylor Road, Meredith, Golden Plains Shire, Vi...",-37.870210,...,29.4,8065,1784,3116,1144,859,1162,"(-37.8702095, 144.0960054)","(-37.84271017, 144.0778091)",Meredith Primary School


### embed population projection

In [34]:
population_projection_df = pd.read_csv('../data/raw/population_projection.csv')
population_projection_df = population_projection_df.loc[population_projection_df['SEX'] == 'Persons']
population_projection_df = population_projection_df.loc[population_projection_df['YEAR'] == 2027]
population_projection_df = population_projection_df.drop(['YEAR', 'SA2_NAME', 'SEX'], axis=1)
# they are categories as well with the previous population statistics
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
population_projection_df['proj_population_children'] = population_projection_df[['Age0-4', 'Age5-9', 'Age10-14']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age0-4', 'Age5-9', 'Age10-14'], axis=1)
population_projection_df['proj_population_early_working'] = population_projection_df[['Age15-19', 'Age20-24']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age15-19', 'Age20-24'], axis=1)
population_projection_df['proj_population_prime_working'] = population_projection_df[['Age25-29', 'Age30-34', 'Age35-39', 'Age40-44', 'Age45-49', 'Age50-54']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age25-29', 'Age30-34', 'Age35-39', 'Age40-44', 'Age45-49', 'Age50-54'], axis=1)
population_projection_df['proj_population_mature_working'] = population_projection_df[['Age55-59', 'Age60-64']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age55-59', 'Age60-64'], axis=1)
population_projection_df['proj_population_elderly'] = population_projection_df[['Age65-69', 'Age70-74', 'Age75-79', 'Age80-84', 'Age85+']].sum(axis = 1)
population_projection_df = population_projection_df.drop(['Age65-69', 'Age70-74', 'Age75-79', 'Age80-84', 'Age85+'], axis=1)
population_projection_df = population_projection_df.rename(columns={'Total': 'proj_population_total', 'SA2_CODE': 'SA2'}, errors="raise")
population_projection_df.head()

Unnamed: 0,SA2,proj_population_total,proj_population_children,proj_population_early_working,proj_population_prime_working,proj_population_mature_working,proj_population_elderly
13862,201011001,18611,4389,2496,7275,1807,2644
13865,201011002,12252,1788,1796,3940,1674,3054
13868,201011003,26630,5094,2777,9809,3270,5680
13871,201011004,28423,4849,3734,11347,3056,5437
13874,201011005,8900,1919,1368,2966,1067,1580


In [35]:
df3 = pd.merge(df3, population_projection_df, on='SA2')
df3.head()

Unnamed: 0,bed,bath,car,type,address,suburb,postcode,url,loc_address,lat,...,population_mature_working,geo_coordinate,closest_school_loc,school_name,proj_population_total,proj_population_children,proj_population_early_working,proj_population_prime_working,proj_population_mature_working,proj_population_elderly
0,3.0,1.0,1.0,,"3/85 RIDGEWAY PARADE, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ridgeway Parade, Sunshine West, Melbourne, Cit...",-37.783822,...,1974,"(-37.7838225, 144.8075373)","(-37.787534, 144.807516)",St Paul's School,23815,4585,2889,10209,2205,3927
1,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,1974,"(-37.7880126, 144.8152311)","(-37.79222, 144.818159)",Sunshine Heights Primary School,23815,4585,2889,10209,2205,3927
2,4.0,1.0,1.0,House,"1 AINSWORTH STREET, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Ainsworth Street, Sunshine West, Melbourne, Ci...",-37.788013,...,1974,"(-37.7880126, 144.8152311)","(-37.79222, 144.818159)",Sunshine Heights Primary School,23815,4585,2889,10209,2205,3927
3,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,1974,"(-37.7975155, 144.7906544)","(-37.788396, 144.798996)",Ardeer South Primary School,23815,4585,2889,10209,2205,3927
4,4.0,2.0,3.0,House,"1 BOTTLEBRUSH COURT, SUNSHINE WEST",Sunshine West,3020.0,https://www.oldlistings.com.au/real-estate/VIC...,"Bottlebrush Court, Sunshine West, Melbourne, C...",-37.797516,...,1974,"(-37.7975155, 144.7906544)","(-37.788396, 144.798996)",Ardeer South Primary School,23815,4585,2889,10209,2205,3927


In [36]:
df3.to_csv('../data/curated/listing_with_features.csv')