Feature engineering

In [None]:
# this notebook performs feature engineering on the pre-processed dataset
# features are embedded by appending columns to each record

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import geopy
import collections
import geopandas as gpd
from scipy.spatial.distance import cdist

In [None]:
# load the scraped, pre-processed historical listings (the first CSV column is dropped)
files_dire = '../data/curated/'
listing_path = f'{files_dire}/processed_listing.csv'
df = pd.read_csv(listing_path).iloc[:, 1:]
# order the rows so each property's listings sit together, most recent listing first
df = df.sort_values(['address', 'list_date'], ascending=[True, False]).reset_index(drop=True)
df

In [None]:
# number of listing rows before any filtering
df.shape[0]

In [None]:
# pandas does not recognise the date strings on its own, so every value is parsed explicitly
df['list_date'] = df['list_date'].apply(pd.to_datetime)
# collect the addresses that have at least one listing in 2021 or 2022;
# properties without a recent listing are removed further down
listed_2021 = df.loc[df['list_date'].dt.year == 2021, 'address'].tolist()
listed_2022 = df.loc[df['list_date'].dt.year == 2022, 'address'].tolist()
addresses = list(set(listed_2021 + listed_2022))

In [None]:
# number of distinct addresses collected above (one per property listed in 2021 or 2022)
len(addresses)

In [None]:
# keep every listing (from any year) of the properties that were listed in 2021 or 2022
# the index is reset so labels run 0..n-1 again: code that seeds a frame from
# df.iloc[0:1] and then addresses that row by label 0 relies on labels matching positions
df = df.loc[df['address'].isin(addresses)].reset_index(drop=True)
# size of the reduced dataframe
len(df)

In [None]:
# row count of the filtered listings, used as the upper bound of the loop below
n_rows = df.shape[0]

In [None]:
# df2 keeps a single row per property and replaces the previous dataframe;
# it is seeded with the first row of df
# 'list_history' will hold, per property, a list of
# [date difference in years, price difference] entries
df2 = df.iloc[[0]].assign(list_history='')
df2.head()

In [None]:
# state carried through the loop: the current property counter, its accumulated
# history, and the date / rent of the most recently visited listing
first_listing = df.iloc[0]
n_property, past_listing = 0, []
list_date = first_listing['list_date']
list_price = first_listing['weekly_rent']

In [None]:
# Collapse the listings into one row per property (df2) and record each property's
# listing history as a list of [date difference in years, relative price difference].
#
# Changes compared with the previous row-by-row version:
# * years are obtained by dividing by a fixed-length Timedelta; np.timedelta64(1, 'Y')
#   is an ambiguous unit that pandas >= 2.0 refuses to convert
# * df2 is assembled once from the collected row positions instead of being enlarged
#   with df2.loc[n] = ... for every property (quadratic, and it degrades column dtypes)
# * only positional access is used, so the result no longer depends on df's index labels

# mean Gregorian year (365.2425 days), the length numpy uses for its 'Y' unit
ONE_YEAR = pd.Timedelta(seconds=31_556_952)

listings = df.reset_index(drop=True)
first_positions = []   # position of the first (most recent) listing of each property
histories = []         # histories[k] belongs to the property at first_positions[k]

current_address = None
list_date = list_price = None
# rents are iterated as numpy scalars so a zero rent yields inf/nan (as before)
# instead of raising ZeroDivisionError
rows = zip(listings['address'], listings['list_date'], listings['weekly_rent'].to_numpy())
for position, (address, date, rent) in enumerate(tqdm(rows, total=len(listings))):
    if first_positions and address == current_address:
        # same property as the previous row; rows are sorted newest-first, so `date`
        # is the older listing and the year difference comes out negative
        years = (date - list_date) / ONE_YEAR
        price = (list_price - rent) / rent
        # two listings on the same date are treated as duplicates and not recorded
        if years != 0:
            histories[-1].append([years, price])
    else:
        # a different address starts a new property with an empty history
        first_positions.append(position)
        histories.append([])
        current_address = address
    # the next row is always compared with the row that was just visited
    list_date, list_price = date, rent

df2 = listings.iloc[first_positions].reset_index(drop=True)
# dtype=object keeps every history as a Python list inside a single cell
df2['list_history'] = pd.Series(histories, index=df2.index, dtype=object)

In [None]:
# list_count = length of each property's list_history, i.e. how many past listings were recorded
df2 = df2.assign(list_count=df2['list_history'].str.len())

In [None]:
# rename the column for legibility
# `errors` only accepts 'ignore' or 'raise'; the previous value 'coerce' is not documented
# and was silently treated like 'ignore', which is now spelled out explicitly
df2 = df2.rename(columns={'code': 'postcode'}, errors="ignore")
# this file is stored to share with other group members
# df2.to_csv('../data/curated/processed_listing_combined.csv')

In [None]:
# preview the first 20 collapsed properties
df2.iloc[:20]

In [None]:
# should be roughly the number of unique addresses counted earlier
df2.shape[0]

In [None]:
# tally how many properties have each number of past listings
col_list = sorted(df2['list_count'].tolist())
counter = collections.Counter(col_list)
print(counter)
# e.g. an entry "1: 3xxxx" means 3xxxx properties have been listed once in the past

### Embed SA2 area information to the existing dataframe

In [None]:
# read in the postcode information
postcode_df = pd.read_csv('../data/raw/abs/australian_postcodes.csv')
# only Victorian rows are relevant; .copy() makes the filtered frame independent so the
# column assignment below does not trigger SettingWithCopyWarning
postcode_df = postcode_df.loc[postcode_df['state'] == 'VIC'].copy()
# title-case the locality names (presumably the listing suburbs are title-cased - confirm)
postcode_df['locality'] = postcode_df['locality'].str.title()
postcode_df = postcode_df[['postcode', 'locality', 'SA2_NAME_2016', 'SA2_MAINCODE_2016', 'lgaregion']]
# rename so the lookup table shares its key names with the listing dataframe ('vlookup' style)
postcode_df = postcode_df.rename(columns={'locality': 'suburb'}, errors="raise")
# keep one row per (postcode, suburb): repeated keys would duplicate every matching
# property when the table is merged into the listings
postcode_df = postcode_df.drop_duplicates(subset=['postcode', 'suburb'])
postcode_df.head()

In [None]:
# df3 is the dataframe carried through to the final export
# every property picks up its SA2 details by matching on postcode and suburb name together
df3 = df2.merge(postcode_df, on=['postcode', 'suburb'])
df3 = df3.astype({'SA2_MAINCODE_2016': int}).rename(columns={'SA2_MAINCODE_2016': 'SA2'}, errors="raise")
df3.head()

In [None]:
# rows left after the inner merge with the postcode table
df3.shape[0]

In [None]:
# metro_melb lists every LGA of metropolitan Melbourne, as defined at
# https://liveinmelbourne.vic.gov.au/discover/melbourne-victoria/metropolitan-melbourne
metro_melb = [
    'Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey',
    'Darebin', 'Frankston', 'Glen Eira', 'Greater Dandenong', 'Hobsons Bay',
    'Hume', 'Kingston', 'Knox', 'Manningham', 'Maribyrnong', 'Maroondah',
    'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland',
    'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington',
    'Whitehorse', 'Whittlesea', 'Wyndham', 'Yarra', 'Yarra Ranges',
]

In [None]:
# restrict the study to metropolitan Melbourne, which is the area of interest
in_metro = df3['lgaregion'].isin(metro_melb)
df3 = df3[in_metro]

In [None]:
# rows left after restricting to metropolitan Melbourne
df3.shape[0]

### Embed income data

In [None]:
# the csv file was exported from MS Excel because the original layout is awkward to parse directly
income_df = pd.read_csv(f'{files_dire}/abs/income_distribution.csv')
# only retain information that will be used
income_df = income_df[['SA2', 'Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']]
# remove thousands separators so the figures can be parsed as numbers
income_df = income_df.replace(',','', regex=True)
cols = ['Earners', 'Median age of earners', 'Median', 'Mean', 'Top 10%']
# convert column by column (the pandas default) rather than row by row with axis=1:
# the parsed values are the same, but it needs one to_numeric call per column instead
# of one per row and gives every column a single consistent dtype
income_df[cols] = income_df[cols].apply(pd.to_numeric, errors='coerce')
income_df = income_df.rename(columns={'Earners': 'income_earner', 'Median age of earners': 'income_median_age', 'Median': 'income_median', 'Mean': 'income_mean', 'Top 10%': 'income_top_10_pct'}, errors="raise")
# the information is looked up by SA2 code: number of earners in the SA2, median age of the earners,
# median income, mean income, and the 'Top 10%' figure of the source table

In [None]:
# attach the income statistics of each property's SA2
df3 = df3.merge(income_df, on='SA2')

### Embed population statistics

In [None]:
# the csv file was exported from MS Excel because the original layout is awkward to parse directly
population_df = pd.read_csv(f'{files_dire}/abs/population.csv')
# the first eight columns are not used
population_df = population_df.iloc[: , 8:]
# the five-year age bands are summed into the following categories
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
population_df['population_children'] = population_df[['0-4', '5–9', '10–14']].sum(axis = 1)
population_df = population_df.drop(['0-4', '5–9', '10–14'], axis=1)
population_df['population_prime_working'] = population_df[['25–29', '30–34', '35–39', '40–44', '45–49', '50–54']].sum(axis = 1)
population_df = population_df.drop(['25–29', '30–34', '35–39', '40–44', '45–49', '50–54'], axis=1)
population_df['population_elderly'] = population_df[['65–69', '70–74', '75–79', '80–84', '85 and over']].sum(axis = 1)
population_df = population_df.drop(['65–69', '70–74', '75–79', '80–84', '85 and over'], axis=1)
# the bands are addressed by name instead of by position (iloc[:, 2] ... iloc[:, 5]),
# so the sums stay correct even if the column order of the file changes
population_df['population_early_working'] = population_df['15–19'] + population_df['20–24']
population_df['population_mature_working'] = population_df['55–59'] + population_df['60–64']
population_df = population_df.drop(['15–19', '20–24', '55–59', '60–64', 'SA2 name'], axis=1)
population_df = population_df.rename(columns={'Total persons': 'population_total', 'SA2 code': 'SA2'}, errors="raise")

In [None]:
# attach the population statistics of each property's SA2
df3 = df3.merge(population_df, on='SA2')

### Embed school locations

In [None]:
school_df = pd.read_csv('../data/raw/schools.csv')
# columns that are not needed for the features are discarded up front
unused_columns = [
    'Entity_Type', 'School_No', 'School_Status', 'Address_Line_1', 'Address_Line_2',
    'Address_Town', 'Address_State', 'Address_Postcode',
    'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Town',
    'Postal_State', 'Postal_Postcode', 'Full_Phone_No', 'LGA_ID',
    'LGA_Name',
]
school_df = school_df.drop(columns=unused_columns)
# pair each school's Y and X values into one (Y, X) tuple
school_df['geo_coordinate'] = list(zip(school_df['Y'], school_df['X']))
# keep government (public) schools only
school_df = school_df[school_df['Education_Sector'] == 'Government']

In [None]:
# adapted from a Code Review Stack Exchange answer
# https://codereview.stackexchange.com/questions/28207/finding-the-closest-point-to-a-list-of-points
# possible extension: shortlist the 3-5 nearest candidates and ask a routing API which one
# is truly the closest, because the nearest point is not always the quickest to reach
# (e.g. 1.5km vs 1.7km away, but driving to the latter one is faster)
# the same reasoning applies to hospitals and train stations
# it is not done here because API usage is limited
def closest_point(point, points):
    """Return the element of `points` with the smallest cdist distance to `point`."""
    distances = cdist([point], points)
    nearest = distances.argmin()
    return points[nearest]


def match_value(df, col1, x, col2):
    """Return the `col2` value of the first row of `df` whose `col1` equals `x`."""
    matching_rows = df.loc[df[col1] == x, col2]
    return matching_rows.values[0]

In [None]:
# pair each property's latitude and longitude into one (lat, lon) tuple
df3['geo_coordinate'] = list(zip(df3['lat'], df3['lon']))

##### primary schools

In [None]:
# schools typed 'Primary' or 'Pri/Sec' count as primary schools
school_type = ['Primary', 'Pri/Sec']
is_primary = school_df['School_Type'].isin(school_type)
school_primary_df = school_df[is_primary]

In [None]:
# nearest primary school of every property
# the candidate list is built once instead of once per property, and tqdm wraps the
# iterable that is being consumed so the bar follows the actual computation
# (wrapping the finished list only showed a bar that completed instantly)
primary_locations = list(school_primary_df['geo_coordinate'])
df3['closest_primary_school_loc'] = [closest_point(x, primary_locations) for x in tqdm(df3['geo_coordinate'])]
df3['primary_school_name'] = [match_value(school_primary_df, 'geo_coordinate', x, 'School_Name') for x in tqdm(df3['closest_primary_school_loc'])]

##### secondary schools

In [None]:
# schools typed 'Secondary' or 'Pri/Sec' count as secondary schools
school_type = ['Secondary', 'Pri/Sec']
is_secondary = school_df['School_Type'].isin(school_type)
school_secondary_df = school_df[is_secondary]

In [None]:
# embed secondary school ranking information
# considering some families may rent a property for its free education
# the last table found on the page is taken as the ranking table
school_ranking = pd.read_html('https://bettereducation.com.au/Results/vcePublicSchoolResults.aspx')[-1]
school_ranking = school_ranking[['Better Education Rank', 'School', 'Unit 3-4 cohort']]
# `errors` only accepts 'ignore' or 'raise' ('coerce' is not a valid value); the three
# columns were selected on the line above, so 'raise' cannot fail here and matches the
# other renames in this notebook
school_ranking = school_ranking.rename(columns={'School': 'School_Name', 'Better Education Rank': 'secondary_school_rank', 'Unit 3-4 cohort': 'secondary_cohort'}, errors="raise")
# inner merge: secondary schools whose name is not in the ranking table are dropped
school_secondary_df = pd.merge(school_secondary_df, school_ranking, on='School_Name')

In [None]:
# nearest ranked secondary school of every property, with its name, rank and cohort size
# the candidate list is built once instead of once per property, and tqdm wraps the
# iterable that is being consumed so the bar follows the actual computation
secondary_locations = list(school_secondary_df['geo_coordinate'])
df3['closest_secondary_school_loc'] = [closest_point(x, secondary_locations) for x in tqdm(df3['geo_coordinate'])]
df3['secondary_school_name'] = [match_value(school_secondary_df, 'geo_coordinate', x, 'School_Name') for x in tqdm(df3['closest_secondary_school_loc'])]
df3['secondary_school_rank'] = [match_value(school_secondary_df, 'geo_coordinate', x, 'secondary_school_rank') for x in tqdm(df3['closest_secondary_school_loc'])]
df3['secondary_school_cohort'] = [match_value(school_secondary_df, 'geo_coordinate', x, 'secondary_cohort') for x in tqdm(df3['closest_secondary_school_loc'])]

In [None]:
# the coordinates are tuples; as text they look like '(***.*****, ***.*****)'
# the direction API used later needs latitude and longitude separately,
# so they are turned into strings here and split at the comma in the next cell
loc_columns = ['closest_primary_school_loc', 'closest_secondary_school_loc']
df3[loc_columns] = df3[loc_columns].astype('str')

In [None]:
# split '(lat, lon)' into its two halves
# `n` is passed by keyword because pandas >= 2.0 only accepts `pat` positionally,
# and both columns now use the same ', ' separator so no half keeps a leading space
df3[['pri_lat', 'pri_lon']] = df3['closest_primary_school_loc'].str.split(', ', n=1, expand=True)
df3[['sec_lat', 'sec_lon']] = df3['closest_secondary_school_loc'].str.split(', ', n=1, expand=True)

In [None]:
# the split halves still carry the tuple's brackets: strip them, then parse as floats
bracket_remove = ['pri_lat', 'pri_lon', 'sec_lat', 'sec_lon']
for column in bracket_remove:
    df3[column] = df3[column].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')

In [None]:
# the combined location strings are no longer needed once lat/lon are split out
df3 = df3.drop(columns=['closest_primary_school_loc', 'closest_secondary_school_loc'])

### embed population projection

In [None]:
population_projection_df = pd.read_csv('../data/raw/population_projection.csv')
# keep the all-persons rows for the year 2027
keep_rows = (population_projection_df['SEX'] == 'Persons') & (population_projection_df['YEAR'] == 2027)
population_projection_df = population_projection_df.loc[keep_rows]
population_projection_df = population_projection_df.drop(columns=['YEAR', 'SA2_NAME', 'SEX'])
# the five-year bands are grouped exactly like the current population statistics
# 0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age), 55-64 years (mature working age), 65 years and over (elderly)
age_groups = {
    'proj_population_children': ['Age0-4', 'Age5-9', 'Age10-14'],
    'proj_population_early_working': ['Age15-19', 'Age20-24'],
    'proj_population_prime_working': ['Age25-29', 'Age30-34', 'Age35-39', 'Age40-44', 'Age45-49', 'Age50-54'],
    'proj_population_mature_working': ['Age55-59', 'Age60-64'],
    'proj_population_elderly': ['Age65-69', 'Age70-74', 'Age75-79', 'Age80-84', 'Age85+'],
}
for group_name, age_columns in age_groups.items():
    # sum the bands of the group, then discard the individual bands
    population_projection_df[group_name] = population_projection_df[age_columns].sum(axis=1)
    population_projection_df = population_projection_df.drop(columns=age_columns)
population_projection_df = population_projection_df.rename(columns={'Total': 'proj_population_total', 'SA2_CODE': 'SA2'}, errors="raise")
population_projection_df.head()

In [None]:
# attach the projected population of each property's SA2
df3 = df3.merge(population_projection_df, on='SA2')

### hospitals

In [None]:
hospitals_df = pd.read_csv('../data/raw/hospitals.csv')
# only hospitals flagged as emergency capable are kept; .copy() makes the filtered frame
# independent so adding a column below does not trigger SettingWithCopyWarning
hospitals_df = hospitals_df.loc[hospitals_df['Emergency Capable'] == 'YES'].copy()
# build a single address string to hand to the geocoder
hospitals_df['full_address'] = hospitals_df['Location Address'] + ', ' +hospitals_df['Suburb'] + ', Victoria'
hospitals_df = hospitals_df[['Formal Name', 'full_address']]

In [None]:
# get the geocode of each hospital; addresses the geocoder cannot resolve are skipped
# NOTE(review): the public Nominatim service has a usage policy on request rate -
# consider geopy.extra.rate_limiter.RateLimiter if this list grows
# a single geocoder client is created once and reused for every request
locator = geopy.Nominatim(user_agent="myGeocoder")
temp = []
for address in tqdm(hospitals_df['full_address']):
    location = locator.geocode(address, timeout=None)
    if location is not None:
        temp.append([address, location.latitude, location.longitude])

In [None]:
# turn the geocoding results into a frame and attach them to the hospitals by address
hospital_geo = pd.DataFrame(temp, columns=['full_address', 'lat', 'lon'])
hospitals_df = hospitals_df.merge(hospital_geo, on=['full_address']).drop(columns='full_address')
# pair latitude and longitude into one (lat, lon) tuple per hospital
hospitals_df['geo_coordinate'] = list(zip(hospitals_df['lat'], hospitals_df['lon']))

In [None]:
# nearest emergency-capable hospital of every property
# the candidate list is built once instead of once per property, and tqdm wraps the
# iterable that is being consumed so the bar follows the actual computation
hospital_locations = list(hospitals_df['geo_coordinate'])
df3['closest_ed_loc'] = [closest_point(x, hospital_locations) for x in tqdm(df3['geo_coordinate'])]
df3['closest_ed_name'] = [match_value(hospitals_df, 'geo_coordinate', x, 'Formal Name') for x in tqdm(df3['closest_ed_loc'])]

In [None]:
# turn the (lat, lon) tuples into text and split them at the comma
df3['closest_ed_loc'] = df3['closest_ed_loc'].astype('str')
# `n` is passed by keyword because pandas >= 2.0 only accepts `pat` positionally
df3[['ed_lat', 'ed_lon']] = df3['closest_ed_loc'].str.split(', ', n=1, expand=True)

In [None]:
# strip the tuple brackets left over from the split, then parse as floats
bracket_remove = ['ed_lat', 'ed_lon']
for column in bracket_remove:
    df3[column] = df3[column].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')

In [None]:
# the combined location string is no longer needed once lat/lon are split out
df3 = df3.drop(columns='closest_ed_loc')

#### train station

In [None]:
# load the metro train station shapefile
train_df = gpd.read_file('../data/raw/PTV/PTV_METRO_TRAIN_STATION.shp')
# keep the stop name, its latitude / longitude and the ROUTEUSSP field
train_df = train_df[['STOP_NAME', 'LATITUDE', 'LONGITUDE', 'ROUTEUSSP']]
train_df['geo_coordinate'] = list(zip(train_df['LATITUDE'], train_df['LONGITUDE']))
# n_routes = number of commas in ROUTEUSSP plus one
train_df['n_routes'] = 1 + train_df['ROUTEUSSP'].str.count(',')
train_df = train_df.drop(columns=['LONGITUDE', 'LATITUDE', 'ROUTEUSSP'])

In [None]:
# embed train station information
# very similar to schools and hospitals above
# the candidate list is built once instead of once per property, and tqdm wraps the
# iterable that is being consumed so the bar follows the actual computation
train_locations = list(train_df['geo_coordinate'])
df3['closest_train_loc'] = [closest_point(x, train_locations) for x in tqdm(df3['geo_coordinate'])]
df3['train_stop'] = [match_value(train_df, 'geo_coordinate', x, 'STOP_NAME') for x in tqdm(df3['closest_train_loc'])]
df3['train_n_lines'] = [match_value(train_df, 'geo_coordinate', x, 'n_routes') for x in tqdm(df3['closest_train_loc'])]

In [None]:
# turn the (lat, lon) tuples into text and split them at the comma
df3['closest_train_loc'] = df3['closest_train_loc'].astype('str')
# `n` is passed by keyword because pandas >= 2.0 only accepts `pat` positionally
df3[['train_lat', 'train_lon']] = df3['closest_train_loc'].str.split(', ', n=1, expand=True)

In [None]:
# strip the tuple brackets left over from the split, parse as floats,
# then discard the combined location string
bracket_remove = ['train_lat', 'train_lon']
for column in bracket_remove:
    df3[column] = df3[column].str.strip('()')
df3[bracket_remove] = df3[bracket_remove].apply(pd.to_numeric, errors='coerce')
df3 = df3.drop(columns='closest_train_loc')

### final check and export

In [None]:
# the helper tuple column was only needed for the nearest-point searches
df3 = df3.drop(columns='geo_coordinate')

In [None]:
# missing values per column
# plan noted by the authors: because plenty of entries remain and the best possible model
# is wanted, rows containing any NaN are to be dropped (not advisable for a small dataset)
# NOTE(review): no dropna() call appears in the cells visible here - confirm whether
# the drop is still intended
df3.isnull().sum()

In [None]:
# a missing property type is labelled 'unknown'
# the values must come from df3 itself: df holds one row per listing under a different
# index, so assigning df['type'] aligned on unrelated row labels and attached the
# wrong type to the properties
df3['type'] = df3['type'].fillna('unknown')

In [None]:
# show the rows that still contain at least one missing value
has_missing = df3.isna().any(axis=1)
df3.loc[has_missing]

In [None]:
# some geocodes were obviously outside of Victoria
# these are the coordinate columns that get checked for such values
features = [
    'lat', 'lon',
    'pri_lat', 'pri_lon',
    'sec_lat', 'sec_lon',
    'ed_lat', 'ed_lon',
    'train_lat', 'train_lon',
]

In [None]:
# rough bounding box only - good enough to discard coordinates that are
# obviously unreasonable; a shape file could be used if more rigour is required
lat_in_range = (df3['lat'] < -34) & (df3['lat'] > -40)
lon_in_range = (df3['lon'] > 140) & (df3['lon'] < 150)
df3 = df3[lat_in_range & lon_in_range]

In [None]:
# rows remaining after the coordinate filter
df3.shape[0]

In [None]:
# print the name, minimum and maximum of every embedded coordinate column
for column in features:
    print(column, df3[column].min(), df3[column].max(), sep='\n')

In [None]:
# write the final feature table to the curated data folder
output_path = '../data/curated/listing_with_features.csv'
df3.to_csv(output_path)