In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Read the raw property table, using the CSV's first column as the row index.
properties = pd.read_csv(
    "../data/raw/property_data.csv",
    index_col=0,
)
properties.head()


Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,5408/500 Elizabeth Street Melbourne VIC 3000,440,"[-37.8072443, 144.9602814]",1,1,−,Apartment / Unit / Flat,BRADY residential,3000.0
1,502/118 Russell Street Melbourne VIC 3000,620,"[-37.8135864, 144.9687232]",1,1,−,Apartment / Unit / Flat,Dingle Partners,3000.0
2,202A/441 Lonsdale Street Melbourne VIC 3000,300,"[-37.8134292, 144.9594445]",1,1,−,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000.0
3,57/243 Collins Street Melbourne VIC 3000,400,"[-37.8159969, 144.9657956]",1,1,−,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0


## Preprocessing

In [4]:
# Count the rows that use the '−' placeholder in Bed / Bath / Parking,
# and the rows whose Postcode is missing.
nan_bed = properties.loc[properties['Bed'].eq('−')]
nan_bath = properties.loc[properties['Bath'].eq('−')]
nan_park = properties.loc[properties['Parking'].eq('−')]
nan_postcode = properties.loc[properties['Postcode'].isna()]
print(len(nan_bed), len(nan_bath), len(nan_park), len(nan_postcode), sep='\n')

97
8
1894
42


In [5]:
# Strip the commas out of Cost, and swap the '−' placeholder for a number:
# 0 for Bed and Parking, but 1 for Bath.
properties = properties.assign(
    Cost=properties['Cost'].str.replace(',', ''),
    Bed=properties['Bed'].replace('−', 0),
    Bath=properties['Bath'].replace('−', 1),
    Parking=properties['Parking'].replace('−', 0),
)

In [6]:
# Rows without a postcode are treated as invalid and discarded.
properties = properties.loc[properties['Postcode'].notna()]

In [7]:
# Discard rows whose Coordinates string is exactly '[0.0, 0.0]'.
properties = properties.loc[properties['Coordinates'].ne('[0.0, 0.0]')]

In [8]:
# Cast the cleaned columns: Cost to float, the room/parking counts to int.
properties = properties.astype(
    {'Cost': float, 'Bed': int, 'Bath': int, 'Parking': int}
)
# Postcode goes through int first so it is stored as '3000' rather than '3000.0'.
properties = properties.assign(
    Postcode=properties['Postcode'].astype(int).astype(str)
)

properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,5408/500 Elizabeth Street Melbourne VIC 3000,440.0,"[-37.8072443, 144.9602814]",1,1,0,Apartment / Unit / Flat,BRADY residential,3000
1,502/118 Russell Street Melbourne VIC 3000,620.0,"[-37.8135864, 144.9687232]",1,1,0,Apartment / Unit / Flat,Dingle Partners,3000
2,202A/441 Lonsdale Street Melbourne VIC 3000,300.0,"[-37.8134292, 144.9594445]",1,1,0,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000
3,57/243 Collins Street Melbourne VIC 3000,400.0,"[-37.8159969, 144.9657956]",1,1,0,Apartment / Unit / Flat,Harcourts Melbourne City,3000
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625.0,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000


In [9]:
# Discard rows whose Cost is exactly zero.
properties = properties.loc[properties['Cost'].ne(0)]

In [10]:
# Show the ten largest Cost values to look for outliers at the top of the range.
properties['Cost'].sort_values(ascending=False).head(10)

13298    47000.0
1160     39000.0
9448     39000.0
7844     38000.0
8761      5000.0
13281     5000.0
13108     4000.0
10664     3850.0
10544     3775.0
6156      3750.0
Name: Cost, dtype: float64

In [11]:
# Keep only rows with Cost of at most 20000; this removes the 38000-47000
# values shown in the previous output.
properties = properties.loc[properties['Cost'].le(20000)]

In [13]:
# Snapshot the cleaned table with a fresh 0..n-1 index and write it out as CSV and JSON.
# `new` is the frame the distance tables are merged onto further down.
# NOTE(review): `new` is taken here, before the property-type filtering cell that follows,
# so that filtering is not reflected in `new` or in these files — confirm this is intended.
new = properties.reset_index(drop=True)
new.to_csv('../data/curated/properties_processed.csv')
new.to_json('../data/curated/properties_processed.json')

In [None]:
# Exclude the 'Carspace' and 'Block of Units' property types.
# NOTE(review): this cell rebinds `properties` only; the `new` frame created in the
# export cell above (and the files written from it) is not affected — confirm intent.
properties = properties[
    ~properties['Property_Type'].isin(['Carspace', 'Block of Units'])
]

# Relabel two property types. The replacement is restricted to the Property_Type
# column so an identical string in another column (e.g. Name or Agency) is untouched.
# NOTE(review): 'New Apartments / Off the Plan' -> 'Townhouse' is kept from the
# original mapping; confirm it should not be 'Apartment / Unit / Flat'.
properties = properties.assign(
    Property_Type=properties['Property_Type'].replace({
        'Penthouse': 'Apartment / Unit / Flat',
        'New Apartments / Off the Plan': 'Townhouse',
    })
)

# numeric_only=True limits the per-type totals to numeric columns. Without it,
# recent pandas versions also "sum" the string columns (Name, Coordinates, ...).
properties.groupby('Property_Type').sum(numeric_only=True)

## Merge Distance Datasets

In [15]:
relative_dir = "../data/curated/"

# Each curated file is read with its first column as the index, then reduced to
# the listing Name plus the column(s) that are expanded in the cells below.
cbd_distance = pd.read_csv(relative_dir + "cbd_distance.csv", index_col=0)
cbd_distance = cbd_distance.loc[:, ['Name', 'CBD_distance']]

train = pd.read_csv(relative_dir + "closest_train.csv", index_col=0)
train = train.loc[:, ['Name', 'PTV']]

park = pd.read_csv(relative_dir + "closest_park.csv", index_col=0)
park = park.loc[:, ['Name', 'Park']]

postoffice = pd.read_csv(relative_dir + "closest_postoffice.csv", index_col=0)
postoffice = postoffice.loc[:, ['Name', 'Post_Office']]

school = pd.read_csv(relative_dir + "school_info.csv", index_col=0)
school = school.loc[
    :, ['Name', 'Nearby_Schools', 'Primary_Distance', 'Secondary_Distance']
]

In [16]:
def seperate_dictionary(data, column):
    """Expand a column of stringified dictionaries into one column per key.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that contains ``column``.
    column : str
        Name of a column whose cells are strings holding Python literals,
        e.g. ``"{'distance': 1.0, 'duration': 2.0}"``.

    Returns
    -------
    pd.DataFrame
        ``data`` without ``column``, concatenated column-wise with the
        expanded values (one new column per dictionary key).

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    # ast.literal_eval accepts only literals, so unlike eval() a crafted cell in
    # the CSV cannot execute arbitrary code.
    parsed = data[column].map(ast.literal_eval)
    expanded = parsed.apply(pd.Series)
    return pd.concat([data.drop(column, axis=1), expanded], axis=1)

In [17]:
# Expand the CBD_distance dictionaries, discard the column labelled 0 that the
# expansion produces, and give distance/duration a cbd_ prefix.
cbd_distance = (
    seperate_dictionary(cbd_distance, 'CBD_distance')
    .drop(columns=[0])
    .rename(columns={'distance': 'cbd_distance', 'duration': 'cbd_duration'})
)


In [18]:
# Expand the PTV dictionaries and give distance/duration a station_ prefix.
train = seperate_dictionary(train, 'PTV').rename(
    columns={'distance': 'station_distance', 'duration': 'station_duration'}
)


In [19]:
# Expand the Park dictionaries and give distance/duration a park_ prefix.
park = seperate_dictionary(park, 'Park').rename(
    columns={'distance': 'park_distance', 'duration': 'park_duration'}
)


In [20]:
# Expand the Post_Office dictionaries and give distance/duration a postoffice_ prefix.
postoffice = seperate_dictionary(postoffice, 'Post_Office').rename(
    columns={'distance': 'postoffice_distance', 'duration': 'postoffice_duration'}
)

In [21]:
# Expand both school distance dictionaries. The primary columns must be renamed
# before the secondary ones are expanded, because both expansions produce
# columns called 'distance' and 'duration'.
school = (
    seperate_dictionary(school, 'Primary_Distance')
    .rename(columns={'distance': 'primary_distance', 'duration': 'primary_duration'})
    .pipe(seperate_dictionary, 'Secondary_Distance')
    .rename(columns={'distance': 'secondary_distance', 'duration': 'secondary_duration'})
)

In [24]:
print(len(new))

# Inner-join every distance table onto the processed listings by Name.
# A Name that occurs more than once fans out into several merged rows
# (see the index gaps in the displayed result), so duplicates are removed next.
merged_distances = (
    new
    .merge(cbd_distance, on='Name', how='inner')
    .merge(train, on='Name', how='inner')
    .merge(park, on='Name', how='inner')
    .merge(postoffice, on='Name', how='inner')
    .merge(school, on='Name', how='inner')
)

# Keep the first merged row for each distinct combination of the original listing columns.
merged_distances = merged_distances.drop_duplicates(subset=new.columns, keep='first')

print(len(merged_distances))
merged_distances.head()


13221
13197


Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode,cbd_distance,...,park_distance,park_duration,0,postoffice_distance,postoffice_duration,Nearby_Schools,primary_distance,primary_duration,secondary_distance,secondary_duration
0,5408/500 Elizabeth Street Melbourne VIC 3000,440.0,"[-37.8072443, 144.9602814]",1,1,0,Apartment / Unit / Flat,BRADY residential,3000,749.2,...,423.1,68.7,,327.5,41.7,612,1511.1,158.2,923.1,105.6
64,502/118 Russell Street Melbourne VIC 3000,620.0,"[-37.8135864, 144.9687232]",1,1,0,Apartment / Unit / Flat,Dingle Partners,3000,951.3,...,470.2,65.5,,470.1,68.3,630,1652.9,183.0,438.8,68.5
65,202A/441 Lonsdale Street Melbourne VIC 3000,300.0,"[-37.8134292, 144.9594445]",1,1,0,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000,577.3,...,841.9,105.1,,557.1,71.1,612,2154.9,233.7,738.2,110.8
66,57/243 Collins Street Melbourne VIC 3000,400.0,"[-37.8159969, 144.9657956]",1,1,0,Apartment / Unit / Flat,Harcourts Melbourne City,3000,846.9,...,2019.2,294.8,,184.3,47.4,625,2424.5,256.7,1211.2,160.1
67,2311/601 Little Lonsdale Street Melbourne VIC ...,625.0,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000,1052.5,...,607.6,89.2,,324.1,62.8,608,1626.5,239.6,268.9,50.8


In [26]:
# Remove the leftover column labelled 0 and renumber the rows from zero.
merged_distances = merged_distances.drop(columns=[0]).reset_index(drop=True)

In [27]:
# Write the merged listings-plus-distances table to the curated data folder.
merged_distances.to_csv('../data/curated/all_distances.csv')

In [28]:
# Display the final merged table (pandas truncates the rendering for large frames).
merged_distances

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode,cbd_distance,...,station_duration,park_distance,park_duration,postoffice_distance,postoffice_duration,Nearby_Schools,primary_distance,primary_duration,secondary_distance,secondary_duration
0,5408/500 Elizabeth Street Melbourne VIC 3000,440.0,"[-37.8072443, 144.9602814]",1,1,0,Apartment / Unit / Flat,BRADY residential,3000,749.2,...,93.1,423.1,68.7,327.5,41.7,612,1511.1,158.2,923.1,105.6
1,502/118 Russell Street Melbourne VIC 3000,620.0,"[-37.8135864, 144.9687232]",1,1,0,Apartment / Unit / Flat,Dingle Partners,3000,951.3,...,126.5,470.2,65.5,470.1,68.3,630,1652.9,183.0,438.8,68.5
2,202A/441 Lonsdale Street Melbourne VIC 3000,300.0,"[-37.8134292, 144.9594445]",1,1,0,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000,577.3,...,74.2,841.9,105.1,557.1,71.1,612,2154.9,233.7,738.2,110.8
3,57/243 Collins Street Melbourne VIC 3000,400.0,"[-37.8159969, 144.9657956]",1,1,0,Apartment / Unit / Flat,Harcourts Melbourne City,3000,846.9,...,180.9,2019.2,294.8,184.3,47.4,625,2424.5,256.7,1211.2,160.1
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625.0,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000,1052.5,...,81.5,607.6,89.2,324.1,62.8,608,1626.5,239.6,268.9,50.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13192,16 Kenneth street Inverloch VIC 3996,470.0,"[-38.629687, 145.7322922]",3,2,1,House,Southcoast First National Inverloch,3996,148124.5,...,0.0,391.0,76.0,680.0,83.7,18,625.5,79.1,13615.8,858.0
13193,Inverloch VIC 3996,475.0,"[-38.6314613, 145.7293638]",3,2,1,House,Southcoast First National Inverloch,3996,147692.0,...,0.0,550.4,89.9,262.9,35.6,18,68.1,9.9,13183.2,805.3
13194,25A Veronica Street Inverloch VIC 3996,400.0,"[-38.6428993, 145.7101579]",3,1,1,House,Stockdale & Leggo Inverloch,3996,148353.5,...,0.0,891.6,206.5,0.0,0.0,15,2671.9,239.7,13844.7,917.5
13195,16B Sandy Mount Avenue Inverloch VIC 3996,400.0,"[-38.6340011, 145.725239]",2,1,1,House,Alex Scott & Staff Inverloch,3996,147686.6,...,0.0,146.5,13.7,942.7,98.9,18,618.6,54.6,13177.9,807.1
