In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
# Load the raw property listings, using the first CSV column as the row index,
# and preview the first rows.
raw_properties_path = "../data/raw/property_data.csv"
properties = pd.read_csv(raw_properties_path, index_col=0)
properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,5408/500 Elizabeth Street Melbourne VIC 3000,440,"[-37.8072443, 144.9602814]",1,1,−,Apartment / Unit / Flat,BRADY residential,3000.0
1,502/118 Russell Street Melbourne VIC 3000,620,"[-37.8135864, 144.9687232]",1,1,−,Apartment / Unit / Flat,Dingle Partners,3000.0
2,202A/441 Lonsdale Street Melbourne VIC 3000,300,"[-37.8134292, 144.9594445]",1,1,−,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000.0
3,57/243 Collins Street Melbourne VIC 3000,400,"[-37.8159969, 144.9657956]",1,1,−,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0


## Merge Distance Datasets

In [18]:
relative_dir = "../data/curated/"


def _read_name_and(filename, value_column):
    """Read one curated CSV and keep only 'Name' plus the given value column.

    The first CSV column is consumed as the index, exactly as for every
    other file loaded in this notebook.
    """
    frame = pd.read_csv(relative_dir + filename, index_col=0)
    return frame[['Name', value_column]]


# Each frame pairs a property 'Name' with one still-unparsed value column.
cbd_distance = _read_name_and("cbd_distance.csv", 'CBD_distance')
train = _read_name_and("closest_train.csv", 'PTV')
n_ptv = _read_name_and("num_ptv.csv", 'PTV')
park = _read_name_and("closest_park.csv", 'Park')
postoffice = _read_name_and("closest_postoffice.csv", 'Post_Office')

In [19]:
def seperate_dictionary(data, column):
    """Expand a column of stringified Python literals into separate columns.

    Each entry of ``data[column]`` is parsed and then spread with
    ``pd.Series``: a dict contributes one column per key, a list one column
    per position (labelled 0, 1, 2, ...). The parsed columns are appended to
    the right of the remaining columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : label
        Name of the column whose string values should be parsed.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``column``, followed by the expanded columns.

    Raises
    ------
    ValueError, SyntaxError
        If an entry is not a valid Python literal.
    """
    # ast.literal_eval only accepts literals (dicts, lists, numbers, strings,
    # None, ...), so text read from a CSV can never execute code the way the
    # built-in eval() would.
    import ast  # local import keeps the function self-contained

    parsed = data[column].map(ast.literal_eval)
    expanded = parsed.apply(pd.Series)
    return pd.concat([data.drop(column, axis=1), expanded], axis=1)

In [20]:
# Expand the CBD_distance strings into columns, discard the extra column
# labelled 0 that the expansion produces, and prefix the fields with `cbd_`.
cbd_distance = (
    seperate_dictionary(cbd_distance, 'CBD_distance')
    .drop(columns=0)
    .rename(columns={'distance': 'cbd_distance', 'duration': 'cbd_duration'})
)

In [21]:
# Expand the PTV strings from closest_train.csv and label the fields as
# station distance / duration.
train = seperate_dictionary(train, 'PTV').rename(
    columns={'distance': 'station_distance', 'duration': 'station_duration'}
)

In [22]:
# Expand the PTV strings from num_ptv.csv; the positional columns 0, 1 and 2
# become ptv_1, ptv_2 and ptv_3.
n_ptv = seperate_dictionary(n_ptv, 'PTV').rename(
    columns={0: 'ptv_1', 1: 'ptv_2', 2: 'ptv_3'}
)

In [23]:
# Expand the Park strings from closest_park.csv and label the fields as
# park distance / duration.
park = seperate_dictionary(park, 'Park').rename(
    columns={'distance': 'park_distance', 'duration': 'park_duration'}
)

In [24]:
# Expand the Post_Office strings from closest_postoffice.csv and label the
# fields as post-office distance / duration.
postoffice = seperate_dictionary(postoffice, 'Post_Office').rename(
    columns={'distance': 'postoffice_distance', 'duration': 'postoffice_duration'}
)

In [31]:
# Join every distance table onto the CBD table by property 'Name' (default
# inner joins, so only names present in all five tables survive).
# NOTE(review): 'Name' is assumed to be unique per table; duplicates would
# multiply rows -- confirm.
distances = (
    cbd_distance
    .merge(train, on='Name')
    .merge(n_ptv, on='Name')
    .merge(park, on='Name')
    .merge(postoffice, on='Name')
)

In [32]:
# Non-null counts of every other column for each distinct cbd_duration value.
distances.groupby(by='cbd_duration').count()

Unnamed: 0_level_0,Name,cbd_distance,station_distance,station_duration,ptv_1,ptv_2,ptv_3,park_distance,park_duration,0,postoffice_distance,postoffice_duration
cbd_duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12.0,1,1,1,1,1,1,1,1,1,0,1,1
74.4,1,1,1,1,1,1,1,1,1,0,1,1
74.5,1,1,1,1,1,1,1,1,1,0,1,1
80.0,1,1,1,1,1,1,1,1,1,0,1,1
85.0,1,1,1,1,1,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
22750.7,1,1,1,1,0,0,0,1,1,0,1,1
22776.6,1,1,1,1,1,0,0,1,1,0,1,1
22781.8,1,1,1,1,1,1,1,1,1,0,1,1
22782.1,1,1,1,1,1,0,0,1,1,0,1,1


In [34]:
# Remove the leftover column labelled 0, which has no non-null values in any
# of the rows displayed above (presumably produced while expanding 'Park').
distances = distances.drop(labels=[0], axis=1)

In [35]:
# Persist the merged distance features alongside the other curated data.
distances.to_csv(path_or_buf='../data/curated/all_distances.csv')

## Preprocessing

In [36]:
# Rows whose Bed / Bath / Parking value is the '−' placeholder, and rows with
# a missing Postcode.
nan_bed = properties.loc[properties['Bed'] == '−']
nan_bath = properties.loc[properties['Bath'] == '−']
nan_park = properties.loc[properties['Parking'] == '−']
nan_postcode = properties.loc[properties['Postcode'].isna()]

# Report how many rows fall into each group, one count per line.
for missing_rows in (nan_bed, nan_bath, nan_park, nan_postcode):
    print(len(missing_rows))

97
8
1894
42


In [37]:
# reformat costs without commas
properties['Cost'] = properties['Cost'].str.replace(',', '')

# change null values to numeric zero
properties['Bed'] = properties['Bed'].replace('−', 0)
properties['Bath'] = properties['Bath'].replace('−', 1)
properties['Parking'] = properties['Parking'].replace('−', 0)

In [38]:
# Rows without a postcode are treated as invalid and discarded.
properties = properties[properties['Postcode'].notna()]

In [39]:
# Discard rows whose Coordinates string is exactly '[0.0, 0.0]'.
has_real_coordinates = properties['Coordinates'].ne('[0.0, 0.0]')
properties = properties[has_real_coordinates]

In [40]:
# Cast the cleaned columns to their final dtypes: Cost to float and the
# Bed / Bath / Parking counts to int.
properties = properties.astype({'Cost': float, 'Bed': int, 'Bath': int, 'Parking': int})

# Postcode goes through int first so that e.g. 3000.0 is stored as '3000'.
properties['Postcode'] = properties['Postcode'].astype(int).astype(str)

properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,5408/500 Elizabeth Street Melbourne VIC 3000,440.0,"[-37.8072443, 144.9602814]",1,1,0,Apartment / Unit / Flat,BRADY residential,3000
1,502/118 Russell Street Melbourne VIC 3000,620.0,"[-37.8135864, 144.9687232]",1,1,0,Apartment / Unit / Flat,Dingle Partners,3000
2,202A/441 Lonsdale Street Melbourne VIC 3000,300.0,"[-37.8134292, 144.9594445]",1,1,0,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000
3,57/243 Collins Street Melbourne VIC 3000,400.0,"[-37.8159969, 144.9657956]",1,1,0,Apartment / Unit / Flat,Harcourts Melbourne City,3000
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625.0,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000


In [41]:
# Keep only listings whose Cost is not zero.
properties = properties[properties['Cost'].ne(0)]

In [42]:
# Show the ten largest Cost values to spot outliers.
costs_descending = properties['Cost'].sort_values(ascending=False)
costs_descending.head(10)

13298    47000.0
1160     39000.0
9448     39000.0
7844     38000.0
8761      5000.0
13281     5000.0
13108     4000.0
10664     3850.0
10544     3775.0
6156      3750.0
Name: Cost, dtype: float64

In [43]:
# Keep listings costing at most 20000; this removes the four largest values
# (38000-47000) listed above, which sit far from the next highest cost of 5000.
properties = properties[properties['Cost'].le(20000)]

In [44]:
# Exclude 'Carspace' and 'Block of Units' listings.
excluded_types = ['Carspace', 'Block of Units']
properties = properties[~properties['Property_Type'].isin(excluded_types)]

# Fold rare categories into larger ones. The replacement is applied to the
# Property_Type column only; a frame-wide replace() would also rewrite any
# other cell (e.g. in Name or Agency) that happens to equal one of these labels.
# NOTE(review): 'New Apartments / Off the Plan' is mapped to 'Townhouse', as in
# the original code -- confirm this is intended rather than
# 'Apartment / Unit / Flat'.
properties = properties.assign(
    Property_Type=properties['Property_Type'].replace({
        'Penthouse': 'Apartment / Unit / Flat',
        'New Apartments / Off the Plan': 'Townhouse',
    })
)

# Number of listings remaining per property type.
properties.groupby('Property_Type').count()

Unnamed: 0_level_0,Name,Cost,Coordinates,Bed,Bath,Parking,Agency,Postcode
Property_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Acreage / Semi-Rural,13,13,13,13,13,13,13,13
Apartment / Unit / Flat,4980,4980,4980,4980,4980,4980,4980,4980
Duplex,4,4,4,4,4,4,4,4
House,6613,6613,6613,6613,6613,6613,6613,6613
Rural,1,1,1,1,1,1,1,1
Semi-Detached,9,9,9,9,9,9,9,9
Studio,137,137,137,137,137,137,137,137
Terrace,9,9,9,9,9,9,9,9
Townhouse,1417,1417,1417,1417,1417,1417,1417,1417
Villa,38,38,38,38,38,38,38,38


In [45]:
# Renumber the rows from 0 after all the filtering, then persist the cleaned
# listings as both CSV and JSON.
new = properties.reset_index(drop=True)
new.to_csv(path_or_buf='../data/curated/properties_processed.csv')
new.to_json(path_or_buf='../data/curated/properties_processed.json')