In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw listings, using the first CSV column as the row index.
raw_csv_path = "../data/raw/property_data.csv"
properties = pd.read_csv(raw_csv_path, index_col=0)
properties.head(5)

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,5408/500 Elizabeth Street Melbourne VIC 3000,440,"[-37.8072443, 144.9602814]",1,1,−,Apartment / Unit / Flat,BRADY residential,3000.0
1,502/118 Russell Street Melbourne VIC 3000,620,"[-37.8135864, 144.9687232]",1,1,−,Apartment / Unit / Flat,Dingle Partners,3000.0
2,202A/441 Lonsdale Street Melbourne VIC 3000,300,"[-37.8134292, 144.9594445]",1,1,−,Apartment / Unit / Flat,Biggin & Scott Stonnington,3000.0
3,57/243 Collins Street Melbourne VIC 3000,400,"[-37.8159969, 144.9657956]",1,1,−,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0
4,2311/601 Little Lonsdale Street Melbourne VIC ...,625,"[-37.8137564, 144.9537143]",2,2,1,Apartment / Unit / Flat,Harcourts Melbourne City,3000.0


## Preprocessing
- One listing's `Property_Type` is actually a Carspace
- Listings recorded with no bathrooms (`−`) actually have 1 bathroom

In [3]:
# Collect the rows whose Bed / Bath / Parking hold the '−' placeholder and the
# rows with a missing Postcode, then print how many rows fall in each group.
placeholder = '−'
nan_bed = properties.loc[properties['Bed'].eq(placeholder)]
nan_bath = properties.loc[properties['Bath'].eq(placeholder)]
nan_park = properties.loc[properties['Parking'].eq(placeholder)]
nan_postcode = properties.loc[properties['Postcode'].isna()]
for subset in (nan_bed, nan_bath, nan_park, nan_postcode):
    print(len(subset))

97
8
1894
42


In [4]:
# Strip thousands separators from Cost (e.g. '1,200' -> '1200').
# Casting to str first keeps entries pandas already parsed as numbers: the
# bare `.str` accessor turns non-string entries into NaN and raises on a
# numeric column. regex=False treats the comma as a literal on every pandas
# version instead of relying on the version-dependent default.
properties['Cost'] = properties['Cost'].astype(str).str.replace(',', '', regex=False)

# Replace the '−' placeholder with a number so the columns can be cast later:
# Bed and Parking fall back to 0, Bath falls back to 1.
properties['Bed'] = properties['Bed'].replace('−', 0)
properties['Bath'] = properties['Bath'].replace('−', 1)
properties['Parking'] = properties['Parking'].replace('−', 0)

In [5]:
# Rows without a postcode are treated as invalid; keep only rows that have one
properties = properties[properties['Postcode'].notna()]

In [6]:
# Drop rows whose Coordinates value is exactly the string '[0.0, 0.0]'
has_coordinates = properties['Coordinates'].ne('[0.0, 0.0]')
properties = properties[has_coordinates]

In [7]:
# Cast the cleaned columns to their final dtypes in a single pass.
# Postcode goes through int before str so a float such as 3000.0 ends up as
# the string '3000' rather than '3000.0'.
numeric_dtypes = {'Cost': float, 'Bed': int, 'Bath': int, 'Parking': int, 'Postcode': int}
properties = properties.astype(numeric_dtypes).astype({'Postcode': str})

properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode


In [8]:
# Discard listings whose Cost is exactly zero
nonzero_cost = properties['Cost'].ne(0)
properties = properties[nonzero_cost]

In [9]:
# Show the ten largest Cost values, highest first
cost_descending = properties['Cost'].sort_values(ascending=False)
cost_descending.head(10)

Series([], Name: Cost, dtype: float64)

In [10]:
# Keep only listings whose Cost does not exceed the ceiling
cost_ceiling = 20000
properties = properties.loc[properties['Cost'].le(cost_ceiling)]

In [11]:
# Renumber the surviving rows from 0 and write the curated table as CSV and JSON
curated_csv_path = '../data/curated/properties_processed.csv'
curated_json_path = '../data/curated/properties_processed.json'

new = properties.reset_index(drop=True)
new.to_csv(curated_csv_path)
new.to_json(curated_json_path)