In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Load the raw property listings; the first CSV column is used as the row index.
raw_csv_path = "../data/raw/property_data.csv"
properties = pd.read_csv(raw_csv_path, index_col=0)

# Preview the first few rows
properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,708/228 A'BECKETT STREET Melbourne VIC 3000,500,"[-37.8101911, 144.9566043]",2,1,−,Apartment / Unit / Flat,Elite Real Estate,3000.0
1,4/180 Little Collins Street Melbourne VIC 3000,500,"[-37.8138601, 144.9679067]",1,1,−,Apartment / Unit / Flat,Dingle Partners,3000.0
2,1605/565 Flinders Street Melbourne VIC 3000,500,"[-37.8210586, 144.9559072]",2,2,1,Apartment / Unit / Flat,Dingle Partners,3000.0
3,612/408 Lonsdale Street Melbourne VIC 3000,350,"[-37.8125979, 144.9604012]",1,1,−,Apartment / Unit / Flat,Gotham Property,3000.0
4,2108/288 Spencer Street Melbourne VIC 3000,350,"[-37.813775, 144.9520948]",1,1,−,Apartment / Unit / Flat,Motion Property,3000.0


## Preprocessing
- One listing's `Property_Type` is actually a Carspace
- Listings recorded with no bathrooms (`−`) are assumed to actually have 1 bathroom

In [92]:
# Collect the rows with missing values. Bed/Bath/Parking use the '−'
# placeholder for a missing entry, whereas a missing Postcode is a real NaN.
missing_marker = '−'
nan_bed = properties.loc[properties['Bed'].eq(missing_marker)]
nan_bath = properties.loc[properties['Bath'].eq(missing_marker)]
nan_park = properties.loc[properties['Parking'].eq(missing_marker)]
nan_postcode = properties.loc[properties['Postcode'].isna()]

# One count per line, in the order Bed, Bath, Parking, Postcode
for missing_subset in (nan_bed, nan_bath, nan_park, nan_postcode):
    print(len(missing_subset))

45
4
866
24


In [93]:
# Remove commas from the cost strings (Cost is still a text column here)
cost_text = properties['Cost']
properties['Cost'] = cost_text.str.replace(',', '')

# Swap the '−' placeholder for a numeric default, column by column:
# missing Bed and Parking become 0, missing Bath becomes 1.
placeholder_defaults = {'Bed': 0, 'Bath': 1, 'Parking': 0}
for column, default_value in placeholder_defaults.items():
    properties[column] = properties[column].replace('−', default_value)

In [94]:
# Find and remove ALL rows with an invalid cost.
# A cost string containing a space is treated as invalid. A missing cost yields
# a NaN space count, and NaN != 0 is True, so missing costs are dropped as well.
# Filtering with a boolean mask removes every such row (not just the first one)
# and is a no-op, rather than an IndexError, when there are none.
space_count = properties['Cost'].str.count(' ')
invalid_rows = space_count[space_count != 0]
properties = properties[~properties.index.isin(invalid_rows.index)]

# Zero cost properties.
# Cost is still text at this point (it is cast to float in a later cell), so it
# has to be parsed before comparing with 0; comparing the raw strings with the
# integer 0 is always True and would never remove anything.
numeric_cost = pd.to_numeric(properties['Cost'], errors='coerce')
properties = properties[numeric_cost != 0]

# Null postcodes are invalid rows
has_postcode = ~properties['Postcode'].isna()
properties = properties[has_postcode]

In [99]:
# Cast the cleaned columns to their final numeric dtypes
numeric_dtypes = {'Cost': float, 'Bed': int, 'Bath': int, 'Parking': int}
for column, target_dtype in numeric_dtypes.items():
    properties[column] = properties[column].astype(target_dtype)

# Postcode is cast to int first and only then to text, so a value such as
# 3000.0 is stored as '3000'
postcode_as_int = properties['Postcode'].astype(int)
properties['Postcode'] = postcode_as_int.astype(str)

# Preview the converted frame
properties.head()

Unnamed: 0,Name,Cost,Coordinates,Bed,Bath,Parking,Property_Type,Agency,Postcode
0,708/228 A'BECKETT STREET Melbourne VIC 3000,500.0,"[-37.8101911, 144.9566043]",2,1,0,Apartment / Unit / Flat,Elite Real Estate,3000
1,4/180 Little Collins Street Melbourne VIC 3000,500.0,"[-37.8138601, 144.9679067]",1,1,0,Apartment / Unit / Flat,Dingle Partners,3000
2,1605/565 Flinders Street Melbourne VIC 3000,500.0,"[-37.8210586, 144.9559072]",2,2,1,Apartment / Unit / Flat,Dingle Partners,3000
3,612/408 Lonsdale Street Melbourne VIC 3000,350.0,"[-37.8125979, 144.9604012]",1,1,0,Apartment / Unit / Flat,Gotham Property,3000
4,2108/288 Spencer Street Melbourne VIC 3000,350.0,"[-37.813775, 144.9520948]",1,1,0,Apartment / Unit / Flat,Motion Property,3000


In [106]:
# Show the ten highest costs to spot outliers
costs_descending = properties['Cost'].sort_values(ascending=False)
costs_descending.head(10)

3213    645000.0
7355     47000.0
425      39000.0
4576     39000.0
7338      5000.0
7170      4000.0
5383      3850.0
5268      3775.0
215       3750.0
4397      3500.0
Name: Cost, dtype: float64

In [107]:
# Drop outlier listings above a fixed cost cutoff.
# In the recorded top-10 listing of costs, four values are 39000 or more while
# the next largest is 5000, so a cutoff of 20000 separates the two groups.
MAX_COST = 20000
properties = properties[properties['Cost'] <= MAX_COST]

# Re-inspect the largest remaining costs to confirm the outliers are gone
properties['Cost'].sort_values(ascending=False).head(10)

7338    5000.0
7170    4000.0
5383    3850.0
5268    3775.0
215     3750.0
4397    3500.0
3387    3500.0
2838    3500.0
2530    3500.0
4456    3000.0
Name: Cost, dtype: float64

In [110]:
# Save the processed listings to the curated data folder
curated_csv_path = '../data/curated/properties_processed.csv'
properties.to_csv(curated_csv_path)