-------
# Preprocess 1
1. Merge API request data to property data
2. Preprocess property data
3. Find property's suburb and SA2
-------

### Merge API data to property data

In [3]:
import numpy as np
import pandas as pd
import json
import seaborn as sns
import geopandas as gpd
from geopandas.tools import sjoin

1. Load dataset

In [4]:
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Read the raw listings, transpose them so that each listing becomes a row
# (see the preview below), and replace the original keys with a 0..n-1 index.
df = (
    pd.read_json('../data/raw/property_raw.json')
    .T
    .reset_index(drop=True)
)
df

Unnamed: 0,name,cost_text,coordinates,rooms,type
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[-37.8318086, 144.9710272]","[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[-37.816228, 144.9532465]","[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[-37.8121026, 144.9581291]","[1 Bed, 1 Bath]",Apartment / Unit / Flat
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[-37.8147317, 144.9733655]","[1 Bed, 1 Bath]",Apartment / Unit / Flat
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[-37.8125979, 144.9604012]","[1 Bed, 1 Bath]",Apartment / Unit / Flat
...,...,...,...,...,...
15342,28 Beachcomber Drive Inverloch VIC 3996,$550 wk,"[-38.6412502, 145.7031282]","[4 Beds, 2 Baths]",House
15343,14 Inverloch Parade Inverloch VIC 3996,$420 wk,"[-38.6287143, 145.7358082]","[2 Beds, 1 Bath]",House
15344,10 Hopetoun Street Inverloch VIC 3996,$370 wk,"[-38.6341243, 145.7306862]","[2 Beds, 1 Bath, 1 Parking]",House
15345,Inverloch VIC 3996,$475 wk,"[-38.6314613, 145.7293638]","[3 Beds, 2 Baths, 1 Parking]",House


#### Merge the distance to school, station and CBD
1. Merge School distance
2. Merge station distance
3. Merge CBD distance

In [5]:
# Load the three pre-computed distance tables (train station, school, CBD),
# in that order.
dist_train, dist_school, dist_CBD = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/properties_train_proximity.csv",
        "../data/raw/properties_shcool_proximity.csv",
        "../data/raw/distance_to_CBD.csv",
    )
)

In [6]:
# New property column -> source values. The identifier columns are stored as
# strings; the distances are copied as they are.
# NOTE(review): assignment aligns on the row index, so this presumably relies
# on the distance tables being in the same row order as `df` - confirm.
distance_columns = {
    "cloest station": dist_train['closest station'].map(str),
    "station_distance": dist_train['proximity to train station'],
    'cloest school': dist_school['closest school'].map(str),
    "school_distance": dist_school['proximity to school'],
    'CBD_distance': dist_CBD["CBD_distance"],
}
for column_name, column_values in distance_columns.items():
    df[column_name] = column_values

--------
### Preprocess property features
1. Split postcode and address
2. Unify cost measurement to weekly cost
3. Split 'rooms' to number of bedrooms, bathrooms and parking

In [7]:
# Split `name` (e.g. "3/27 Flinders Lane Melbourne VIC 3000") into postcode
# and address. Vectorised slicing is applied to every row, including the last
# one, which an index loop over range(len(...) - 1) would leave unsplit.
listing_name = df['name']

# The postcode is the trailing four characters.
df['postcode'] = listing_name.str[-4:]
# The address is everything before the nine-character " VIC dddd" suffix.
df['address'] = listing_name.str[:-9]
df

Unnamed: 0,name,cost_text,coordinates,rooms,type,cloest station,station_distance,cloest school,school_distance,CBD_distance,postcode,address
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[-37.8318086, 144.9710272]","[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat,30,1803.65,1374,667.06,1702.19,3000,1901/368 St Kilda Road Melbourne
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[-37.816228, 144.9532465]","[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat,176,371.68,2265,352.49,2267.13,3000,1211/200 Spencer Street Melbourne
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[-37.8121026, 144.9581291]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,248.50,2189,401.73,1937.39,3000,1008/380 Little Lonsdale Street Melbourne
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[-37.8147317, 144.9733655]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,140,404.26,2193,586.61,790.13,3000,3/27 Flinders Lane Melbourne
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[-37.8125979, 144.9604012]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,525.08,2265,779.91,1657.95,3000,611/408 Lonsdale Street Melbourne
...,...,...,...,...,...,...,...,...,...,...,...,...
15342,28 Beachcomber Drive Inverloch VIC 3996,$550 wk,"[-38.6412502, 145.7031282]","[4 Beds, 2 Baths]",House,181,65073.96,447,3338.25,146939.64,3996,28 Beachcomber Drive Inverloch
15343,14 Inverloch Parade Inverloch VIC 3996,$420 wk,"[-38.6287143, 145.7358082]","[2 Beds, 1 Bath]",House,181,69323.16,447,944.91,147493.50,3996,14 Inverloch Parade Inverloch
15344,10 Hopetoun Street Inverloch VIC 3996,$370 wk,"[-38.6341243, 145.7306862]","[2 Beds, 1 Bath, 1 Parking]",House,181,68313.03,447,516.17,147082.53,3996,10 Hopetoun Street Inverloch
15345,Inverloch VIC 3996,$475 wk,"[-38.6314613, 145.7293638]","[3 Beds, 2 Baths, 1 Parking]",House,181,68459.23,447,68.42,146682.64,3996,Inverloch


In [8]:
# Derive a numeric weekly `cost` from the free-text `cost_text`.
#
# All updates go through df.loc / df.iloc on the frame itself. Chained forms
# such as df.cost[mask] /= x modify a temporary object and silently do nothing
# once pandas Copy-on-Write is active.

# Remove thousands separators so "$1,800" parses as 1800.
df['cost_text'] = df.cost_text.astype(str).str.replace(',', '', regex=False)

# First dollar amount found in the text (NaN when there is none).
df['cost'] = df.cost_text.str.extract(r"\$(\d*\.?\d+)", expand=False)

# NOTE: dropna() removes rows with a missing value in ANY column, not only
# rows without a parsable cost. The hard-coded row positions further down
# depend on exactly this filtering, so it is left unchanged.
df = df.dropna()

# Truncate to whole dollars, but store as float so that the unit conversions
# below never have to change the column dtype.
df['cost'] = pd.to_numeric(df['cost']).astype(int).astype(float)
df = df.reset_index(drop=True)

# Unit words that follow the amount, e.g. "per week", "pw" (NaN when absent).
df['text'] = df.cost_text.str.extract(
    r"\d\s*([a-zA-Z]+\s*[a-zA-Z]+\s*[a-zA-Z]*)", expand=False
)

# Keyword flags. They depend only on the text columns, so they are computed
# once and reused by the conversions below.
cost_text = df['cost_text']
unit_text = df['text']
has_unit = unit_text.notnull()
mentions_week = cost_text.str.contains('w|W|/W')
# "Fortnightly" is already covered by the "Fortnight" alternative.
mentions_fortnight = cost_text.str.contains("Fortnight|fornight")
# Also true for "Fortnight", hence the explicit fortnight exclusions below.
mentions_night = cost_text.str.contains("Night|night")
mentions_year = cost_text.str.contains(r"year|Year|annum|pa|Annual|p\.a")
mentions_month = unit_text.str.contains("month|pcm|pm|PCM|mth|Month", na=False)

# Turn monthly cost into weekly cost (4.34 is roughly the weeks per month).
df.loc[~mentions_week & mentions_month & has_unit, 'cost'] /= 4.34
# Turn fortnightly cost into weekly cost.
df.loc[mentions_fortnight & has_unit, 'cost'] /= 2
# Turn annual cost into weekly cost.
df.loc[~mentions_night & ~mentions_fortnight & ~mentions_week & mentions_year, 'cost'] /= 52
# Turn nightly cost into weekly cost.
df.loc[~mentions_fortnight & mentions_night & has_unit, 'cost'] *= 7

# Manual corrections for a few miswritten `cost_text` entries.
# NOTE(review): these row positions/labels are tied to the current data
# snapshot and must be re-derived if the raw data or the filtering above
# changes. The factor 4.34 presumably reverses a monthly conversion that was
# applied to these rows by mistake - confirm.
df.iloc[[4389, 4527, 6018, 6035, 8108, 9619], df.columns.get_loc('cost')] *= 4.34
df = df.reset_index(drop=True)
df = df.drop([10364, 11847])

# Keep the cost less than $10000
df = df[df['cost'] < 10000]

df.cost.describe()


count    14750.000000
mean       514.283754
std        274.110505
min          1.000000
25%        385.000000
50%        450.000000
75%        560.000000
max       5600.000000
Name: cost, dtype: float64

In [9]:
# Take the postcode as the last space-separated token of `name`, then
# renumber the rows from 0.
df['postcode'] = [full_name.split(' ')[-1] for full_name in df['name']]
df = df.reset_index(drop=True)
df

Unnamed: 0,name,cost_text,coordinates,rooms,type,cloest station,station_distance,cloest school,school_distance,CBD_distance,postcode,address,cost,text
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[-37.8318086, 144.9710272]","[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat,30,1803.65,1374,667.06,1702.19,3000,1901/368 St Kilda Road Melbourne,1800.0,Per Week
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[-37.816228, 144.9532465]","[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat,176,371.68,2265,352.49,2267.13,3000,1211/200 Spencer Street Melbourne,480.0,per week
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[-37.8121026, 144.9581291]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,248.50,2189,401.73,1937.39,3000,1008/380 Little Lonsdale Street Melbourne,400.0,pw
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[-37.8147317, 144.9733655]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,140,404.26,2193,586.61,790.13,3000,3/27 Flinders Lane Melbourne,420.0,per week
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[-37.8125979, 144.9604012]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,525.08,2265,779.91,1657.95,3000,611/408 Lonsdale Street Melbourne,350.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14745,16B Sandy Mount Avenue Inverloch VIC 3996,$400.00 per week,"[-38.6340011, 145.725239]","[2 Beds, 1 Bath, 1 Parking]",House,181,67815.19,447,596.78,146677.31,3996,16B Sandy Mount Avenue Inverloch,400.0,per week
14746,28 Beachcomber Drive Inverloch VIC 3996,$550 wk,"[-38.6412502, 145.7031282]","[4 Beds, 2 Baths]",House,181,65073.96,447,3338.25,146939.64,3996,28 Beachcomber Drive Inverloch,550.0,wk
14747,14 Inverloch Parade Inverloch VIC 3996,$420 wk,"[-38.6287143, 145.7358082]","[2 Beds, 1 Bath]",House,181,69323.16,447,944.91,147493.50,3996,14 Inverloch Parade Inverloch,420.0,wk
14748,10 Hopetoun Street Inverloch VIC 3996,$370 wk,"[-38.6341243, 145.7306862]","[2 Beds, 1 Bath, 1 Parking]",House,181,68313.03,447,516.17,147082.53,3996,10 Hopetoun Street Inverloch,370.0,wk


In [10]:
# Split `rooms` (e.g. ['3 Beds', '2 Baths', '2 Parking']) into separate
# bed, bath and parking count columns.


def _room_count(features, labels):
    """Return the count of the first entry in `features` labelled with one of `labels`.

    Each entry is expected to look like "<number> <label>", e.g. "2 Baths".
    The whole leading number is returned as a string, so "10 Beds" gives "10".
    "0" is returned when no entry carries one of the labels, which means a
    listing without, say, a parking entry needs no special-casing.
    """
    for feature in features:
        number, _, label = str(feature).strip().partition(' ')
        if number.isdigit() and label.strip() in labels:
            return number
    return '0'


# Remove listings whose `rooms` list is empty.
df = df[df['rooms'].str.len() != 0]
df = df.reset_index(drop=True)
rooms = df['rooms']

# Entries are looked up by label rather than by position, so each column
# always gets exactly one value per row. Counts are stored as strings.
df['beds'] = [_room_count(entry, ('Bed', 'Beds')) for entry in rooms]
df['bath'] = [_room_count(entry, ('Bath', 'Baths')) for entry in rooms]
df['parking'] = [_room_count(entry, ('Parking',)) for entry in rooms]
df

Unnamed: 0,name,cost_text,coordinates,rooms,type,cloest station,station_distance,cloest school,school_distance,CBD_distance,postcode,address,cost,text,beds,bath,parking
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[-37.8318086, 144.9710272]","[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat,30,1803.65,1374,667.06,1702.19,3000,1901/368 St Kilda Road Melbourne,1800.0,Per Week,3,2,2
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[-37.816228, 144.9532465]","[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat,176,371.68,2265,352.49,2267.13,3000,1211/200 Spencer Street Melbourne,480.0,per week,1,1,1
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[-37.8121026, 144.9581291]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,248.50,2189,401.73,1937.39,3000,1008/380 Little Lonsdale Street Melbourne,400.0,pw,1,1,0
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[-37.8147317, 144.9733655]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,140,404.26,2193,586.61,790.13,3000,3/27 Flinders Lane Melbourne,420.0,per week,1,1,0
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[-37.8125979, 144.9604012]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,525.08,2265,779.91,1657.95,3000,611/408 Lonsdale Street Melbourne,350.0,,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14721,16B Sandy Mount Avenue Inverloch VIC 3996,$400.00 per week,"[-38.6340011, 145.725239]","[2 Beds, 1 Bath, 1 Parking]",House,181,67815.19,447,596.78,146677.31,3996,16B Sandy Mount Avenue Inverloch,400.0,per week,2,1,1
14722,28 Beachcomber Drive Inverloch VIC 3996,$550 wk,"[-38.6412502, 145.7031282]","[4 Beds, 2 Baths]",House,181,65073.96,447,3338.25,146939.64,3996,28 Beachcomber Drive Inverloch,550.0,wk,4,2,0
14723,14 Inverloch Parade Inverloch VIC 3996,$420 wk,"[-38.6287143, 145.7358082]","[2 Beds, 1 Bath]",House,181,69323.16,447,944.91,147493.50,3996,14 Inverloch Parade Inverloch,420.0,wk,2,1,0
14724,10 Hopetoun Street Inverloch VIC 3996,$370 wk,"[-38.6341243, 145.7306862]","[2 Beds, 1 Bath, 1 Parking]",House,181,68313.03,447,516.17,147082.53,3996,10 Hopetoun Street Inverloch,370.0,wk,2,1,1


-------------
### Merge other features to property dataset
1. Find each property's corresponding SA2
2. Find each property's corresponding suburb

##### Use geopandas to map each property to its SA2 and suburb

In [11]:
# Boundary layers used for the spatial joins: SA2 (2021 edition), Victorian
# localities (suburbs) and SA2 (2016 edition).
SA2_shape = gpd.read_file(
    "../data/raw/external_data/SA2_shape/SA2_2021_AUST_GDA2020.shp"
)
suburb_shape = gpd.read_file(
    "../data/raw/external_data/vic_localities/vic_localities.shp"
)
SA2_shape_2016 = gpd.read_file(
    "../data/raw/external_data/SA2_shape_2016/SA2_2016_AUST.shp"
)

In [12]:
# Restrict both SA2 layers to Victoria and keep only the identifier and
# geometry columns; the suburb layer keeps its id, name and geometry.
SA2_shape = SA2_shape.loc[
    SA2_shape['STE_NAME21'] == 'Victoria',
    ["SA2_CODE21", "geometry"],
]
suburb_shape = suburb_shape.loc[:, ['LOC_PID', 'LOC_NAME', 'geometry']]
SA2_shape_2016 = SA2_shape_2016.loc[
    SA2_shape_2016['STE_NAME16'] == 'Victoria',
    ["SA2_MAIN16", "geometry"],
]

In [13]:
# Preview the first rows of the 2021 SA2 boundaries.
SA2_shape.head()

Unnamed: 0,SA2_CODE21,geometry
644,201011001,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,201011006,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,201011007,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."


In [14]:
# Preview the first rows of the 2016 SA2 boundaries.
SA2_shape_2016.head()

Unnamed: 0,SA2_MAIN16,geometry
578,201011001,"POLYGON ((143.70477 -37.51935, 143.70482 -37.5..."
579,201011002,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
580,201011003,"POLYGON ((143.85013 -37.54247, 143.85011 -37.5..."
581,201011004,"POLYGON ((143.82821 -37.57559, 143.82840 -37.5..."
582,201011005,"POLYGON ((143.84170 -37.61597, 143.84175 -37.6..."


In [15]:
# Turn each property's [latitude, longitude] pair into a geopandas POINT.
# `df_geo` starts as an alias of `df` (not a copy), so the Longitude and
# Latitude columns are added to `df` as well.
df_geo = df
df_geo['Longitude'] = [lat_lon[1] for lat_lon in df['coordinates']]
df_geo['Latitude'] = [lat_lon[0] for lat_lon in df['coordinates']]

# points_from_xy takes x = longitude, y = latitude.
property_points = gpd.points_from_xy(x=df_geo['Longitude'], y=df_geo['Latitude'])
df_geo = gpd.GeoDataFrame(df_geo, geometry=property_points)

# Declare the points as EPSG:7844; with inplace=True the call returns the
# frame itself, which is what the cell displays.
df_geo.set_crs(epsg=7844, inplace=True)

Unnamed: 0,name,cost_text,coordinates,rooms,type,cloest station,station_distance,cloest school,school_distance,CBD_distance,postcode,address,cost,text,beds,bath,parking,Longitude,Latitude,geometry
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[-37.8318086, 144.9710272]","[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat,30,1803.65,1374,667.06,1702.19,3000,1901/368 St Kilda Road Melbourne,1800.0,Per Week,3,2,2,144.971027,-37.831809,POINT (144.97103 -37.83181)
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[-37.816228, 144.9532465]","[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat,176,371.68,2265,352.49,2267.13,3000,1211/200 Spencer Street Melbourne,480.0,per week,1,1,1,144.953247,-37.816228,POINT (144.95325 -37.81623)
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[-37.8121026, 144.9581291]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,248.50,2189,401.73,1937.39,3000,1008/380 Little Lonsdale Street Melbourne,400.0,pw,1,1,0,144.958129,-37.812103,POINT (144.95813 -37.81210)
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[-37.8147317, 144.9733655]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,140,404.26,2193,586.61,790.13,3000,3/27 Flinders Lane Melbourne,420.0,per week,1,1,0,144.973365,-37.814732,POINT (144.97337 -37.81473)
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[-37.8125979, 144.9604012]","[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,525.08,2265,779.91,1657.95,3000,611/408 Lonsdale Street Melbourne,350.0,,1,1,0,144.960401,-37.812598,POINT (144.96040 -37.81260)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14721,16B Sandy Mount Avenue Inverloch VIC 3996,$400.00 per week,"[-38.6340011, 145.725239]","[2 Beds, 1 Bath, 1 Parking]",House,181,67815.19,447,596.78,146677.31,3996,16B Sandy Mount Avenue Inverloch,400.0,per week,2,1,1,145.725239,-38.634001,POINT (145.72524 -38.63400)
14722,28 Beachcomber Drive Inverloch VIC 3996,$550 wk,"[-38.6412502, 145.7031282]","[4 Beds, 2 Baths]",House,181,65073.96,447,3338.25,146939.64,3996,28 Beachcomber Drive Inverloch,550.0,wk,4,2,0,145.703128,-38.641250,POINT (145.70313 -38.64125)
14723,14 Inverloch Parade Inverloch VIC 3996,$420 wk,"[-38.6287143, 145.7358082]","[2 Beds, 1 Bath]",House,181,69323.16,447,944.91,147493.50,3996,14 Inverloch Parade Inverloch,420.0,wk,2,1,0,145.735808,-38.628714,POINT (145.73581 -38.62871)
14724,10 Hopetoun Street Inverloch VIC 3996,$370 wk,"[-38.6341243, 145.7306862]","[2 Beds, 1 Bath, 1 Parking]",House,181,68313.03,447,516.17,147082.53,3996,10 Hopetoun Street Inverloch,370.0,wk,2,1,1,145.730686,-38.634124,POINT (145.73069 -38.63412)


In [16]:
# Find each property's SA2 (2021), suburb and SA2 (2016) with left spatial
# joins, so every property row is kept even when no polygon matches.
df_geo_SA2 = sjoin(df_geo, SA2_shape, how="left").drop(columns="index_right")
df_geo_SA2_suburb = sjoin(df_geo_SA2, suburb_shape, how='left').drop(columns="index_right")

# Bring the 2016 boundaries into the CRS of the property points (EPSG:7844).
# to_crs() transforms the coordinates, whereas set_crs(..., allow_override=True)
# only relabels them and leaves the polygons offset from the points.
# set_crs() is used only when the layer declares no CRS at all, because
# to_crs() cannot reproject geometries without a source CRS.
if SA2_shape_2016.crs is None:
    SA2_shape_2016 = SA2_shape_2016.set_crs(epsg=7844)
else:
    SA2_shape_2016 = SA2_shape_2016.to_crs(epsg=7844)

df_geo_SA2_suburb_16 = sjoin(df_geo_SA2_suburb, SA2_shape_2016, how="left")
# Drop the join helper column and the geometry / raw coordinate columns,
# which are not wanted in the tabular output.
df_geo_SA2_suburb_16 = df_geo_SA2_suburb_16.drop(columns=["index_right", 'geometry', 'coordinates'])

In [17]:
# Preview the joined result (property features plus SA2 / suburb identifiers).
df_geo_SA2_suburb_16.head(5)

Unnamed: 0,name,cost_text,rooms,type,cloest station,station_distance,cloest school,school_distance,CBD_distance,postcode,...,text,beds,bath,parking,Longitude,Latitude,SA2_CODE21,LOC_PID,LOC_NAME,SA2_MAIN16
0,1901/368 St Kilda Road Melbourne VIC 3000,$1800 Per Week,"[3 Beds, 2 Baths, 2 Parking]",Apartment / Unit / Flat,30,1803.65,1374,667.06,1702.19,3000,...,Per Week,3,2,2,144.971027,-37.831809,206051512,loc9901d119afda,Melbourne,206051132
1,1211/200 Spencer Street Melbourne VIC 3000,$480 per week,"[1 Bed, 1 Bath, 1 Parking]",Apartment / Unit / Flat,176,371.68,2265,352.49,2267.13,3000,...,per week,1,1,1,144.953247,-37.816228,206041505,loc9901d119afda,Melbourne,206041122
2,1008/380 Little Lonsdale Street Melbourne VIC ...,$400 pw,"[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,248.5,2189,401.73,1937.39,3000,...,pw,1,1,0,144.958129,-37.812103,206041505,loc9901d119afda,Melbourne,206041122
3,3/27 Flinders Lane Melbourne VIC 3000,$420 per week,"[1 Bed, 1 Bath]",Apartment / Unit / Flat,140,404.26,2193,586.61,790.13,3000,...,per week,1,1,0,144.973365,-37.814732,206041503,loc9901d119afda,Melbourne,206041122
4,611/408 Lonsdale Street Melbourne VIC 3000,$350,"[1 Bed, 1 Bath]",Apartment / Unit / Flat,28,525.08,2265,779.91,1657.95,3000,...,,1,1,0,144.960401,-37.812598,206041504,loc9901d119afda,Melbourne,206041122


In [18]:
# Save the joined table as CSV. to_csv() writes the row index by default, so
# it ends up as the first, unnamed column of the file.
df_geo_SA2_suburb_16.to_csv("../data/raw/properties_preporcessed_1.csv")