# MAST30034 Project 2

## Data Pre-processing

In [3]:
import pandas as pd
import requests
import numpy as np
import re
import os
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
!pip install geopy
from geopy import distance

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 5.0 MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


### External Data

#### 1. SA2 Shapefile

In [4]:
# Load the SA2 (2021) boundary shapefile; the file name indicates the
# coordinates are stored in the GDA2020 datum.
sf = gpd.read_file("../data/raw/SA2/SA2_2021_AUST_GDA2020.shp")

# Only the SA2 code, SA2 name and polygon geometry are needed downstream.
SA2_inf = ['SA2_CODE21', 'SA2_NAME21', 'geometry']
sf = sf[SA2_inf]

# Bring the boundaries into EPSG:4326 so they match the point layers that are
# spatially joined against them later.  Assigning `sf.crs = ...` only relabels
# the coordinates; `to_crs` actually transforms them.
if sf.crs is None:
    # No CRS information came with the file: nothing to transform from, so
    # fall back to labelling the coordinates as EPSG:4326.
    sf = sf.set_crs("EPSG:4326")
else:
    sf = sf.to_crs("EPSG:4326")

#### 2. Population

In [5]:
# Read the population workbook ('Table 1' sheet) and discard every row whose
# "Unnamed: 1" cell is empty.
df_popu = (
    pd.read_excel('../data/raw/population.xlsx', sheet_name='Table 1')
    .dropna(subset=["Unnamed: 1"])
)

In [6]:
# Promote the first remaining row of `df_popu` to be the column header.
# NOTE: this cell rebinds `df_popu`, so it must be run exactly once after the
# loading cell; re-running it would promote the next data row.
# (numpy is already imported in the first cell, and no name shadowing a
# builtin such as `list` is created here.)
header = df_popu.iloc[0].tolist()

# Drop the header row by position rather than by a hard-coded index label,
# then renumber the rows from 0.
df_popu = df_popu.iloc[1:].reset_index(drop=True)
df_popu.columns = header

# Remove the higher-level geography columns (state, GCCSA, SA4, SA3).
df_sub = df_popu.drop(columns=['S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code',
                               'SA4 name', 'SA3 code', 'SA3 name'])

# Give the remaining columns short names (positional: the list must match the
# number of columns left after the drop above).
df_sub.columns = ['SA2 code', 'SA2 name', '2001', '2002', '2003', '2004',
                  '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
                  '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', 'NaN', 'change 01-21',
                  'change %', 'NaN', 'km2', 'persons/km2']

# Only keep the useful columns.  `.copy()` makes `df_sub` an independent frame
# so the column assignment below does not write into a slice.
df_sub = df_sub[['SA2 code', 'SA2 name', '2021', 'change 01-21', 'change %']].copy()

# String version of the SA2 code, used as the merge key with the other tables.
df_sub['SA2_code'] = df_sub['SA2 code'].apply(str)

#### 3. Income

#### 4. PTV

In [7]:
def _load_stops(folder):
    """Read `stops.txt` from the given PTV folder, keeping stop name and coordinates."""
    stops = pd.read_csv(f"../data/raw/ptv/{folder}/stops.txt")
    return stops.loc[:, ["stop_name", "stop_lat", "stop_lon"]]


def _stops_to_points(stops):
    """Turn a stops table into a GeoDataFrame of EPSG:4326 points (x = lon, y = lat)."""
    return gpd.GeoDataFrame(
        stops,
        geometry=gpd.points_from_xy(stops['stop_lon'], stops['stop_lat']),
        crs="EPSG:4326",
    )


def _first_stop_geojson(stops_sa2):
    """GeoJSON string holding one stop (the first encountered) per joined SA2 polygon."""
    return stops_sa2[['index_right', 'geometry']].drop_duplicates('index_right').to_json()


def _count_stops_by_sa2(stops_sa2):
    """Number of stops per SA2, as a frame with columns `SA2_code` and `stop_count`."""
    return (
        stops_sa2.groupby('SA2_CODE21')['geometry'].count()
        .to_frame()
        .reset_index()
        .rename(columns={'SA2_CODE21': 'SA2_code', 'geometry': 'stop_count'})
    )


# Raw stop tables for PTV folders 2, 3 and 4.
stops_2 = _load_stops(2)
stops_3 = _load_stops(3)
stops_4 = _load_stops(4)

# Point layers built from the stop coordinates.
gdf_stops_2 = _stops_to_points(stops_2)
gdf_stops_3 = _stops_to_points(stops_3)
gdf_stops_4 = _stops_to_points(stops_4)

# Attach to every stop the SA2 polygon it falls within; stops outside all
# polygons are discarded by the inner join.
stops2_sa2 = gdf_stops_2.sjoin(sf, how='inner', predicate='within')
stops3_sa2 = gdf_stops_3.sjoin(sf, how='inner', predicate='within')
stops4_sa2 = gdf_stops_4.sjoin(sf, how='inner', predicate='within')

geoJSON_2 = _first_stop_geojson(stops2_sa2)
geoJSON_3 = _first_stop_geojson(stops3_sa2)
geoJSON_4 = _first_stop_geojson(stops4_sa2)

stops2_df = _count_stops_by_sa2(stops2_sa2)
stops3_df = _count_stops_by_sa2(stops3_sa2)
stops4_df = _count_stops_by_sa2(stops4_sa2)

# Only the folder-2 stops get the per-SA2 count attached to each stop row
# (this adds the `SA2_code` and `stop_count` columns to `stops2_sa2`).
stops2_sa2 = pd.merge(left=stops2_sa2, right=stops2_df, left_on='SA2_CODE21', right_on='SA2_code')

#### 5. School

In [8]:
# Load the school list, keep the coordinate / address / name columns and give
# them school-specific names.
df_school = (
    pd.read_csv('../data/raw/school2021.csv')
    [['X', 'Y', 'Address_Line_1', 'School_Name']]
    .rename(columns={'X': 'school_X', 'Y': 'school_Y',
                     'Address_Line_1': 'school_addr', 'School_Name': 'school_name'})
)

# Point geometry from the X / Y columns, labelled with the same CRS string as
# the other layers.
school_points = gpd.points_from_xy(df_school['school_X'], df_school['school_Y'])
gdf_school = gpd.GeoDataFrame(df_school, geometry=school_points, crs='EPSG: 4326')

# Spatial join: keep schools that lie within an SA2 polygon.
school_sa2 = gdf_school.sjoin(sf, how='inner', predicate='within')

# Schools per SA2, as columns `SA2_code` / `school_count`.
school_df = (
    school_sa2.groupby('SA2_CODE21')['geometry'].count()
    .to_frame()
    .reset_index()
    .rename(columns={'SA2_CODE21': 'SA2_code', 'geometry': 'school_count'})
)

#### 6. Hospital

In [9]:
# Load the hospital list, keep the coordinate / address / name columns, rename
# them, and drop rows that have no coordinates.
df_hospital = (
    pd.read_csv('../data/raw/hospital2021.csv')
    [['X', 'Y', 'addr_street', 'name']]
    .rename(columns={'X': 'hospital_X', 'Y': 'hospital_Y',
                     'addr_street': 'hospital_addr', 'name': 'hospital_name'})
    .dropna(axis=0, subset=['hospital_X', 'hospital_Y'])
)

# Point geometry from the X / Y columns, labelled with the same CRS string as
# the other layers.
hospital_points = gpd.points_from_xy(df_hospital['hospital_X'], df_hospital['hospital_Y'])
gdf_hospital = gpd.GeoDataFrame(df_hospital, geometry=hospital_points, crs='EPSG: 4326')

# Spatial join: keep hospitals that lie within an SA2 polygon.
hospital_sa2 = gdf_hospital.sjoin(sf, how='inner', predicate='within')

# Hospitals per SA2, as columns `SA2_code` / `hos_count`.
hospital_df = (
    hospital_sa2.groupby('SA2_CODE21')['geometry'].count()
    .to_frame()
    .reset_index()
    .rename(columns={'SA2_CODE21': 'SA2_code', 'geometry': 'hos_count'})
)

#### 7. Domain Rental Prices

In [46]:
import json
import pandas as pd

# The scraped Domain listings are split over four JSON files; each file loads
# as a frame with one column per listing URL.
domain_paths = [
    '../data/raw/example.json',
    '../data/raw/example2.json',
    '../data/raw/example3.json',
    '../data/raw/example4.json',
]
df1, df2, df3, df4 = [pd.read_json(path) for path in domain_paths]

# Place the four frames side by side (listings stay as columns).
df_domain = pd.concat([df1, df2, df3, df4], axis=1)
df_domain

Unnamed: 0,https://www.domain.com.au/667-glenhuntly-road-caulfield-vic-3162-11598047,https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913,https://www.domain.com.au/level-3-302-13-15-lake-street-caroline-springs-vic-3023-15994395,https://www.domain.com.au/9-lansdowne-street-blairgowrie-vic-3942-12127675,https://www.domain.com.au/3502-14-16-the-esplanade-st-kilda-vic-3182-16002767,https://www.domain.com.au/440-keys-road-flinders-vic-3929-16065322,https://www.domain.com.au/335-the-esplanade-indented-head-vic-3223-12688424,https://www.domain.com.au/50-south-wharf-drive-docklands-vic-3008-16048359,https://www.domain.com.au/901-902-85-market-street-south-melbourne-vic-3205-14089455,https://www.domain.com.au/7-haverbrack-avenue-malvern-vic-3144-16041473,...,https://www.domain.com.au/6-11-bay-street-parkdale-vic-3195-16056196,https://www.domain.com.au/32-fourth-street-parkdale-vic-3195-16088448,https://www.domain.com.au/16-morrah-street-parkville-vic-3052-14737377,https://www.domain.com.au/85-boundary-street-port-melbourne-vic-3207-15303937,https://www.domain.com.au/55-esplanade-west-port-melbourne-vic-3207-16069381,https://www.domain.com.au/17-201-graham-street-port-melbourne-vic-3207-16052558,https://www.domain.com.au/164-commercial-road-prahran-vic-3181-12926600,https://www.domain.com.au/32-smith-street-richmond-vic-3121-16068014,https://www.domain.com.au/12-morobe-street-sorrento-vic-3943-15917575,https://www.domain.com.au/102-5-john-street-south-melbourne-vic-3205-15567909
name,667 Glenhuntly Road Caulfield VIC 3162,Upstairs 2C Staley Street Brunswick VIC 3056,"Level 3, 302/13-15 Lake Street Caroline Spring...",9 Lansdowne Street Blairgowrie VIC 3942,3502/14-16 The Esplanade St Kilda VIC 3182,440 Keys Road Flinders VIC 3929,335 The Esplanade Indented Head VIC 3223,50 South Wharf Drive Docklands VIC 3008,901 & 902/85 Market Street South Melbourne VIC...,7 Haverbrack Avenue Malvern VIC 3144,...,6/11 Bay Street Parkdale VIC 3195,32 Fourth Street Parkdale VIC 3195,16 Morrah Street Parkville VIC 3052,85 Boundary Street Port Melbourne VIC 3207,55 Esplanade West Port Melbourne VIC 3207,17/201 Graham Street Port Melbourne VIC 3207,164 Commercial Road Prahran VIC 3181,32 Smith Street Richmond VIC 3121,12 Morobe Street Sorrento VIC 3943,102/5 John Street South Melbourne VIC 3205
cost_text,"$38,000 p.a. Incl. Outgoings + GST","$35,000 Annually",4125000 pw,"$5,000 per week","$5,000",$4000 pw,"$3,775 wk $545 pn",$3750 per week,$3750.00 per week,$3500 Per Week,...,$850,$850.00,$850pw / $3693pcm,$850 per week,$850.00,$850.00,$850pw - Stay 1 mth+,$850.00,$850 pw,$850 per week
coordinates,"[-37.8860233, 145.0173065]","[-37.7655919, 144.9633048]","[-37.7316459, 144.7446886]","[-38.372703, 144.7856897]","[-37.8650177, 144.9746821]","[-38.4788173, 144.9643428]","[-38.1439923, 144.7147899]","[-37.8223967, 144.938237]","[-37.8301164, 144.9569041]","[-37.8534875, 145.0311415]",...,"[-38.00308, 145.0810804]","[-37.9868327, 145.0746213]","[-37.795505, 144.957196]","[-37.829375, 144.9439995]","[-37.8426555, 144.9411595]","[-37.839572, 144.939072]","[-37.8467793, 144.9916795]","[-37.8127029, 144.9946824]","[-38.3543648, 144.758948]","[-37.8329942, 144.9598221]"
rooms,[],"[0 Beds, 0 Baths, 2 Parking]",[],"[3 Beds, 2 Baths, 3 Parking]","[3 Beds, 3 Baths, 3 Parking]","[6 Beds, 4 Baths, 2 Parking]","[4 Beds, 3 Baths]","[5 Beds, 6 Baths, 3 Parking]","[3 Beds, 3 Baths, 1 Parking]","[5 Beds, 4 Baths, 2 Parking]",...,"[3 Beds, 2 Baths, 2 Parking]","[4 Beds, 2 Baths, 2 Parking]","[3 Beds, 1 Bath]","[3 Beds, 3 Baths, 2 Parking]","[2 Beds, 2 Baths]","[3 Beds, 1 Bath, 1 Parking]","[1 Bed, 1 Bath, 1 Parking]","[3 Beds, 1 Bath]","[5 Beds, 2 Baths]","[2 Beds, 2 Baths, 1 Parking]"
desc,Can you hear it? Opportunity knocking!!,1st floor offices/studios\nEasy walking distan...,- Centrally located with tranquil views.\n- Le...,Phone enquiry code for this property : 2751,Inspired by the interplay of timeless design a...,Boasting sublime architectural design and beau...,This stylish 4 bedroom home will surely make y...,This spectacular five bedroom freehold waterfr...,"class=""css-dxogle"">* Unverified feature<svg a...",Darren McMullin,...,"class=""css-dxogle"">* Unverified feature<svg a...",Vivian Hughes,"class=""css-dxogle"">* Unverified feature<svg a...","class=""css-dxogle"">* Unverified feature<svg a...","class=""css-dxogle"">* Unverified feature<svg a...",Annie Gillan,"Furnished apartments include all utilities, Fo...","class=""css-dxogle"">* Unverified feature<svg a...",12 months minimum term,"class=""css-dxogle"">* Unverified feature<svg a..."


In [47]:
#pd.read_json('../data/raw/example.json')

In [48]:
def _leading_number(room_text):
    """Return the leading run of digits of e.g. '10 Beds' as a string ('10').

    Returns None when the text does not start with a number, so that the row
    is removed by the `dropna` further down.
    """
    match = re.match(r'\s*(\d+)', room_text)
    return match.group(1) if match else None


# Listings are loaded one per column; transpose so each row is one listing.
# NOTE: this cell rebinds `df_domain`, so run it once after the loading cell.
df_domain = df_domain.T
df_domain = df_domain.drop('desc', axis=1)
# Keep only listings whose `rooms` list is non-empty.
df_domain = df_domain[df_domain['rooms'].astype(bool)]
df_domain = df_domain.reset_index(drop=True)

# Strip thousands separators, dollar signs and ".00" from the price text.
# Done column-wise: element-wise `df['cost_text'][i] = ...` is chained
# assignment and is not guaranteed to write back into the frame.
cost_text = df_domain['cost_text']
for unwanted in (',', '$', '.00'):
    cost_text = cost_text.str.replace(unwanted, '', regex=False)
df_domain['cost_text'] = cost_text

# The first run of digits in the cleaned text is taken as the price.
df_domain['Prices'] = pd.to_numeric(
    df_domain['cost_text'].str.extract(r'(\d+)', expand=False))

# Split `rooms` ([beds, baths, parking], positional) into separate columns.
# The whole leading number is kept (so '10 Beds' gives '10', not '1'); values
# stay strings, as before.
rooms = df_domain['rooms']
df_domain['Bedrooms'] = [_leading_number(r[0]) for r in rooms]
df_domain['Bathrooms'] = [_leading_number(r[1]) for r in rooms]
# if no parking info, default no parkings
df_domain['Parkings'] = [_leading_number(r[2]) if len(r) == 3 else '0' for r in rooms]

# NOTE(review): in the sample output `coordinates` looks like
# [-37.88, 145.01], i.e. [lat, lon], so these two column names appear to be
# swapped.  They are left unchanged here because the GeoDataFrame cell passes
# (Latitude, Longitude) as (x, y) and relies on this naming.
df_domain['Longitude'] = [c[0] for c in df_domain['coordinates']]
df_domain['Latitude'] = [c[1] for c in df_domain['coordinates']]

df_domain = df_domain.drop(['rooms', 'cost_text'], axis=1)
df_domain = df_domain.dropna().reset_index(drop=True)
df_domain = df_domain.drop('coordinates', axis=1)

# remove extreme prices
df_domain = df_domain[(df_domain['Prices'] > 100) & (df_domain['Prices'] < 10000)]
df_domain = df_domain.reset_index(drop=True)

df_domain

Unnamed: 0,name,Prices,Bedrooms,Bathrooms,Parkings,Longitude,Latitude
0,9 Lansdowne Street Blairgowrie VIC 3942,5000.0,3,2,3,-38.372703,144.785690
1,3502/14-16 The Esplanade St Kilda VIC 3182,5000.0,3,3,3,-37.865018,144.974682
2,440 Keys Road Flinders VIC 3929,4000.0,6,4,2,-38.478817,144.964343
3,335 The Esplanade Indented Head VIC 3223,3775.0,4,3,0,-38.143992,144.714790
4,50 South Wharf Drive Docklands VIC 3008,3750.0,5,6,3,-37.822397,144.938237
...,...,...,...,...,...,...,...
3752,17/201 Graham Street Port Melbourne VIC 3207,850.0,3,1,1,-37.839572,144.939072
3753,164 Commercial Road Prahran VIC 3181,850.0,1,1,1,-37.846779,144.991680
3754,32 Smith Street Richmond VIC 3121,850.0,3,1,0,-37.812703,144.994682
3755,12 Morobe Street Sorrento VIC 3943,850.0,5,2,0,-38.354365,144.758948


In [49]:
# Build the listing point layer.
# NOTE(review): `df_domain` stores the first coordinate (about -37) in
# 'Longitude' and the second (about 144) in 'Latitude', so x is taken from
# 'Latitude' and y from 'Longitude' here.  The names look swapped upstream.
gpd_domain = gpd.GeoDataFrame(
    df_domain,
    geometry=gpd.points_from_xy(df_domain['Latitude'], df_domain['Longitude']),
    crs="EPSG:4326",
)

# Make sure the SA2 layer carries the same CRS label as the points.
sf = sf.set_crs("EPSG:4326", allow_override=True)

# Attach to each listing the SA2 polygon it falls within.
gpd_domain_final = gpd_domain.sjoin(sf, how="inner", predicate='within')
gpd_domain_final = gpd_domain_final.reset_index(drop=True)

gpd_domain_final['SA2_code'] = gpd_domain_final['SA2_CODE21']

# `stops2_sa2` has one row per stop, so merging it directly on `SA2_code`
# would repeat each listing once per stop in its SA2.  Reduce it to one row
# per SA2 first; `stop_count` is identical on every row of an SA2, and all
# columns are kept so the merged column names stay the same.
stops2_per_sa2 = stops2_sa2.drop_duplicates(subset='SA2_code')

# NOTE(review): these are inner merges, so listings in an SA2 with no stop,
# hospital or school row are dropped rather than given a count of 0.
# Confirm that this is intended.
gpd_domain_final = pd.merge(gpd_domain_final, df_sub, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, stops2_per_sa2, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, hospital_df, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, school_df, on='SA2_code')
#gpd_domain_final = pd.merge(gpd_domain_final, df_income, on='SA2_code')
#df_income['SA2_CODE21'] = df_income['sa2_maincode_2016'] # 2016-2021, should use 2021 later

In [50]:
# Columns kept for the final per-listing table:
#   - listing attributes: name, Prices, Bedrooms, Bathrooms, Parkings,
#     Longitude, Latitude
#   - 'geometry_x': suffixed geometry column left by the earlier merges
#     (presumably the listing point; verify against the merge cell)
#   - per-SA2 features: hos_count, school_count, stop_count, and the population
#     columns 'SA2 name', '2021', 'change %'
att = ['name', 'Prices', 'Bedrooms', 'Bathrooms', 'Parkings', 'geometry_x', 'SA2_code', 'hos_count', 'school_count',
       'Longitude', 'Latitude', 'stop_count', 'SA2 name', '2021', 'change %']
# Restrict the merged frame to those columns (this rebinds gpd_domain_final).
gpd_domain_final = gpd_domain_final[att]
gpd_domain_final

Unnamed: 0,name,Prices,Bedrooms,Bathrooms,Parkings,geometry_x,SA2_code,hos_count,school_count,Longitude,Latitude,stop_count,SA2 name,2021,change %
0,50 South Wharf Drive Docklands VIC 3008,3750.0,5,6,3,POINT (144.93824 -37.82240),206041118,8,1,-37.822397,144.938237,1,Docklands,15942,157.3
1,907/2 Glenti Place Docklands VIC 3008,1200.0,3,2,2,POINT (144.93909 -37.81540),206041118,8,1,-37.815405,144.939092,1,Docklands,15942,157.3
2,91/55 Victoria Harbour Promenade Docklands VIC...,1150.0,3,2,2,POINT (144.94274 -37.81956),206041118,8,1,-37.819562,144.942741,1,Docklands,15942,157.3
3,28 Cumberland Street Docklands VIC 3008,1100.0,2,2,1,POINT (144.94271 -37.81968),206041118,8,1,-37.819681,144.942705,1,Docklands,15942,157.3
4,2003/90 Lorimer Street Docklands VIC 3008,1100.0,2,2,2,POINT (144.94386 -37.82456),206041118,8,1,-37.824560,144.943860,1,Docklands,15942,157.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,5/99-103 Carrington Road Box Hill VIC 3128,850.0,4,3,2,POINT (145.11733 -37.81995),207031163,8,5,-37.819954,145.117327,1,Box Hill,23232,27.4
3459,2501/828 Whitehorse Road Box Hill VIC 3128,900.0,3,2,2,POINT (145.11724 -37.81735),207031163,8,5,-37.817345,145.117237,1,Box Hill,23232,27.4
3460,3 Gibson Street Box Hill South VIC 3128,850.0,4,3,2,POINT (145.11868 -37.82918),207031163,8,5,-37.829182,145.118681,1,Box Hill,23232,27.4
3461,2601/826-834 Whitehorse Road Box Hill VIC 3128,900.0,3,2,2,POINT (145.11685 -37.81725),207031163,8,5,-37.817254,145.116849,1,Box Hill,23232,27.4


In [13]:
#distance.distance((gpd_domain_final['Longitude'][0], gpd_domain_final['Latitude'][0]), 
                       #gpd_domain_final['Longitude'][1], gpd_domain_final['Latitude'][1])