## Data Pre-processing

In [1]:
import pandas as pd
import requests
import numpy as np
import re
import os
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
#!pip install geopy
from geopy import distance

### External Data

#### 1. SA2 Shapefile

In [2]:
sf = gpd.read_file("../data/raw/SA2/SA2_2021_AUST_GDA2020.shp")
# only leave useful cols
SA2_inf = ['SA2_CODE21', 'SA2_NAME21', 'geometry']
sf = sf[SA2_inf]
sf.crs = 'EPSG: 4326'
#sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

#### 2. Population

In [3]:
df_popu = pd.read_excel('../data/raw/population.xlsx', sheet_name='Table 1')
# Delete the rows that has Nan in Unnamed 1 column
df_popu = df_popu.dropna(subset=["Unnamed: 1"]) 

In [4]:
# Change the name of columns to the first row of the dataframe
import numpy as np
# Get the first row data
array = np.array(df_popu)
list = array.tolist()
list = list[0]

# Change the column name to the first row data
df_popu.columns = list
# Delete the first row 
df_popu.drop([6], inplace=True)

# Reset index
df_popu.reset_index(drop=True, inplace=True)

df_sub = df_popu.drop(['S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code', 
                  'SA4 name', 'SA3 code', 'SA3 name'], axis=1)
# Change the name of each columns 
df_sub.columns = ['SA2 code','SA2 name', '2001', '2002', '2003', '2004' , 
                  '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', 
                  '2014', '2015', '2016','2017', '2018', '2019', '2020', '2021','NaN','change 01-21', 
                  'change %', 'NaN', 'km2', 'persons/km2']

# Only keep the useful columns 
df_sub = df_sub[['SA2 code', 'SA2 name', '2021', 'change 01-21', 'change %']]
df_sub['SA2_code'] = df_sub['SA2 code']
df_sub['SA2_code'] = df_sub['SA2_code'].apply(str)

#### 3. Income

#### 4. PTV

In [5]:
stops_2 = pd.read_csv("../data/raw/ptv/2/stops.txt")
stops_2 = stops_2.loc[:,["stop_name","stop_lat","stop_lon"]]
stops_3 = pd.read_csv("../data/raw/ptv/3/stops.txt")
stops_3 = stops_3.loc[:,["stop_name","stop_lat","stop_lon"]]
stops_4 = pd.read_csv("../data/raw/ptv/4/stops.txt")
stops_4 = stops_4.loc[:,["stop_name","stop_lat","stop_lon"]]

gdf_stops_2 = gpd.GeoDataFrame(
    stops_2, geometry=gpd.points_from_xy(stops_2['stop_lon'], stops_2['stop_lat']))
gdf_stops_3 = gpd.GeoDataFrame(
    stops_3, geometry=gpd.points_from_xy(stops_3['stop_lon'], stops_3['stop_lat']))
gdf_stops_4 = gpd.GeoDataFrame(
    stops_4, geometry=gpd.points_from_xy(stops_4['stop_lon'], stops_4['stop_lat']))

gdf_stops_2.crs = 'EPSG: 4326'
gdf_stops_3.crs = 'EPSG: 4326'
gdf_stops_4.crs = 'EPSG: 4326'
stops2_sa2 = gdf_stops_2.sjoin(sf, how='inner', predicate='within')
stops3_sa2 = gdf_stops_3.sjoin(sf, how='inner', predicate='within')
stops4_sa2 = gdf_stops_4.sjoin(sf, how='inner', predicate='within')

geoJSON_2 = stops2_sa2[['index_right', 'geometry']].drop_duplicates('index_right').to_json()
geoJSON_3 = stops3_sa2[['index_right', 'geometry']].drop_duplicates('index_right').to_json()
geoJSON_4 = stops4_sa2[['index_right', 'geometry']].drop_duplicates('index_right').to_json()

stops2_df = stops2_sa2.groupby('SA2_CODE21')['geometry'].count()
stops2_df = stops2_df.to_frame().reset_index()
stops2_df.rename(columns = {'SA2_CODE21':'SA2_code', 'geometry':'stop_count'}, inplace = True)

stops3_df = stops3_sa2.groupby('SA2_CODE21')['geometry'].count()
stops3_df = stops3_df.to_frame().reset_index()
stops3_df.rename(columns = {'SA2_CODE21':'SA2_code', 'geometry':'stop_count'}, inplace = True)

stops4_df = stops4_sa2.groupby('SA2_CODE21')['geometry'].count()
stops4_df = stops4_df.to_frame().reset_index()
stops4_df.rename(columns = {'SA2_CODE21':'SA2_code', 'geometry':'stop_count'}, inplace = True)

stops2_sa2 = pd.merge(left=stops2_sa2, right=stops2_df, left_on='SA2_CODE21', right_on='SA2_code')

#### 5. School

In [6]:
df_school = pd.read_csv('../data/raw/school2021.csv')
df_school = df_school[['X', 'Y', 'Address_Line_1', 'School_Name']]  # type: ignore
df_school.rename(columns={'X':'school_X', 'Y':'school_Y',
                          'Address_Line_1':'school_addr', 'School_Name': 'school_name'}, inplace = True)
gdf_school = gpd.GeoDataFrame(
    df_school, geometry=gpd.points_from_xy(df_school['school_X'], df_school['school_Y']))

gdf_school.crs = 'EPSG: 4326'

school_sa2 = gdf_school.sjoin(sf, how='inner', predicate='within')

school_df = school_sa2.groupby('SA2_CODE21')['geometry'].count()
school_df = school_df.to_frame().reset_index()
school_df.rename(columns = {'SA2_CODE21':'SA2_code', 'geometry':'school_count'}, inplace = True)

#### 6. Hospital

In [7]:
df_hospital = pd.read_csv('../data/raw/hospital2021.csv')
# Filter the columns and rename them.
df_hospital = df_hospital[['X','Y','addr_street','name']]
df_hospital.rename(columns={'X':'hospital_X', 'Y':'hospital_Y', 'addr_street':'hospital_addr','name':'hospital_name'}, inplace = True)
# Remove the null value.
df_hospital = df_hospital.dropna(axis=0, subset=['hospital_X', 'hospital_Y'])
# now change our hospital dataframe to geodataframe by transforming the x, y coordinates to point geometry.
gdf_hospital = gpd.GeoDataFrame(
    df_hospital, geometry=gpd.points_from_xy(df_hospital['hospital_X'], df_hospital['hospital_Y']))

gdf_hospital.crs = 'EPSG: 4326'
# Then apply spatial join
hospital_sa2 = gdf_hospital.sjoin(sf, how='inner', predicate='within')

hospital_df = hospital_sa2.groupby('SA2_CODE21')['geometry'].count()
hospital_df = hospital_df.to_frame().reset_index()
hospital_df.rename(columns = {'SA2_CODE21':'SA2_code', 'geometry':'hos_count'}, inplace = True)

### Domain Rental Prices

In [8]:
df_domain = pd.read_json('../data/raw/example.json')
df_domain = df_domain.T
df_domain = df_domain.drop('desc', axis = 1)
df_domain = df_domain[df_domain['rooms'].astype(bool)]
df_domain = df_domain.reset_index(drop = True)

bed = []
bath = []
parking = []
Longitude = []
Latitude = []

for i in df_domain.index:
    for r in ((',', ''), ('$', ''), ('.00', '')):
        df_domain['cost_text'][i] = df_domain['cost_text'][i].replace(*r)
    # convert room col: [bed, bath, parking] to individual col
    bed.append(df_domain['rooms'][i][0][0])
    bath.append(df_domain['rooms'][i][1][0]) 
    if len(df_domain['rooms'][i]) == 3:    # if no parking info, default no parkings      
        parking.append(df_domain['rooms'][i][2][0])
    else:     
        parking.append('0')
    Longitude.append(df_domain['coordinates'][i][0])
    Latitude.append(df_domain['coordinates'][i][1])
    
df_domain['Prices'] = df_domain['cost_text'].str.extract('(\d+)') 
df_domain['Prices'] = pd.to_numeric(df_domain['Prices'])
df_domain['Bedrooms'] = np.array(bed).tolist()
df_domain['Bathrooms'] = np.array(bath).tolist()
df_domain['Parkings'] = np.array(parking).tolist()
df_domain['Longitude'] = Longitude
df_domain['Latitude'] = Latitude

df_domain = df_domain.drop('rooms', axis = 1)
df_domain = df_domain.drop('cost_text', axis = 1)
df_domain = df_domain.dropna().reset_index(drop = True)
df_domain = df_domain.drop('coordinates', axis = 1)

# remove extreme prices
df_domain = df_domain[(df_domain['Prices'] > 100) & (df_domain['Prices'] < 10000)]
df_domain = df_domain.reset_index(drop = True)

In [9]:
# build geodataframe
gpd_domain = gpd.GeoDataFrame(
    df_domain, geometry=gpd.points_from_xy(df_domain.Latitude, df_domain.Longitude))

# standardlize crs
gpd_domain.crs = "EPSG:4326"
sf.crs = "EPSG:4326"

gpd_domain_final = gpd_domain.sjoin(sf, how="inner", predicate='within')
gpd_domain_final = gpd_domain_final.reset_index(drop = True)

gpd_domain_final['SA2_code'] = gpd_domain_final['SA2_CODE21']

gpd_domain_final = pd.merge(gpd_domain_final, df_sub, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, stops2_sa2, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, hospital_df, on='SA2_code')
gpd_domain_final = pd.merge(gpd_domain_final, school_df, on='SA2_code')
#gpd_domain_final = pd.merge(gpd_domain_final, df_income, on='SA2_code')
#df_income['SA2_CODE21'] = df_income['sa2_maincode_2016'] # 2016-2021, should uwe 2021 later

In [16]:
att = ['name', 'Prices', 'Bedrooms', 'Bathrooms', 'Parkings', 'geometry_x', 'SA2_code', 'hos_count', 'school_count',
       'Longitude', 'Latitude', 'stop_count', 'SA2 name', '2021', 'change %']
gpd_domain_final = gpd_domain_final[att]
gpd_domain_final

Unnamed: 0,name,Prices,Bedrooms,Bathrooms,Parkings,geometry_x,SA2_code,hos_count,school_count,Longitude,Latitude,stop_count,SA2 name,2021,change %
0,50 South Wharf Drive Docklands VIC 3008,3750.0,5,6,3,POINT (144.93824 -37.82240),206041118,8,1,-37.822397,144.938237,1,Docklands,15942,157.3
1,907/2 Glenti Place Docklands VIC 3008,1200.0,3,2,2,POINT (144.93909 -37.81540),206041118,8,1,-37.815405,144.939092,1,Docklands,15942,157.3
2,91/55 Victoria Harbour Promenade Docklands VIC...,1150.0,3,2,2,POINT (144.94274 -37.81956),206041118,8,1,-37.819562,144.942741,1,Docklands,15942,157.3
3,28 Cumberland Street Docklands VIC 3008,1100.0,2,2,1,POINT (144.94271 -37.81968),206041118,8,1,-37.819681,144.942705,1,Docklands,15942,157.3
4,18 Aquitania Way Docklands VIC 3008,1015.0,1,1,1,POINT (144.94299 -37.81358),206041118,8,1,-37.813575,144.942994,1,Docklands,15942,157.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,4/80 Airlie Road Montmorency VIC 3094,800.0,3,2,2,POINT (145.12211 -37.72416),209011202,4,7,-37.724159,145.122108,1,Montmorency - Briar Hill,16658,3
842,26B Wallace Avenue Murrumbeena VIC 3163,800.0,3,2,2,POINT (145.06790 -37.90438),208021181,5,2,-37.904382,145.067900,2,Murrumbeena,9471,5.4
843,26B Wallace Avenue Murrumbeena VIC 3163,800.0,3,2,2,POINT (145.06790 -37.90438),208021181,5,2,-37.904382,145.067900,2,Murrumbeena,9471,5.4
844,1A & 1B/101-103 Main Road West St Albans VIC 3021,786.0,1,1,9,POINT (144.81124 -37.75620),213011335,4,6,-37.756201,144.811242,2,St Albans - South,17666,3.4


In [11]:
#distance.distance((gpd_domain_final['Longitude'][0], gpd_domain_final['Latitude'][0]), 
                       #gpd_domain_final['Longitude'][1], gpd_domain_final['Latitude'][1])