# Data Cleaning

In [13]:
import pandas as pd
import requests
import numpy as np
import re
import os
import json
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
from geopy import distance
import openrouteservice as ors
import time

## SA2

In [14]:
# Columns retained from the SA2 boundary shapefile: area code, area name
# and the boundary geometry.
SA2_info = ['SA2_CODE21', 'SA2_NAME21', 'geometry']

# Read the shapefile directory and immediately narrow it to those columns.
SA2_shape = gpd.read_file("../data/landing/SA2-shapefile/")[SA2_info]
SA2_shape

Unnamed: 0,SA2_CODE21,SA2_NAME21,geometry
0,101021007,Braidwood,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,"POLYGON ((149.21899 -35.36738, 149.218 -35.366..."
2,101021009,Queanbeyan,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,Queanbeyan - East,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,Queanbeyan West - Jerrabomberra,"POLYGON ((149.19572 -35.36126, 149.1997 -35.35..."
...,...,...,...
2468,901031003,Jervis Bay,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556..."
2469,901041004,Norfolk Island,"MULTIPOLYGON (((167.96325 -29.07212, 167.96326..."
2470,997979799,Migratory - Offshore - Shipping (OT),
2471,999999499,No usual address (OT),


In [15]:
# Label the SA2 geometries as EPSG:4326 (WGS84 longitude/latitude).
# set_crs(..., allow_override=True) replaces only the CRS metadata, exactly as
# the earlier direct `.crs` assignment did; no coordinates are transformed.
# The canonical "EPSG:4326" spelling (no embedded space) is used so the
# authority code is parsed unambiguously.
# NOTE(review): if the shapefile declares a different CRS, reprojecting with
# to_crs("EPSG:4326") is probably what is wanted here; confirm against the
# source .prj file.
SA2_shape = SA2_shape.set_crs("EPSG:4326", allow_override=True)
SA2_shape

Unnamed: 0,SA2_CODE21,SA2_NAME21,geometry
0,101021007,Braidwood,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,"POLYGON ((149.21899 -35.36738, 149.218 -35.366..."
2,101021009,Queanbeyan,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,Queanbeyan - East,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,Queanbeyan West - Jerrabomberra,"POLYGON ((149.19572 -35.36126, 149.1997 -35.35..."
...,...,...,...
2468,901031003,Jervis Bay,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556..."
2469,901041004,Norfolk Island,"MULTIPOLYGON (((167.96325 -29.07212, 167.96326..."
2470,997979799,Migratory - Offshore - Shipping (OT),
2471,999999499,No usual address (OT),


## Income

### Table 1

In [86]:
# Load sheet "Table 1" of the income workbook; row 6 (0-based) of the sheet
# supplies the column names.
inc_tab1 = pd.read_excel(
    "../data/landing/income/income.xlsx",
    sheet_name="Table 1",
    header=6,
)

# Remove the trailing description rows (index labels 19229 to 19234).
inc_tab1 = inc_tab1.drop(index=list(range(19229, 19235)))

In [87]:
# Discard the rows for the unused years 2011, 2018 and 2019; every other row
# is kept.
inc_tab1 = inc_tab1.loc[
    lambda frame: ~frame['Year'].isin([2011.0, 2018.0, 2019.0])
]
print(inc_tab1)

            Code            Label    Year Employee income earners (no.)  \
1              0        Australia  2014.0                      11508888   
2              0        Australia  2015.0                      12035235   
3              0        Australia  2016.0                      12171794   
4              0        Australia  2017.0                      12442032   
8              1  New South Wales  2014.0                       3566969   
...          ...              ...     ...                           ...   
19219  901031003       Jervis Bay  2017.0                             -   
19223  901041004   Norfolk Island  2014.0                            89   
19224  901041004   Norfolk Island  2015.0                            71   
19225  901041004   Norfolk Island  2016.0                           112   
19226  901041004   Norfolk Island  2017.0                             -   

      Employee income earners - median age (years) Total employee income ($m)  \
1                 

### Table 2

In [88]:
# Load sheet "Table 2" of the income workbook; row 6 (0-based) of the sheet
# supplies the column names.
inc_tab2 = pd.read_excel(
    "../data/landing/income/income.xlsx",
    sheet_name="Table 2",
    header=6,
)

# Remove the rows with index labels 3808 to 3813 at the end of the sheet.
inc_tab2 = inc_tab2.drop(index=list(range(3808, 3814)))
inc_tab2

Unnamed: 0,Code,Label,Year,Employee income earners (no.),Employee income earners - median age (years),Total employee income ($m),Median employee income ($),Mean employee income ($),Employee income as main source of income (%),Own unincorporated business income earners (no.),...,Mean household net worth ($),Persons earning $1-$499 per week (%),Persons earning $500-$999 per week (%),Persons earning $1000-$1999 per week (%),Persons earning $2000-$2999 per week (%),Persons earning $3000 or more per week (%),Persons earning nil income (%),Persons with a negative income (%),Income inadequately described or not stated (%),Median equivalised total household income (weekly) ($)
0,10050,Albury (C),2011.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,690
1,10050,Albury (C),2014.0,24502,40,1184.1,42894,48327,77.3,3825,...,-,-,-,-,-,-,-,-,-,-
2,10050,Albury (C),2015.0,25423,39,1259.8,44000,49555,76.7,3916,...,-,-,-,-,-,-,-,-,-,-
3,10050,Albury (C),2016.0,25821,39,1313.2,45382,50857,76.8,3985,...,783.8,30.2,27.1,21.3,3.3,1.6,6,0.4,10,776
4,10050,Albury (C),2017.0,25987,39,1346.7,46612,51820,77.2,3932,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3803,99399,Unincorp. Other Territories,2015.0,936,44,66.3,72951,70840,76.8,147,...,-,-,-,-,-,-,-,-,-,-
3804,99399,Unincorp. Other Territories,2016.0,682,41,46.4,66031,68002,73.7,86,...,-,21.8,23.1,20.9,5.5,1.6,6.7,0.4,20.3,839
3805,99399,Unincorp. Other Territories,2017.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3806,99399,Unincorp. Other Territories,2018.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [89]:
# Discard the rows for the years 2011, 2018 and 2019; every other row is kept.
inc_tab2 = inc_tab2.loc[
    lambda frame: ~frame['Year'].isin([2011.0, 2018.0, 2019.0])
]
print(inc_tab2)

       Code                        Label    Year  \
1     10050                   Albury (C)  2014.0   
2     10050                   Albury (C)  2015.0   
3     10050                   Albury (C)  2016.0   
4     10050                   Albury (C)  2017.0   
8     10130        Armidale Regional (A)  2014.0   
...     ...                          ...     ...   
3798  89399           Unincorporated ACT  2017.0   
3802  99399  Unincorp. Other Territories  2014.0   
3803  99399  Unincorp. Other Territories  2015.0   
3804  99399  Unincorp. Other Territories  2016.0   
3805  99399  Unincorp. Other Territories  2017.0   

     Employee income earners (no.)  \
1                            24502   
2                            25423   
3                            25821   
4                            25987   
8                            13174   
...                            ...   
3798                        237194   
3802                          1003   
3803                           93

## Population

### Table 1 (SA2)

In [133]:
# Load sheet "Table 1" of the population workbook, treating sheet rows 5 and 6
# (0-based) as a two-level column header.
pop_tab1 = pd.read_excel(
    "../data/landing/population/population.xlsx",
    sheet_name="Table 1",
    header=[5, 6],
)

# Collapse the two header levels into one label per column: use the
# second-level label when the first-level one is an 'Unnamed...' placeholder,
# otherwise keep the first-level label.
pop_tab1.columns = [
    lower if str(upper).startswith('Unnamed') else upper
    for upper, lower in pop_tab1.columns
]

# Drop the SA3 and SA4 code/name columns.
pop_tab1 = pop_tab1.drop(columns=["SA3 code", "SA3 name", "SA4 code", "SA4 name"])
pop_tab1

Unnamed: 0,GCCSA code,GCCSA name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1RNSW,Rest of NSW,101021007.0,Braidwood,2760.0,2811.0,2835.0,2844.0,2847.0,2965.0,...,3762.0,3849.0,3950.0,4041.0,4145.0,4218.0,4282.0,4332.0,4366.0,4396.0
1,1RNSW,Rest of NSW,101021008.0,Karabar,9129.0,9199.0,9263.0,9277.0,9209.0,9212.0,...,8731.0,8603.0,8531.0,8530.0,8516.0,8500.0,8535.0,8548.0,8528.0,8483.0
2,1RNSW,Rest of NSW,101021009.0,Queanbeyan,9717.0,9513.0,9522.0,9400.0,9595.0,9682.0,...,11199.0,11213.0,11230.0,11362.0,11460.0,11468.0,11460.0,11375.0,11391.0,11420.0
3,1RNSW,Rest of NSW,101021010.0,Queanbeyan - East,3925.0,4073.0,4219.0,4218.0,4187.0,4319.0,...,4967.0,4961.0,4970.0,5016.0,5079.0,5126.0,5089.0,5097.0,5091.0,5099.0
4,1RNSW,Rest of NSW,101021012.0,Queanbeyan West - Jerrabomberra,9425.0,10257.0,11085.0,11549.0,12046.0,12358.0,...,13193.0,13164.0,13150.0,13090.0,13022.0,12955.0,12821.0,12748.0,12781.0,12873.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,9OTER,Other Territories,901021002.0,Cocos (Keeling) Islands,600.0,568.0,558.0,573.0,588.0,590.0,...,556.0,555.0,546.0,552.0,553.0,591.0,608.0,603.0,616.0,631.0
2452,9OTER,Other Territories,901031003.0,Jervis Bay,542.0,464.0,441.0,428.0,413.0,386.0,...,361.0,367.0,402.0,398.0,386.0,367.0,335.0,309.0,307.0,307.0
2453,9OTER,Other Territories,901041004.0,Norfolk Island,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1757.0,1845.0,1938.0,2015.0,2102.0,2221.0,2211.0,2209.0
2454,,,,Total Australia,19274701.0,19495210.0,19720737.0,19932722.0,20176844.0,20450966.0,...,23475686.0,23815995.0,24190907.0,24592588.0,24963258.0,25334826.0,25649248.0,25685412.0,26014399.0,26648878.0


In [129]:
# Flatten the two-level Excel header into one label per column: use the
# second-level label when the first-level one is an 'Unnamed...' placeholder,
# otherwise keep the first-level label.
#
# The MultiIndex guard makes this cell safe to run (or re-run) after the
# columns have already been flattened. Without it, each flat label would be
# indexed as if it were a (level0, level1) tuple, which truncates string
# labels to their first character and raises on labels that cannot be indexed.
if isinstance(pop_tab1.columns, pd.MultiIndex):
    pop_tab1.columns = [
        lower if str(upper).startswith('Unnamed') else upper
        for upper, lower in pop_tab1.columns
    ]

In [131]:
# Show the population table so its column layout can be inspected.
pop_tab1

Unnamed: 0,GCCSA code,GCCSA name,SA4 code,SA4 name,SA3 code,SA3 name,SA2 code,SA2 name,2001,2002,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021007.0,Braidwood,2760.0,2811.0,...,3762.0,3849.0,3950.0,4041.0,4145.0,4218.0,4282.0,4332.0,4366.0,4396.0
1,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021008.0,Karabar,9129.0,9199.0,...,8731.0,8603.0,8531.0,8530.0,8516.0,8500.0,8535.0,8548.0,8528.0,8483.0
2,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021009.0,Queanbeyan,9717.0,9513.0,...,11199.0,11213.0,11230.0,11362.0,11460.0,11468.0,11460.0,11375.0,11391.0,11420.0
3,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021010.0,Queanbeyan - East,3925.0,4073.0,...,4967.0,4961.0,4970.0,5016.0,5079.0,5126.0,5089.0,5097.0,5091.0,5099.0
4,1RNSW,Rest of NSW,101.0,Capital Region,10102.0,Queanbeyan,101021012.0,Queanbeyan West - Jerrabomberra,9425.0,10257.0,...,13193.0,13164.0,13150.0,13090.0,13022.0,12955.0,12821.0,12748.0,12781.0,12873.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,9OTER,Other Territories,901.0,Other Territories,90102.0,Cocos (Keeling) Islands,901021002.0,Cocos (Keeling) Islands,600.0,568.0,...,556.0,555.0,546.0,552.0,553.0,591.0,608.0,603.0,616.0,631.0
2452,9OTER,Other Territories,901.0,Other Territories,90103.0,Jervis Bay,901031003.0,Jervis Bay,542.0,464.0,...,361.0,367.0,402.0,398.0,386.0,367.0,335.0,309.0,307.0,307.0
2453,9OTER,Other Territories,901.0,Other Territories,90104.0,Norfolk Island,901041004.0,Norfolk Island,0.0,0.0,...,0.0,0.0,1757.0,1845.0,1938.0,2015.0,2102.0,2221.0,2211.0,2209.0
2454,,,,,,,,Total Australia,19274701.0,19495210.0,...,23475686.0,23815995.0,24190907.0,24592588.0,24963258.0,25334826.0,25649248.0,25685412.0,26014399.0,26648878.0


## School Zone

## PTV

In [13]:
# Load the shapes table from the first PTV data folder and show it.
shape_1 = pd.read_csv(filepath_or_buffer="../data/landing/ptv/1/1/shapes.txt")
shape_1


Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1-ABY-mjp-10.1.H,-36.084262,146.924527,1,0.00
1,1-ABY-mjp-10.1.H,-36.085028,146.924381,2,86.20
2,1-ABY-mjp-10.1.H,-36.088625,146.923734,3,490.37
3,1-ABY-mjp-10.1.H,-36.090905,146.923265,4,747.44
4,1-ABY-mjp-10.1.H,-36.091681,146.923064,5,835.51
...,...,...,...,...,...
869753,1-WBL-mjp-9.5.R,-37.811439,144.945707,1352,274128.87
869754,1-WBL-mjp-9.5.R,-37.813874,144.948934,1353,274520.89
869755,1-WBL-mjp-9.5.R,-37.814151,144.949289,1354,274564.74
869756,1-WBL-mjp-9.5.R,-37.816379,144.950966,1355,274852.95


In [15]:
# Load the stops table and keep only the stop name and its latitude/longitude.
stops_1 = pd.read_csv("../data/landing/ptv/1/1/stops.txt")[
    ["stop_name", "stop_lat", "stop_lon"]
]

In [16]:
# Show the trimmed stops table (stop name, latitude, longitude).
stops_1

Unnamed: 0,stop_name,stop_lat,stop_lon
0,Wallan Railway Station (Wallan),-37.416861,145.005372
1,Melton Railway Station (Melton South),-37.703359,144.572216
2,Rockbank Railway Station (Rockbank),-37.729261,144.650631
3,Deer Park Railway Station (Deer Park),-37.777764,144.772304
4,Sunbury Railway Station (Sunbury),-37.579206,144.728165
...,...,...,...
104,Cobblebank Railway Station (Cobblebank),-37.712546,144.604108
105,Raywood Railway Station (Raywood),-36.531959,144.201161
106,Huntly Railway Station (Huntly),-36.665848,144.369820
107,Goornong Railway Station (Goornong),-36.615183,144.503474


In [17]:
# Turn the stops table into a GeoDataFrame with one point per stop, built from
# (stop_lon, stop_lat), i.e. x = longitude and y = latitude.
# The CRS is declared explicitly as EPSG:4326, the same CRS this notebook
# assigns to SA2_shape, so spatial operations between the two layers do not
# run against a CRS-less frame.
# NOTE(review): stops.txt looks like a GTFS feed, whose coordinates are WGS84
# by specification; confirm for this data source.
gdf_stops_1 = gpd.GeoDataFrame(
    stops_1,
    geometry=gpd.points_from_xy(stops_1['stop_lon'], stops_1['stop_lat']),
    crs="EPSG:4326",
)

In [18]:
# Show the stops GeoDataFrame, including the point geometry column.
gdf_stops_1

Unnamed: 0,stop_name,stop_lat,stop_lon,geometry
0,Wallan Railway Station (Wallan),-37.416861,145.005372,POINT (145.00537 -37.41686)
1,Melton Railway Station (Melton South),-37.703359,144.572216,POINT (144.57222 -37.70336)
2,Rockbank Railway Station (Rockbank),-37.729261,144.650631,POINT (144.65063 -37.72926)
3,Deer Park Railway Station (Deer Park),-37.777764,144.772304,POINT (144.7723 -37.77776)
4,Sunbury Railway Station (Sunbury),-37.579206,144.728165,POINT (144.72816 -37.57921)
...,...,...,...,...
104,Cobblebank Railway Station (Cobblebank),-37.712546,144.604108,POINT (144.60411 -37.71255)
105,Raywood Railway Station (Raywood),-36.531959,144.201161,POINT (144.20116 -36.53196)
106,Huntly Railway Station (Huntly),-36.665848,144.369820,POINT (144.36982 -36.66585)
107,Goornong Railway Station (Goornong),-36.615183,144.503474,POINT (144.50347 -36.61518)
