# Suburb Proximity Preprocessing Part 1: Data Aggregation

In this notebook the data is aggregated and prepared so proximity data of train stations and distance to the CBD can be computed. Distance and time measures are measured by kilometers and seconds as travelled by car on the most direct route to given the destination from the center of the suburb in question.

Sections:
1. Train Data Aggregation
2. Feature Engineering

    2.1 No. Stations in a Suburb

    2.2 Suburb Centres

    2.3 Two Closest Stations

In [1]:
import pandas as pd
import numpy as np
import re
import geopandas as gpd
from shapely.geometry import Point, Polygon
import folium

# define constants
CBD_LAT = -37.8124
CBD_LNG = 144.962646

### 1 Train Data Aggregation
Gather all train station data into one dataset

In [2]:
# read in first train dataset - metropolitan stations
metropolitan_trains_df = pd.read_csv("../../data/landing/Annual_Metropolitan_Train_Station_Entries_2023-24.csv")
metropolitan_trains_df

Unnamed: 0,Fin_year,Stop_ID,Stop_name,Stop_lat,Stop_long,Pax_annual,Pax_weekday,Pax_norm_weekday,Pax_sch_hol_weekday,Pax_Saturday,Pax_Sunday,Pax_pre_AM_peak,Pax_AM_peak,Pax_interpeak,Pax_PM_peak,Pax_PM_late
0,FY23-24,19829,Morradoo,-38.354033,145.189603,3600,50,50,50,50,50,50,50,50,50,50
1,FY23-24,19831,Hastings,-38.305659,145.185980,14300,50,50,50,50,50,50,50,50,50,50
2,FY23-24,19832,Tyabb,-38.259815,145.186401,3250,50,50,50,50,50,50,50,50,50,50
3,FY23-24,19833,Somerville,-38.225342,145.176245,7400,50,50,50,50,50,50,50,50,50,50
4,FY23-24,19835,Glen Iris,-37.859308,145.058225,268850,850,900,650,550,350,50,400,150,250,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,FY23-24,19836,Leawarra,-38.152034,145.139534,5150,50,50,50,50,50,50,50,50,50,50
218,FY23-24,20028,Showgrounds,-37.783500,144.915000,138000,250,50,1150,650,500,50,50,50,50,50
219,FY23-24,19834,Baxter,-38.194043,145.160526,2550,50,50,50,50,50,50,50,50,50,50
220,FY23-24,19942,Glenhuntly,-37.889719,145.042223,524650,1600,1650,1450,1200,850,100,700,400,350,100


In [3]:
# extract relevant data
metropolitan_trains_df = pd.DataFrame(data=metropolitan_trains_df, columns=["Stop_name", "Stop_lat", "Stop_long"])
metropolitan_trains_df = metropolitan_trains_df.rename(columns={"Stop_name": "STOP_NAME", 
                                                                "Stop_lat": "LATITUDE", 
                                                                "Stop_long": "LONGITUDE"
                                                                })
metropolitan_trains_df

Unnamed: 0,STOP_NAME,LATITUDE,LONGITUDE
0,Morradoo,-38.354033,145.189603
1,Hastings,-38.305659,145.185980
2,Tyabb,-38.259815,145.186401
3,Somerville,-38.225342,145.176245
4,Glen Iris,-37.859308,145.058225
...,...,...,...
217,Leawarra,-38.152034,145.139534
218,Showgrounds,-37.783500,144.915000
219,Baxter,-38.194043,145.160526
220,Glenhuntly,-37.889719,145.042223


In [4]:
# read in second train dataset - regional stations
trains_regional_gdf = gpd.read_file(
    "../../data/landing/Order_L8LCPT/gda2020_vicgrid/esrishape/whole_of_dataset/victoria/PTV/PTV_REGIONAL_TRAIN_STATION.shp"
    )
trains_regional_gdf

Unnamed: 0,STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,geometry
0,17204,-37.416861,Wallan Railway Station (Wallan),145.005372,POINT (2500476.019 2453744.181)
1,19980,-37.703359,Melton Railway Station (Melton South),144.572216,POINT (2462279.390 2421864.847)
2,19981,-37.729261,Rockbank Railway Station (Rockbank),144.650631,POINT (2469204.367 2419018.408)
3,19982,-37.777764,Deer Park Railway Station (Deer Park),144.772304,POINT (2479942.450 2413667.849)
4,19998,-37.579206,Sunbury Railway Station (Sunbury),144.728165,POINT (2475991.284 2435693.792)
...,...,...,...,...,...
105,47642,-36.706342,Epsom Railway Station (Epsom),144.321040,POINT (2439342.475 2532369.037)
106,47647,-37.872886,Wyndham Vale Railway Station (Manor Lakes),144.608732,POINT (2465576.596 2403063.717)
107,47648,-37.832168,Tarneit Railway Station (Tarneit),144.694714,POINT (2473126.809 2407610.618)
108,48804,-37.712546,Cobblebank Railway Station (Cobblebank),144.604108,POINT (2465095.788 2420857.422)


In [5]:
# rename stop name in the trains_regional_gdf so that the station names match up
station_in_brackets = [re.search(r'\(([^()]+)', station).group(1).strip() for station in trains_regional_gdf["STOP_NAME"]]
sorted(station_in_brackets)
trains_regional_gdf["STOP_NAME"] = station_in_brackets
trains_regional_gdf

Unnamed: 0,STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,geometry
0,17204,-37.416861,Wallan,145.005372,POINT (2500476.019 2453744.181)
1,19980,-37.703359,Melton South,144.572216,POINT (2462279.390 2421864.847)
2,19981,-37.729261,Rockbank,144.650631,POINT (2469204.367 2419018.408)
3,19982,-37.777764,Deer Park,144.772304,POINT (2479942.450 2413667.849)
4,19998,-37.579206,Sunbury,144.728165,POINT (2475991.284 2435693.792)
...,...,...,...,...,...
105,47642,-36.706342,Epsom,144.321040,POINT (2439342.475 2532369.037)
106,47647,-37.872886,Manor Lakes,144.608732,POINT (2465576.596 2403063.717)
107,47648,-37.832168,Tarneit,144.694714,POINT (2473126.809 2407610.618)
108,48804,-37.712546,Cobblebank,144.604108,POINT (2465095.788 2420857.422)


In [6]:
# Merging the DataFrames
trains_merged_df = pd.merge(trains_regional_gdf, metropolitan_trains_df, on="STOP_NAME", suffixes=('_df1', '_df2'), how='outer')

# Choose lat and long from df1 if they are different
trains_merged_df["LATITUDE"] = trains_merged_df.apply(lambda row: row["LATITUDE_df1"] \
    if pd.notna(row["LATITUDE_df1"]) else row["LATITUDE_df2"], axis=1)
trains_merged_df["LONGITUDE"] = trains_merged_df.apply(lambda row: row["LONGITUDE_df1"] \
    if pd.notna(row["LONGITUDE_df1"]) else row["LONGITUDE_df2"], axis=1)

# Dropping the extra columns from the merge
trains_merged_df = trains_merged_df[["STOP_NAME","LATITUDE","LONGITUDE",]]
trains_merged_df

Unnamed: 0,STOP_NAME,LATITUDE,LONGITUDE
0,Aircraft,-37.866606,144.760809
1,Alamein,-37.868320,145.079656
2,Albion,-37.777653,144.824704
3,Albury,-36.084262,146.924515
4,Alphington,-37.778394,145.031255
...,...,...,...
316,Wodonga,-36.105827,146.871266
317,Woodend,-37.358799,144.525890
318,Yarragon,-38.203158,146.063063
319,Yarraman,-37.978255,145.191600


In [7]:
# drop stations that are repeated entries or outside of victoria
trains_merged_df = trains_merged_df.drop(trains_merged_df[trains_merged_df["STOP_NAME"]=="Melbourne City"].index[0])
trains_merged_df = trains_merged_df.drop(trains_merged_df[trains_merged_df["STOP_NAME"]=="Melbourne City"].index[0])
trains_merged_df = trains_merged_df.drop(trains_merged_df[trains_merged_df["STOP_NAME"]=="Albury"].index[0])
trains_merged_df

Unnamed: 0,STOP_NAME,LATITUDE,LONGITUDE
0,Aircraft,-37.866606,144.760809
1,Alamein,-37.868320,145.079656
2,Albion,-37.777653,144.824704
4,Alphington,-37.778394,145.031255
5,Altona,-37.867148,144.829645
...,...,...,...
316,Wodonga,-36.105827,146.871266
317,Woodend,-37.358799,144.525890
318,Yarragon,-38.203158,146.063063
319,Yarraman,-37.978255,145.191600


### 2 Feature Engineering

#### 2.2 No. Stations in a Suburb

In [8]:
# Read in SAL boundaries as supplied by the ABS dataset
SAL_gdf = gpd.read_file("../../data/landing/SAL_data/SAL_2021_AUST_GDA2020.shp")
SAL_gdf

Unnamed: 0,SAL_CODE21,SAL_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
0,10001,Aarons Pass,1,New South Wales,AUS,Australia,82.7639,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.554241,0.007975,"POLYGON ((149.82477 -32.84384, 149.83271 -32.8..."
1,10002,Abbotsbury,1,New South Wales,AUS,Australia,4.9788,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.123051,0.000485,"POLYGON ((150.86523 -33.88264, 150.86479 -33.8..."
2,10003,Abbotsford (NSW),1,New South Wales,AUS,Australia,1.0180,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.053423,0.000099,"POLYGON ((151.13472 -33.85492, 151.13445 -33.8..."
3,10004,Abercrombie,1,New South Wales,AUS,Australia,2.9775,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.097338,0.000289,"POLYGON ((149.55192 -33.39280, 149.55148 -33.3..."
4,10005,Abercrombie River,1,New South Wales,AUS,Australia,127.1701,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.848903,0.012397,"POLYGON ((149.25562 -33.96535, 149.25563 -33.9..."
...,...,...,...,...,...,...,...,...,...,...,...
15348,90004,Norfolk Island,9,Other Territories,AUS,Australia,38.6510,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.629774,0.003580,"MULTIPOLYGON (((167.94051 -29.06260, 167.94046..."
15349,90005,West Island,9,Other Territories,AUS,Australia,5.9276,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.416115,0.000492,"MULTIPOLYGON (((96.82779 -12.17627, 96.82773 -..."
15350,99494,No usual address (OT),9,Other Territories,AUS,Australia,0.0000,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.000000,0.000000,
15351,99797,Migratory - Offshore - Shipping (OT),9,Other Territories,AUS,Australia,0.0000,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.000000,0.000000,


In [9]:
# filter out all non Victorian border
SAL_gdf = SAL_gdf[SAL_gdf["STE_NAME21"]== "Victoria"]
SAL_gdf = SAL_gdf[SAL_gdf["geometry"]!=None] # remove those codes without SAL borders
SAL_gdf = SAL_gdf.rename(columns={"SAL_CODE21": "SAL_CODE_2021"})
SAL_gdf

Unnamed: 0,SAL_CODE_2021,SAL_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
4544,20001,Abbeyard,2,Victoria,AUS,Australia,327.5008,http://linked.data.gov.au/dataset/asgsed3/SAL/...,1.207678,0.033162,"POLYGON ((146.89824 -37.04602, 146.89947 -37.0..."
4545,20002,Abbotsford (Vic.),2,Victoria,AUS,Australia,1.7405,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.092990,0.000178,"POLYGON ((145.00195 -37.79665, 145.00190 -37.7..."
4546,20003,Aberfeldie,2,Victoria,AUS,Australia,1.5515,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.059374,0.000159,"POLYGON ((144.89576 -37.76514, 144.89547 -37.7..."
4547,20004,Aberfeldy,2,Victoria,AUS,Australia,10.8319,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.183199,0.001107,"POLYGON ((146.38814 -37.72232, 146.38808 -37.7..."
4548,20005,Acheron,2,Victoria,AUS,Australia,72.6602,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.514040,0.007381,"POLYGON ((145.76731 -37.25433, 145.76757 -37.2..."
...,...,...,...,...,...,...,...,...,...,...,...
7483,22940,Yundool,2,Victoria,AUS,Australia,31.6396,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.256626,0.003174,"POLYGON ((145.86040 -36.28432, 145.86038 -36.2..."
7484,22941,Yuroke,2,Victoria,AUS,Australia,8.8851,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.158951,0.000906,"POLYGON ((144.85250 -37.55800, 144.85303 -37.5..."
7485,22942,Yuulong,2,Victoria,AUS,Australia,52.1657,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.447416,0.005404,"POLYGON ((143.32185 -38.68969, 143.32203 -38.6..."
7486,22943,Zeerust,2,Victoria,AUS,Australia,18.0294,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.211342,0.001808,"POLYGON ((145.40454 -36.25294, 145.40479 -36.2..."


In [10]:
SAL_geoJSON = SAL_gdf.to_json()

In [11]:
# create map of SAL borders
sal_map = folium.Map(location=[CBD_LAT, CBD_LNG], zoom_start=10)

sal_map.add_child(folium.Choropleth(
    geo_data=SAL_geoJSON,
    name="choropleth",
    fill_color="green",
))
for _, row in trains_merged_df.iterrows():
    folium.Marker(
        location=[row["LATITUDE"], row["LONGITUDE"]],
        popup=row["STOP_NAME"],
        icon=folium.Icon(color="blue")  # Set marker color
    ).add_to(sal_map)

In [12]:
# Convert the DataFrame of station coordinates to a GeoDataFrame
geometry = [Point(xy) for xy in zip(trains_merged_df['LONGITUDE'], trains_merged_df['LATITUDE'])]
trains_gdf = gpd.GeoDataFrame(trains_merged_df, geometry=geometry, crs="EPSG:4326")

trains_gdf

Unnamed: 0,STOP_NAME,LATITUDE,LONGITUDE,geometry
0,Aircraft,-37.866606,144.760809,POINT (144.76081 -37.86661)
1,Alamein,-37.868320,145.079656,POINT (145.07966 -37.86832)
2,Albion,-37.777653,144.824704,POINT (144.82470 -37.77765)
4,Alphington,-37.778394,145.031255,POINT (145.03125 -37.77839)
5,Altona,-37.867148,144.829645,POINT (144.82964 -37.86715)
...,...,...,...,...
316,Wodonga,-36.105827,146.871266,POINT (146.87127 -36.10583)
317,Woodend,-37.358799,144.525890,POINT (144.52589 -37.35880)
318,Yarragon,-38.203158,146.063063,POINT (146.06306 -38.20316)
319,Yarraman,-37.978255,145.191600,POINT (145.19160 -37.97825)


In [13]:
# Perform a spatial join to find stations within all SAL suburbs
points_within_polygons = gpd.sjoin(trains_gdf, SAL_gdf, how="inner", op='within')

# Count how many stations that fall within each SAL suburb
polygon_point_counts = points_within_polygons.groupby('index_right').size()

polygon_point_counts

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:7844

  points_within_polygons = gpd.sjoin(trains_gdf, SAL_gdf, how="inner", op='within')


index_right
4545    2
4564    1
4577    1
4578    2
4596    1
       ..
7348    1
7362    1
7376    1
7456    1
7460    1
Length: 244, dtype: int64

In [14]:
# append number of stations in each suburb to the SAL gdf
SAL_gdf['station_count_in_suburb'] = polygon_point_counts
SAL_gdf['station_count_in_suburb'] = SAL_gdf['station_count_in_suburb'].fillna(0)

#### 2.2 Suburb Centroids

In [15]:
# read suburbs that centres need to be recorded on
historical_rent_df = pd.read_csv('../../data/curated/historical_rent_cleaned.csv')
historical_rent_df

Unnamed: 0,Suburb Cluster,SAL_CODE_2021,SAL suburb,Suburb Group,Mar 2000 Count (of suburb group),Mar 2000 Median,Jun 2000 Count (of suburb group),Jun 2000 Median,Sep 2000 Count (of suburb group),Sep 2000 Median,...,Mar 2023 Count (of suburb group),Mar 2023 Median,Jun 2023 Count (of suburb group),Jun 2023 Median,Sep 2023 Count (of suburb group),Sep 2023 Median,Dec 2023 Count (of suburb group),Dec 2023 Median,Mar 2024 Count (of suburb group),Mar 2024 Median
0,Inner Melbourne,20018,Albert Park,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,...,796,545,740,550,730,600,720,600,671,650
1,Inner Melbourne,21677,Middle Park,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,...,796,545,740,550,730,600,720,600,671,650
2,Inner Melbourne,22345,St Kilda West,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,...,796,545,740,550,730,600,720,600,671,650
3,Inner Melbourne,20066,Armadale,Armadale,733,200,737,200,738,205,...,757,490,687,500,639,525,594,560,566,560
4,Inner Melbourne,20496,Carlton North,Carlton North,864,260,814,260,799,265,...,497,620,495,630,467,650,418,670,384,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,Other Regional Centres,22569,Traralgon,Traralgon,851,125,823,120,831,125,...,919,385,922,390,910,390,880,395,842,410
565,Other Regional Centres,22680,Wangaratta,Wanagaratta,705,125,671,125,631,130,...,535,380,555,390,565,390,593,395,580,400
566,Other Regional Centres,22698,Warragul,Warragul,385,130,367,135,382,135,...,507,440,542,450,558,450,543,460,541,470
567,Other Regional Centres,22710,Warrnambool,Warrnambool,1266,130,1229,135,1204,135,...,881,420,861,430,846,450,844,460,840,460


In [16]:
# filter through to the necessary columns
all_suburbs_df = historical_rent_df[["Suburb Cluster", "SAL suburb", "SAL_CODE_2021", "Suburb Group"]]
all_suburbs_df

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda
3,Inner Melbourne,Armadale,20066,Armadale
4,Inner Melbourne,Carlton North,20496,Carlton North
...,...,...,...,...
564,Other Regional Centres,Traralgon,22569,Traralgon
565,Other Regional Centres,Wangaratta,22680,Wanagaratta
566,Other Regional Centres,Warragul,22698,Warragul
567,Other Regional Centres,Warrnambool,22710,Warrnambool


In [17]:
# Check and align data types
all_suburbs_df['SAL_CODE_2021'] = all_suburbs_df['SAL_CODE_2021'].astype(str)
SAL_gdf['SAL_CODE_2021'] = SAL_gdf['SAL_CODE_2021'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_suburbs_df['SAL_CODE_2021'] = all_suburbs_df['SAL_CODE_2021'].astype(str)


In [18]:
# Merge all_suburbs_df with relevant columns from SAL_gdf based on the 'SAL Suburb code' column
all_suburbs_df = all_suburbs_df.merge(SAL_gdf[['SAL_CODE_2021','AREASQKM21', 'geometry', 'station_count_in_suburb']], \
    on='SAL_CODE_2021', how='left')
all_suburbs_df[all_suburbs_df["station_count_in_suburb"].isnull()]

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb
188,Outer Western Melbourne,Fieldstone,-1,Sydenham,,,


In [19]:
# drop Fieldstone has it has no SAL code therefore must be removed due to lack ABS data
all_suburbs_df = all_suburbs_df.drop(188)
all_suburbs_df

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0
...,...,...,...,...,...,...,...
564,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0
565,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0
566,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0
567,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0


In [20]:
all_suburbs_df = all_suburbs_df.reset_index()
all_suburbs_df

Unnamed: 0,index,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb
0,0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0
1,1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0
2,2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0
3,3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0
4,4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0
...,...,...,...,...,...,...,...,...
563,564,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0
564,565,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0
565,566,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0
566,567,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0


In [21]:
all_suburbs_df = all_suburbs_df.drop(columns="index")

#### 2.3 Two Closest Stations
In this section the two nearest stations.  This is done through calculating the euclidean distance of the two closest stations to the centre of the suburb. The distances and station names are records in the dataset.

In [22]:
all_suburbs_gdf = gpd.GeoDataFrame(all_suburbs_df)
all_suburbs_gdf

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0
...,...,...,...,...,...,...,...
563,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0
564,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0
565,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0
566,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0


In [23]:
all_suburbs_gdf['suburb_centroid'] = all_suburbs_gdf['geometry'].centroid
all_suburbs_gdf


  all_suburbs_gdf['suburb_centroid'] = all_suburbs_gdf['geometry'].centroid


Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,suburb_centroid
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0,POINT (144.96105 -37.84518)
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0,POINT (144.96202 -37.85115)
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0,POINT (144.97125 -37.85796)
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0,POINT (145.02023 -37.85666)
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0,POINT (144.96849 -37.78659)
...,...,...,...,...,...,...,...,...
563,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0,POINT (146.51978 -38.20334)
564,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0,POINT (146.31601 -36.36938)
565,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0,POINT (145.92997 -38.15372)
566,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0,POINT (142.49818 -38.36886)


In [24]:
def euclidean_distance(lat1, lon1, lat2, lon2):
    """ calculate eulcidean distance between two longitude/latitude coordinates """
    return np.sqrt((lat2 - lat1) ** 2 + (lon2 - lon1) ** 2)

def find_closest_stations(row, train_stations, num_stations=2):
    """ Compute distances from the current row to all train stations using Euclidean distance """
    distances = train_stations.apply(
        lambda x: euclidean_distance(row['suburb_centroid'].y, row['suburb_centroid'].x, x['LATITUDE'], x['LONGITUDE']),
        axis=1
    )
    
    # Get the indices of the closest stations
    closest_indices = distances.nsmallest(num_stations).index
    closest_stations = train_stations.loc[closest_indices]
    
    # Return a list of stop names and coordinates
    return closest_stations[['STOP_NAME', 'LATITUDE', 'LONGITUDE']].to_dict(orient='records')

# Apply the function to each row in historical_rent_df
all_suburbs_gdf['closest_stations'] = all_suburbs_gdf.apply(
    lambda row: find_closest_stations(row, trains_gdf), axis=1
)
all_suburbs_gdf

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,suburb_centroid,closest_stations
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0,POINT (144.96105 -37.84518),"[{'STOP_NAME': 'Flinders Street', 'LATITUDE': ..."
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0,POINT (144.96202 -37.85115),"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495..."
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0,POINT (144.97125 -37.85796),"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495..."
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0,POINT (145.02023 -37.85666),"[{'STOP_NAME': 'Armadale', 'LATITUDE': -37.856..."
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0,POINT (144.96849 -37.78659),"[{'STOP_NAME': 'Jewell', 'LATITUDE': -37.77498..."
...,...,...,...,...,...,...,...,...,...
563,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0,POINT (146.51978 -38.20334),"[{'STOP_NAME': 'Traralgon', 'LATITUDE': -38.19..."
564,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0,POINT (146.31601 -36.36938),"[{'STOP_NAME': 'Wangaratta', 'LATITUDE': -36.3..."
565,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0,POINT (145.92997 -38.15372),"[{'STOP_NAME': 'Warragul', 'LATITUDE': -38.165..."
566,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0,POINT (142.49818 -38.36886),"[{'STOP_NAME': 'Warrnambool', 'LATITUDE': -38...."


In [25]:
# Create separate columns for each closest station
def expand_closest_stations(row):
    """ Flatten the list of closest stations into individual columns """
    result = {}
    for i, station in enumerate(row['closest_stations']):
        for key in ['STOP_NAME', 'LATITUDE', 'LONGITUDE']:
            result[f'closest_station_{i+1}_{key}'] = station[key]
    return result

# Apply and merge the expanded closest stations data
expanded_stations = all_suburbs_gdf.apply(expand_closest_stations, axis=1)
expanded_stations_df = pd.DataFrame(expanded_stations.tolist())
# Concatenate the original DataFrame with the new columns
all_suburbs_gdf = pd.concat([all_suburbs_gdf, expanded_stations_df], axis=1)
all_suburbs_gdf

Unnamed: 0,Suburb Cluster,SAL suburb,SAL_CODE_2021,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,suburb_centroid,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,closest_station_1_LONGITUDE,closest_station_2_STOP_NAME,closest_station_2_LATITUDE,closest_station_2_LONGITUDE
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,"POLYGON ((144.95445 -37.83794, 144.95691 -37.8...",0.0,POINT (144.96105 -37.84518),"[{'STOP_NAME': 'Flinders Street', 'LATITUDE': ...",Flinders Street,-37.818305,144.966964,Southern Cross,-37.817936,144.951411
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,"POLYGON ((144.96135 -37.84558, 144.96136 -37.8...",0.0,POINT (144.96202 -37.85115),"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495...",Prahran,-37.849518,144.989860,Windsor,-37.856053,144.992035
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,"POLYGON ((144.97018 -37.85372, 144.97066 -37.8...",0.0,POINT (144.97125 -37.85796),"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495...",Prahran,-37.849518,144.989860,Windsor,-37.856053,144.992035
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.02485 -37.85093, 145.02545 -37.8...",2.0,POINT (145.02023 -37.85666),"[{'STOP_NAME': 'Armadale', 'LATITUDE': -37.856...",Armadale,-37.856452,145.019326,Toorak,-37.850774,145.013909
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.97259 -37.79270, 144.97135 -37.7...",0.0,POINT (144.96849 -37.78659),"[{'STOP_NAME': 'Jewell', 'LATITUDE': -37.77498...",Jewell,-37.774987,144.958717,Royal Park,-37.781193,144.952301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.56107 -38.18789, 146.56057 -38.1...",1.0,POINT (146.51978 -38.20334),"[{'STOP_NAME': 'Traralgon', 'LATITUDE': -38.19...",Traralgon,-38.198885,146.537882,Morwell,-38.236719,146.396753
564,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,"POLYGON ((146.31017 -36.38756, 146.31014 -36.3...",1.0,POINT (146.31601 -36.36938),"[{'STOP_NAME': 'Wangaratta', 'LATITUDE': -36.3...",Wangaratta,-36.355101,146.317038,Springhurst,-36.185893,146.470417
565,Other Regional Centres,Warragul,22698,Warragul,55.3164,"POLYGON ((145.94460 -38.12758, 145.94458 -38.1...",1.0,POINT (145.92997 -38.15372),"[{'STOP_NAME': 'Warragul', 'LATITUDE': -38.165...",Warragul,-38.165229,145.932674,Drouin,-38.136452,145.855947
566,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.46039 -38.33168, 142.47873 -38.3...",2.0,POINT (142.49818 -38.36886),"[{'STOP_NAME': 'Warrnambool', 'LATITUDE': -38....",Warrnambool,-38.385014,142.475545,Warrnambool,-38.386392,142.538871


In [26]:
#output the data
all_suburbs_gdf.to_csv("../../data/curated/station_data_aggregated.csv")