### This notebook calculates the distance from each property to its closest train station
('train station' here means Metropolitan Train and Regional Train stations combined)

Created by Yuecheng Wang 12 Sept 2024

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Make the project's API helper functions importable from this notebook
import os
import sys

# Absolute location of the scripts folder, resolved from the current working directory
scripts_path = os.path.abspath('../../scripts')

# Register the folder only when it is absent, so re-running the cell adds no duplicates
if sys.path.count(scripts_path) == 0:
    sys.path.append(scripts_path)

from api_function import parse_coordinate, calculate_distance_car, find_closest_station

In [3]:
# Load the un-preprocessed PTV stop tables for the two rail networks
metro_train = pd.read_parquet(path='../../data/raw/PTV/un_preprocess/Metropolitan Train.parquet')
region_train = pd.read_parquet(path='../../data/raw/PTV/un_preprocess/Regional Train.parquet')

In [4]:
# Stack both station tables row-wise and renumber the index from zero
combined_train = pd.concat(objs=(metro_train, region_train), axis=0, ignore_index=True)

In [5]:
# Preview the first five stations (head() defaults to 5 rows)
combined_train.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,15351,Sunbury Railway Station (Sunbury),-37.579091,144.727319
1,15353,Diggers Rest Railway Station (Diggers Rest),-37.627017,144.719922
2,19827,Stony Point Railway Station (Crib Point),-38.374235,145.221837
3,19828,Crib Point Railway Station (Crib Point),-38.366123,145.204043
4,19829,Morradoo Railway Station (Crib Point),-38.354033,145.189602


In [6]:
# Read in the domain data (all postcodes) from the raw data folder
file_path = "../../data/raw/domain/all_postcodes.csv"
domain_data = pd.read_csv(filepath_or_buffer=file_path)

Test on the first 5 rows first.

In [7]:
# Keep only the first five rows for a quick trial run
sample_domain = domain_data.iloc[:5]

In [8]:
# Trial run on the sample rows: find the nearest station for each property,
# request the driving distance to it, and echo both for a visual check.
route_distances = []
for idx, prop in sample_domain.iterrows():
    coords = parse_coordinate(prop['Coordinates'])
    nearest = find_closest_station(coords, combined_train.copy())

    if not isinstance(nearest, int):
        # Valid lookup: route from the property to the station's latitude/longitude
        target = (nearest['stop_lat'], nearest['stop_lon'])

        # Route distance comes from the OSRM API helper
        distance = calculate_distance_car(prop, target)

        print(nearest)
        print(distance)
        print("------")
    else:
        # An int result is treated as the invalid-coordinates signal
        print(f"Skipping property at index {idx} due to invalid coordinates.")
        distance = -1  # placeholder stored in place of a real distance

    # Exactly one entry per row, in row order
    route_distances.append(distance)

stop_id                                           19841
stop_name    Flagstaff Railway Station (Melbourne City)
stop_lat                                     -37.811981
stop_lon                                     144.955654
distance                                     180.495113
Name: 16, dtype: object
0.6166
------
stop_id                                           19841
stop_name    Flagstaff Railway Station (Melbourne City)
stop_lat                                     -37.811981
stop_lon                                     144.955654
distance                                     202.361641
Name: 16, dtype: object
0.5685
------
stop_id                                                  19842
stop_name    Melbourne Central Railway Station (Melbourne C...
stop_lat                                            -37.809939
stop_lon                                            144.962594
distance                                            233.254239
Name: 17, dtype: object
1.4196
------
stop_id    

Now run on all the data.

In [9]:
# Compute the driving distance from every property to its nearest train station.
# Exactly one entry is appended to `route_distances` per row of `domain_data`,
# in row order, so the list lines up positionally with the source table.
route_distances = []
for index, row in tqdm(domain_data.iterrows(), total=len(domain_data), desc="Processing Properties"):
    property_coords = parse_coordinate(row['Coordinates'])

    # A fresh copy is passed on every call; presumably the helper modifies the
    # frame it receives (e.g. adds a distance column) -- TODO confirm before hoisting.
    closest_station = find_closest_station(property_coords, combined_train.copy())

    if isinstance(closest_station, int):
        # An int result is treated as the invalid-coordinates signal.
        # tqdm.write is used instead of print so the message does not break
        # the rendering of the active progress bar.
        tqdm.write(f"Skipping property at index {index} due to invalid coordinates.")
        route_distances.append(-1)  # Store a placeholder value
        continue

    # Get closest station coordinates
    station_coords = (closest_station['stop_lat'], closest_station['stop_lon'])

    # Calculate route distance using OSRM API
    route_distance = calculate_distance_car(row, station_coords)

    # Store results
    route_distances.append(route_distance)

Processing Properties:   3%|█▊                                                     | 272/8002 [05:40<2:40:10,  1.24s/it]

Skipping property at index 272 due to invalid coordinates.


Processing Properties:  99%|█████████████████████████████████████████████████████▎| 7908/8002 [2:44:38<01:55,  1.23s/it]

Skipping property at index 7908 due to invalid coordinates.


Processing Properties: 100%|██████████████████████████████████████████████████████| 8002/8002 [2:46:33<00:00,  1.25s/it]


In [10]:
# Wrap the collected distances in a single-column frame; rows keep the order in
# which they were appended, and -1 marks properties with invalid coordinates
route_distance_df = pd.DataFrame(data=route_distances, columns=['route_distance_km'])

# Write the distances next to the domain data, leaving out the index column
route_distance_df.to_csv(path_or_buf='../../data/raw/domain/train_distance.csv', index=False)