### This file calculates the distance from each property to its closest train station
('train station' here means Metropolitan Train and Regional Train stops combined)

Created by Yuecheng Wang 12 Sept 2024

In [11]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
# Import the project's helper functions used for the distance calculation
import sys
import os

# Add the scripts folder to the Python path so `api_function` can be imported.
# The relative path is resolved against the current working directory.
scripts_path = os.path.abspath('../../scripts')

# Only append when missing, so re-running this cell does not add duplicate entries
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from api_function import calculate_distance_car, find_closest_station

In [3]:
# Load the raw PTV stop tables for metropolitan and regional trains
metro_train = pd.read_parquet('../../data/raw/PTV/un_preprocess/Metropolitan Train.parquet')
region_train = pd.read_parquet('../../data/raw/PTV/un_preprocess/Regional Train.parquet')

In [4]:
# Stack both stop tables into one; ignore_index=True renumbers the rows from 0
combined_train = pd.concat([metro_train, region_train], ignore_index=True)

In [5]:
# Preview the first rows of the combined station table
combined_train.head(5)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,15351,Sunbury Railway Station (Sunbury),-37.579091,144.727319
1,15353,Diggers Rest Railway Station (Diggers Rest),-37.627017,144.719922
2,19827,Stony Point Railway Station (Crib Point),-38.374235,145.221837
3,19828,Crib Point Railway Station (Crib Point),-38.366123,145.204043
4,19829,Morradoo Railway Station (Crib Point),-38.354033,145.189602


In [6]:
# Read in the preprocessed Domain property data
file_path = "../../data/raw/domain/all_properties_preprocessed.csv"
domain_data = pd.read_csv(file_path)

Test on the first 5 rows before running on the full dataset.

In [7]:
# Take the first five properties as a small sample for a trial run
sample_domain = domain_data.head(5)

In [8]:
# Trial run on the sample: for each property, look up the closest station and
# the route distance to it, printing both so the results can be eyeballed.
route_distances = []
for index, row in sample_domain.iterrows():
    property_coords = (row['Latitude'], row['Longitude'])

    # The helper receives a copy of the station table, never the shared frame itself.
    closest_station = find_closest_station(property_coords, combined_train.copy())

    if not isinstance(closest_station, int):
        # Coordinates of the closest station found for this property
        station_coords = (closest_station['stop_lat'], closest_station['stop_lon'])

        # Route distance from the property to that station (via the API helper)
        route_distance = calculate_distance_car(row, station_coords)

        print(closest_station, route_distance, "------", sep="\n")
        route_distances.append(route_distance)
    else:
        # An int result is treated as "invalid coordinates": record a placeholder
        print(f"Skipping property at index {index} due to invalid coordinates.")
        route_distances.append(-1)

stop_id                                  19958
stop_name    Prahran Railway Station (Prahran)
stop_lat                            -37.849518
stop_lon                             144.98986
distance                           2595.233457
Name: 131, dtype: object
4.264
------
stop_id                                                20043
stop_name    Southern Cross Railway Station (Melbourne City)
stop_lat                                          -37.818334
stop_lon                                          144.952525
distance                                         2423.963443
Name: 228, dtype: object
2.9979
------
stop_id                                                20043
stop_name    Southern Cross Railway Station (Melbourne City)
stop_lat                                          -37.818334
stop_lon                                          144.952525
distance                                         2601.832507
Name: 228, dtype: object
3.3571999999999997
------
stop_id                    

Now run the calculation on all of the data.

In [13]:
# Compute the route distance from every property to its closest train station.
# The work is split into parts and each finished part is cached in its own CSV,
# so an interrupted run can resume without repeating the API calls for parts
# that already completed.
num_parts = 20
output_relative_dir = '../../data/raw/domain/'

# The part files are written into this folder, so it must exist before the loop.
os.makedirs(output_relative_dir, exist_ok=True)

# Split by row position instead of handing the DataFrame to np.array_split,
# which emits a deprecation warning on recent pandas. The part sizes are the
# same as np.array_split(domain_data, num_parts) would produce.
part_positions = np.array_split(np.arange(len(domain_data)), num_parts)
split_data = [domain_data.iloc[positions] for positions in part_positions]

# List to store paths of temporary files (in part order)
temp_files = []

# Process each part separately
for i, part in enumerate(split_data):
    temp_file = f'../../data/raw/domain/train_distance_part_{i+1}.csv'

    # Reuse the cached result if this part was already processed
    if os.path.exists(temp_file):
        print(f"File for Part {i+1} already exists. Skipping processing for this part.")
        temp_files.append(temp_file)
        continue

    route_distances = []

    # Iterate through the part and calculate distances
    for index, row in tqdm(part.iterrows(), total=len(part), desc=f"Processing Part {i+1}"):
        property_coords = (row['Latitude'], row['Longitude'])
        closest_train = find_closest_station(property_coords, combined_train.copy())

        # An int result is treated as "invalid coordinates": store the -1
        # placeholder so the output keeps one row per property.
        if isinstance(closest_train, int):
            print(f"Skipping property at index {index} due to invalid coordinates.")
            route_distances.append(-1)
            continue

        # Use the station found for THIS property. Reading any other variable here
        # (e.g. one left over from the trial cell) would route every property to
        # the same station and break on a fresh kernel.
        train_coords = (closest_train['stop_lat'], closest_train['stop_lon'])
        route_distance = calculate_distance_car(row, train_coords)
        route_distances.append(route_distance)

    # Create a DataFrame for the current part's distances
    route_distance_df = pd.DataFrame(route_distances, columns=['route_distance_km'])

    # Save the current part to a temporary file
    route_distance_df.to_csv(temp_file, index=False)
    temp_files.append(temp_file)

    # Print completion message for each part
    print(f"Completed processing Part {i+1}/{num_parts}.")

# Combine all parts into one table, renumbering rows so they follow the part order
combined_df = pd.concat([pd.read_csv(file) for file in temp_files], ignore_index=True)

# Cached part files from an earlier run with a different num_parts or a different
# input file would silently misalign the distances with the properties.
assert len(combined_df) == len(domain_data), (
    f"Combined distances have {len(combined_df)} rows but domain_data has "
    f"{len(domain_data)}; delete stale train_distance_part_*.csv files and re-run."
)

# Save the combined DataFrame
combined_file = '../../data/raw/domain/train_distance.csv'
combined_df.to_csv(combined_file, index=False)


  return bound(*args, **kwds)
Processing Part 1: 100%|██████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.43s/it]


Completed processing Part 1/2000.


Processing Part 2:   0%|                                                                          | 0/4 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Delete the per-part temporary files once the combined CSV exists on disk.
# Refuse to delete anything if the combined file is missing (e.g. the processing
# cell was interrupted), because the part files are the only cache of the API results.
if not os.path.exists(combined_file):
    raise FileNotFoundError(
        f"{combined_file} not found; keeping temporary files so processing can resume."
    )

for file in temp_files:
    # Skip files that are already gone so this cell can safely be re-run
    if os.path.exists(file):
        os.remove(file)

print(f"Combined file saved at {combined_file}. Temporary files deleted.")