### This notebook calculates the route distance from each property to its nearest park/reserve

Created by Yuecheng Wang 13-09-2024

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Make the project's API helper functions importable from this notebook
import sys
import os

# Absolute location of the shared scripts folder (resolved against the current working directory)
scripts_path = os.path.abspath('../../scripts')

# Register the folder on the module search path only once, so re-running the cell is harmless
if sys.path.count(scripts_path) == 0:
    sys.path.append(scripts_path)

from api_function import parse_coordinate, calculate_distance_car, find_closest_park

In [3]:
# Load the cleaned park/reserve table used for the nearest-park lookup
park_csv_path = '../../data/raw/three_external/park_data_cleaned.csv'
park = pd.read_csv(park_csv_path)

In [4]:
# Preview the first rows of the park table (head() shows 5 rows by default)
park.head()

Unnamed: 0,NAME,FEATSUBTYP,latitude,longitude
0,CATANI GARDENS,park,-37.861552,144.971729
1,LIONS PARK,park,-36.727705,146.962615
2,GILLIFLOWER STREET PARK,park,-37.820009,144.660309
3,ADRIAN DANAHER RESERVE,park,-37.836475,145.202112
4,BELEURA PARK,park,-38.221566,145.054413


In [5]:
# Read in the Domain property data
file_path = "../../data/raw/domain/all_postcodes.csv"
domain_data = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Take the first 5 properties as a small sample for a trial run
sample_domain = domain_data.iloc[:5]

In [7]:
# Trial run of the distance pipeline on the sampled properties before the full run.
# route_distances ends up with exactly one entry per sampled row.
route_distances = []
for index, row in sample_domain.iterrows():
    # Convert the raw 'Coordinates' field into the form expected by the helpers
    property_coords = parse_coordinate(row['Coordinates'])

    # A fresh copy of the park table is passed on every call -- presumably the helper
    # adds a per-call 'distance' column (it appears in the printed result); TODO confirm
    closest_park = find_closest_park(property_coords, park.copy())

    if isinstance(closest_park, int):
        # An int return value is treated as the "invalid coordinates" signal
        print(f"Skipping property at index {index} due to invalid coordinates.")
        route_distances.append(-1)  # Store a placeholder value
        continue

    # Get closest park coordinates
    park_coords = (closest_park['latitude'], closest_park['longitude'])

    # Calculate route distance using OSRM API
    route_distance = calculate_distance_car(row, park_coords)

    # Record the result; previously only the -1 placeholders were stored, so the
    # list did not line up with the sampled rows (the full-run loop does append)
    route_distances.append(route_distance)

    print(closest_park)
    print(route_distance)
    print("------")

NAME          FRANKLIN STREET RESERVE
FEATSUBTYP                       park
latitude                   -37.809221
longitude                  144.957916
distance                   188.378783
Name: 3818, dtype: object
0.3633
------
NAME          FRANKLIN STREET RESERVE
FEATSUBTYP                       park
latitude                   -37.809221
longitude                  144.957916
distance                   564.537138
Name: 3818, dtype: object
1.0145
------
NAME          FRANKLIN STREET RESERVE
FEATSUBTYP                       park
latitude                   -37.809221
longitude                  144.957916
distance                   267.472926
Name: 3818, dtype: object
0.4621
------
NAME          FRANKLIN STREET RESERVE
FEATSUBTYP                       park
latitude                   -37.809221
longitude                  144.957916
distance                   279.663046
Name: 3818, dtype: object
0.39280000000000004
------
NAME          FRANKLIN STREET RESERVE
FEATSUBTYP                   

In [8]:
# Compute the route distance to the nearest park for every property, in resumable
# batches: each finished batch is checkpointed to its own CSV so an interrupted run
# can be restarted without repeating completed API calls.

# Number of batches the dataset is split into
num_parts = 200

# Split row positions rather than the DataFrame itself: calling np.array_split
# directly on a DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. Splitting positions yields exactly the same batch boundaries.
position_chunks = np.array_split(np.arange(len(domain_data)), num_parts)
split_data = [domain_data.iloc[positions] for positions in position_chunks]

# Make sure the output directory exists BEFORE any batch file is written into it
output_relative_dir = '../../data/raw/domain/'
os.makedirs(output_relative_dir, exist_ok=True)

# List to store paths of temporary files
temp_files = []

# Process each part separately
for i, part in enumerate(split_data):
    temp_file = f'../../data/raw/domain/park_distance_part_{i+1}.csv'

    # Resume support: a part that already has a checkpoint file is not recomputed
    if os.path.exists(temp_file):
        print(f"File for Part {i+1} already exists. Skipping processing for this part.")
        temp_files.append(temp_file)
        continue

    route_distances = []

    # Iterate through the part and calculate distances
    for index, row in tqdm(part.iterrows(), total=len(part), desc=f"Processing Part {i+1}"):
        property_coords = parse_coordinate(row['Coordinates'])

        # A fresh copy of the park table is passed on every call -- presumably the
        # helper adds a per-call 'distance' column; TODO confirm
        closest_park = find_closest_park(property_coords, park.copy())

        if isinstance(closest_park, int):
            # An int return value is treated as the "invalid coordinates" signal
            print(f"Skipping property at index {index} due to invalid coordinates.")
            route_distances.append(-1)  # Store a placeholder value
            continue

        # Get closest park coordinates
        park_coords = (closest_park['latitude'], closest_park['longitude'])

        # Calculate route distance using OSRM API
        route_distance = calculate_distance_car(row, park_coords)
        route_distances.append(route_distance)

    # Create a DataFrame for the current part's distances
    route_distance_df = pd.DataFrame(route_distances, columns=['route_distance_km'])

    # Write to a scratch file first and rename afterwards, so an interruption during
    # the write can never leave a truncated checkpoint that the resume check above
    # would mistake for a completed part
    partial_file = temp_file + '.tmp'
    route_distance_df.to_csv(partial_file, index=False)
    os.replace(partial_file, temp_file)
    temp_files.append(temp_file)

    # Print completion message for each part
    print(f"Completed processing Part {i+1}/{num_parts}.")

# Combine all parts into one large file (row order follows the order of domain_data)
combined_df = pd.concat([pd.read_csv(file) for file in temp_files], ignore_index=True)

# The combined file is matched to properties purely by row order, so refuse to
# finalise (and keep the checkpoints) if the row counts do not agree
if len(combined_df) != len(domain_data):
    raise ValueError(
        f"Combined distances have {len(combined_df)} rows but domain_data has "
        f"{len(domain_data)}; temporary part files were kept for inspection."
    )

# Save the combined DataFrame
combined_file = '../../data/raw/domain/park_distance.csv'
combined_df.to_csv(combined_file, index=False)

# Delete the temporary files
for file in temp_files:
    os.remove(file)

print(f"Combined file saved at {combined_file}. Temporary files deleted.")

  return bound(*args, **kwds)
Processing Part 1: 100%|████████████████████████████████████████████████████████████████| 41/41 [01:14<00:00,  1.82s/it]


Completed processing Part 1/200.


Processing Part 2:   2%|█▌                                                               | 1/41 [00:02<01:59,  3.00s/it]


KeyboardInterrupt: 