### This file calculates the route distance from each property to its nearest tourist attraction

Created by Yuecheng Wang 13-09-2024

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Make the project's helper scripts importable, then load the API helpers
import sys
import os

# Add the scripts folder to the Python path
scripts_path = os.path.abspath('../../scripts')

# Only append when missing so re-running this cell does not duplicate the entry
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# api_function is presumably located in the scripts folder added above
from api_function import parse_coordinate, calculate_distance_car, find_closest_tour

In [3]:
# Load the tourist attraction dataset
tour = pd.read_csv('../../data/raw/three_external/tourist attraction_data.csv')

In [4]:
# Preview the first 5 rows
tour.head(5)

Unnamed: 0,NAME,FEATSUBTYP,latitude,longitude
0,THE ABATTOIRS,tourist attraction,-37.880153,144.978421
1,J WARD,tourist attraction,-37.278665,142.93027
2,LABASSA MANSION,tourist attraction,-37.869771,145.009807
3,HERONSWOOD,tourist attraction,-38.344039,144.943934
4,PIRRA HOMESTEAD,tourist attraction,-38.009345,144.410613


In [5]:
# Read in the preprocessed property data
file_path = "../../data/raw/domain/all_properties_preprocessed.csv"
domain_data = pd.read_csv(file_path)

In [6]:
# Take the first 5 rows as a small sample for a trial run
sample_domain = domain_data.head(5)

In [7]:
# Trial run on the sample: for every property, look up the closest tourist
# attraction and the driving distance to it, printing both for inspection.
route_distances = []
for index, row in sample_domain.iterrows():
    property_coords = (row['Latitude'], row['Longitude'])

    # A fresh copy of the attraction table is passed on every call
    # (presumably find_closest_tour adds a working column -- TODO confirm).
    closest_tour = find_closest_tour(property_coords, tour.copy())

    if isinstance(closest_tour, int):
        # An int result is treated as the "invalid coordinates" signal
        print(f"Skipping property at index {index} due to invalid coordinates.")
        route_distances.append(-1)  # Store a placeholder value
        continue

    # Get closest tour coordinates
    tour_coords = (closest_tour['latitude'], closest_tour['longitude'])

    # Calculate route distance using OSRM API
    route_distance = calculate_distance_car(row, tour_coords)
    # Record the result so route_distances stays aligned with sample_domain
    # (one entry per row, same as the full processing loop).
    route_distances.append(route_distance)

    print(closest_tour)
    print(route_distance)
    print("------")

NAME          OLD MELBOURNE GAOL
FEATSUBTYP    tourist attraction
latitude              -37.807776
longitude             144.965371
distance              807.942051
Name: 275, dtype: object
1.3194000000000001
------
NAME          MELBOURNE AQUARIUM
FEATSUBTYP    tourist attraction
latitude              -37.821043
longitude             144.958429
distance               874.46471
Name: 5, dtype: object
0.8704
------
NAME          OLD MELBOURNE GAOL
FEATSUBTYP    tourist attraction
latitude              -37.807776
longitude             144.965371
distance              410.755304
Name: 275, dtype: object
0.9574
------
NAME          OLD MELBOURNE GAOL
FEATSUBTYP    tourist attraction
latitude              -37.807776
longitude             144.965371
distance              401.617801
Name: 275, dtype: object
0.8882000000000001
------
NAME          OLD MELBOURNE GAOL
FEATSUBTYP    tourist attraction
latitude              -37.807776
longitude             144.965371
distance              403.8192

In [8]:
# Compute the route distance from every property to its nearest tourist
# attraction. The dataset is processed in chunks and each finished chunk is
# saved to its own CSV, so an interrupted run can resume without redoing
# chunks that are already on disk.
num_parts = 20

# Split by row position instead of handing the DataFrame to np.array_split,
# which emits a pandas FutureWarning (deprecated DataFrame.swapaxes).
# The chunk boundaries are the same either way.
row_positions = np.array_split(np.arange(len(domain_data)), num_parts)
split_data = [domain_data.iloc[positions] for positions in row_positions]

# Make sure the output directory exists *before* any part file is written
output_relative_dir = '../../data/raw/domain/'
os.makedirs(output_relative_dir, exist_ok=True)

# List to store paths of temporary files
temp_files = []

# Process each part separately
for i, part in enumerate(split_data):
    temp_file = os.path.join(output_relative_dir, f'tour_distance_part_{i+1}.csv')

    # A part file is only written once its chunk is complete, so an existing
    # file means this chunk can be skipped.
    if os.path.exists(temp_file):
        print(f"File for Part {i+1} already exists. Skipping processing for this part.")
        temp_files.append(temp_file)
        continue

    route_distances = []

    # Iterate through the part and calculate distances
    for index, row in tqdm(part.iterrows(), total=len(part), desc=f"Processing Part {i+1}"):
        property_coords = (row['Latitude'], row['Longitude'])

        # A fresh copy of the attraction table is passed on every call
        # (presumably find_closest_tour adds a working column -- TODO confirm).
        closest_tour = find_closest_tour(property_coords, tour.copy())

        if isinstance(closest_tour, int):
            # An int result is treated as the "invalid coordinates" signal
            print(f"Skipping property at index {index} due to invalid coordinates.")
            route_distances.append(-1)  # Store a placeholder value
            continue

        # Get closest tour coordinates
        tour_coords = (closest_tour['latitude'], closest_tour['longitude'])

        # Calculate route distance using OSRM API
        route_distance = calculate_distance_car(row, tour_coords)
        route_distances.append(route_distance)

    # Create a DataFrame for the current part's distances
    route_distance_df = pd.DataFrame(route_distances, columns=['route_distance_km'])

    # Save the current part to a temporary file
    route_distance_df.to_csv(temp_file, index=False)
    temp_files.append(temp_file)

    # Print completion message for each part
    print(f"Completed processing Part {i+1}/{num_parts}.")

# Combine all parts into one large file; ignore_index gives the combined
# frame a single continuous index instead of repeating 0..n for every part.
combined_df = pd.concat([pd.read_csv(file) for file in temp_files], ignore_index=True)

# Save the combined DataFrame
combined_file = os.path.join(output_relative_dir, 'tour_distance.csv')
combined_df.to_csv(combined_file, index=False)


  return bound(*args, **kwds)
Processing Part 1:   5%|██▉                                                            | 19/401 [00:24<08:14,  1.29s/it]


KeyboardInterrupt: 

In [None]:
# Delete the temporary files
for file in temp_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already removed (e.g. this cell was re-run); nothing left to clean up
        pass

print(f"Combined file saved at {combined_file}. Temporary files deleted.")