### This notebook calculates the route distance from each property to its nearest library

Created by Yuecheng Wang 13-09-2024

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Expose the project's helper scripts to this notebook for the API calculations
import os
import sys

# Absolute location of the scripts folder, resolved from the working directory
scripts_path = os.path.abspath('../../scripts')

# Register the folder only once so re-running this cell adds no duplicates
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from api_function import (
    calculate_distance_car,
    find_closest_lib,
)

In [3]:
# Library locations that serve as the routing destinations
library_csv_path = '../../data/raw/three_external/library_data.csv'
lib = pd.read_csv(library_csv_path)

In [4]:
# Preview the first five library records (head() defaults to five rows)
lib.head()

Unnamed: 0,NAME,FEATSUBTYP,latitude,longitude
0,HORSHAM LIBRARY,library,-36.712876,142.2009
1,QUEENSCLIFF LIBRARY,library,-38.267751,144.661309
2,SOUTH OAKLEIGH LIBRARY,library,-37.927397,145.081978
3,LANCEFIELD LIBRARY,library,-37.276917,144.735522
4,ARARAT LIBRARY,library,-37.283718,142.934432


In [5]:
# Read in the preprocessed Domain property data
file_path = "../../data/raw/domain/all_properties_preprocessed.csv"
domain_data = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Use the first five properties as a small trial set
sample_domain = domain_data.iloc[:5]

In [7]:
# Smoke test: run the lookup + routing steps on the sample rows and print the
# intermediate results before committing to the full (slow) run.
route_distances = []
for index, row in sample_domain.iterrows():
    property_coords = (row['Latitude'], row['Longitude'])

    # A copy of `lib` is handed to the helper; presumably it adds a working
    # `distance` column (visible in the printed result) -- TODO confirm.
    closest_lib = find_closest_lib(property_coords, lib.copy())

    if isinstance(closest_lib, int):
        # An int return is treated as the helper's signal for invalid coordinates
        print(f"Skipping property at index {index} due to invalid coordinates.")
        route_distances.append(-1)  # Store a placeholder value
        continue

    # Get closest lib coordinates
    lib_coords = (closest_lib['latitude'], closest_lib['longitude'])

    # Calculate route distance using OSRM API
    route_distance = calculate_distance_car(row, lib_coords)

    # Record successful results too, so the list holds one entry per sampled row
    # (previously only the -1 placeholders were ever stored).
    route_distances.append(route_distance)

    print(closest_lib)
    print(route_distance)
    print("------")

NAME          STATE LIBRARY OF VICTORIA
FEATSUBTYP                      library
latitude                     -37.809782
longitude                    144.965189
distance                     729.126112
Name: 156, dtype: object
1.5615
------
NAME          LIBRARY ACCESS POINT LIBRARY
FEATSUBTYP                         library
latitude                        -37.819376
longitude                       144.947233
distance                        921.310017
Name: 26, dtype: object
1.7147000000000001
------
NAME          STATE LIBRARY OF VICTORIA
FEATSUBTYP                      library
latitude                     -37.809782
longitude                    144.965189
distance                     417.357474
Name: 156, dtype: object
1.1995
------
NAME          STATE LIBRARY OF VICTORIA
FEATSUBTYP                      library
latitude                     -37.809782
longitude                    144.965189
distance                      372.16768
Name: 156, dtype: object
1.1302999999999999
------
NAME  

In [8]:
# Compute the route distance to the nearest library for every property.
# The work is split into `num_parts` chunks; each finished chunk is written to
# its own CSV so an interrupted run resumes from the last completed chunk.
num_parts = 20

# Split positional row numbers instead of the DataFrame itself: np.array_split
# on a DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. The chunk sizes are the same as array_split gives for the frame.
row_chunks = np.array_split(np.arange(len(domain_data)), num_parts)
split_data = [domain_data.iloc[rows] for rows in row_chunks]

# Make sure the output folder exists *before* any part file is written into it
output_relative_dir = '../../data/raw/domain/'
os.makedirs(output_relative_dir, exist_ok=True)

# List to store paths of temporary files
temp_files = []

# Process each part separately
for i, part in enumerate(split_data):
    temp_file = f'../../data/raw/domain/lib_distance_part_{i+1}.csv'

    # Reuse the result of a previous run if this part was already completed
    if os.path.exists(temp_file):
        print(f"File for Part {i+1} already exists. Skipping processing for this part.")
        temp_files.append(temp_file)
        continue

    route_distances = []

    # Iterate through the part and calculate distances
    for index, row in tqdm(part.iterrows(), total=len(part), desc=f"Processing Part {i+1}"):
        property_coords = (row['Latitude'], row['Longitude'])

        closest_lib = find_closest_lib(property_coords, lib.copy())

        if isinstance(closest_lib, int):
            # An int return is treated as the helper's signal for invalid coordinates
            print(f"Skipping property at index {index} due to invalid coordinates.")
            route_distances.append(-1)  # Store a placeholder value
            continue

        # Get closest lib coordinates
        lib_coords = (closest_lib['latitude'], closest_lib['longitude'])

        # Calculate route distance using OSRM API
        route_distance = calculate_distance_car(row, lib_coords)
        route_distances.append(route_distance)

    # Create a DataFrame for the current part's distances
    route_distance_df = pd.DataFrame(route_distances, columns=['route_distance_km'])

    # Save the current part to a temporary file
    route_distance_df.to_csv(temp_file, index=False)
    temp_files.append(temp_file)

    # Print completion message for each part
    print(f"Completed processing Part {i+1}/{num_parts}.")

# Combine all parts into one frame; ignore_index gives a clean 0..n-1 index
# instead of repeating each part's own 0..k index.
combined_df = pd.concat([pd.read_csv(file) for file in temp_files], ignore_index=True)

# The output has no key column and lines up with `domain_data` purely by row
# position, so a stale or truncated part file would silently misalign it.
if len(combined_df) != len(domain_data):
    raise ValueError(
        f"Combined distances have {len(combined_df)} rows but the dataset has "
        f"{len(domain_data)}; delete outdated lib_distance_part_*.csv files and re-run."
    )

# Save the combined DataFrame
combined_file = '../../data/raw/domain/lib_distance.csv'
combined_df.to_csv(combined_file, index=False)


  return bound(*args, **kwds)
Processing Part 1:   2%|█                                                               | 7/401 [00:08<08:10,  1.25s/it]


KeyboardInterrupt: 

In [None]:
# Delete the temporary part files, but only once the combined output is on disk.
# Checking first prevents an interrupted or failed combine step from wiping the
# cached parts that the processing cell relies on to resume.
if not os.path.exists(combined_file):
    raise FileNotFoundError(
        f"{combined_file} not found; keeping temporary files so processing can resume."
    )

for file in temp_files:
    # Tolerate files that are already gone so this cell can be safely re-run
    if os.path.exists(file):
        os.remove(file)

print(f"Combined file saved at {combined_file}. Temporary files deleted.")