### Import libraries

In [1]:
import pandas as pd
import openrouteservice as ors
import numpy as np
import math
from top_3_nearest import LocationFinder
from geopy.distance import geodesic
import time
import os

### Find the minimum distance and the minimum duration to the nearest school for each rental property

In [2]:
# Create the LocationFinder helper (imported from the local top_3_nearest module).
# Its top_3_nearest() method is called once per rental further down to select
# the candidate schools that are sent to the routing service.
finder = LocationFinder()

In [3]:
def cal_distance_duration_batch_large(coordinates_list, client, batch_size=50):
    '''
    Compute driving distance and duration matrices for a list of coordinates,
    sending them to the routing client in consecutive batches.

    Each batch is sent as a single `client.distance_matrix` request in which
    every coordinate of the batch is listed as a destination. The result for a
    batch therefore only relates coordinates of that same batch; coordinates
    that fall into different batches are never compared with each other.

    Parameters
    ----------
    coordinates_list : sequence
        Coordinates, passed through unchanged as `locations`. The caller in
        this notebook supplies (longitude, latitude) tuples with the rental
        first, followed by its candidate schools.
    client : object
        Routing client exposing `distance_matrix(...)` (an `ors.Client` in
        this notebook).
    batch_size : int, default 50
        Maximum number of coordinates per request. The default keeps
        origins x destinations at 50 x 50 = 2500, below the 3500 limit this
        function was written for.

    Returns
    -------
    tuple of (list, list)
        `(all_distances, all_durations)`: the `'distances'` and `'durations'`
        rows returned for each batch, concatenated in input order.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_distances = []
    all_durations = []

    # Step through the input in strides of batch_size; slicing clamps the
    # final (possibly shorter) batch automatically.
    for batch_start in range(0, len(coordinates_list), batch_size):
        batch_coordinates = coordinates_list[batch_start:batch_start + batch_size]

        matrix = client.distance_matrix(
            locations=batch_coordinates,
            # Every location of the batch is also a destination.
            destinations=list(range(len(batch_coordinates))),
            profile='driving-car',
            metrics=['distance', 'duration'],
            validate=False,
        )

        all_distances.extend(matrix['distances'])
        all_durations.extend(matrix['durations'])

    return all_distances, all_durations

In [4]:
def cal_distance_duration_batch_small(coordinates_list, client, max_locations=25):
    '''
    Compute driving distance and duration matrices for a list of coordinates,
    using small requests of at most `max_locations` coordinates each.

    Each batch is sent as a single `client.distance_matrix` request in which
    every coordinate of the batch is listed as a destination. The result for a
    batch therefore only relates coordinates of that same batch; coordinates
    that fall into different batches are never compared with each other.

    Parameters
    ----------
    coordinates_list : sequence
        Coordinates, passed through unchanged as `locations`. The caller in
        this notebook supplies (longitude, latitude) tuples.
    client : object
        Routing client exposing `distance_matrix(...)` (an `ors.Client` in
        this notebook).
    max_locations : int, default 25
        Maximum number of coordinates per request.

    Returns
    -------
    tuple of (list, list)
        `(all_distances, all_durations)`: the `'distances'` and `'durations'`
        rows returned for each batch, concatenated in input order.

    Raises
    ------
    ValueError
        If `max_locations` is smaller than 1.
    '''
    if max_locations < 1:
        raise ValueError("max_locations must be a positive integer")

    all_distances = []
    all_durations = []

    # A batch never holds more than max_locations items, so one request per
    # batch is sufficient; no further sub-batching is needed.
    for batch_start in range(0, len(coordinates_list), max_locations):
        batch_coordinates = coordinates_list[batch_start:batch_start + max_locations]

        matrix = client.distance_matrix(
            locations=batch_coordinates,
            # Every location of the batch is also a destination.
            destinations=list(range(len(batch_coordinates))),
            profile='driving-car',
            metrics=['distance', 'duration'],
            validate=False,
        )

        all_distances.extend(matrix['distances'])
        all_durations.extend(matrix['durations'])

    return all_distances, all_durations

In [5]:
# Prepare the data
school = pd.read_csv("../data/curated/school_location_cleaned_2023.csv")
# Strip stray whitespace from the headers BEFORE renaming: rename() silently
# skips keys it cannot find, so a padded header such as ' X' would otherwise
# never become 'longitude'.
school.columns = school.columns.str.strip()
# X / Y hold the coordinates; expose them under the names used later in this notebook
school = school.rename(columns={'X': 'longitude', 'Y': 'latitude'})
# Persist the renamed table; this file path is what finder.top_3_nearest() is given later
school.to_csv('../data/curated/school_location_cleaned_2023_modified.csv', index=False)

rent_cleaned = pd.read_csv("../data/curated/rent_cleaned.csv")

In [6]:
# Number of rental records loaded from rent_cleaned.csv
data_num = rent_cleaned.shape[0]
print(data_num)

2962


### API keys needed for the OpenRouteService requests

In [7]:
# OpenRouteService API keys are read from the environment so that no
# credential is stored in the notebook. Before running, set ORS_API_KEYS to a
# comma-separated list of keys, e.g.  ORS_API_KEYS="key1,key2,key3".
# NOTE(review): the keys that used to be listed in this cell were committed in
# plain text and should be treated as compromised and revoked.
api_keys = [key.strip() for key in os.environ.get("ORS_API_KEYS", "").split(",") if key.strip()]
if not api_keys:
    raise RuntimeError(
        "No OpenRouteService API key found: set the ORS_API_KEYS environment "
        "variable to a comma-separated list of keys."
    )

In [8]:
# Select the first data_num rows (the entire dataset) and split them into
# consecutive, near-equal blocks of at most 200 rows each.
reduced_df = rent_cleaned.iloc[:data_num, :].copy()
# At least one block, so that an empty table does not make array_split raise.
n_blocks = max(1, int(np.ceil(len(reduced_df) / 200)))
# Split row POSITIONS rather than the DataFrame itself: handing a DataFrame to
# np.array_split goes through a deprecated pandas code path and emits a
# FutureWarning. The resulting block boundaries are the same.
rent_dfs = [
    reduced_df.iloc[positions].copy()  # independent copy: later cells add columns to each block
    for positions in np.array_split(np.arange(len(reduced_df)), n_blocks)
]

  return bound(*args, **kwds)


In [9]:
# --- Run state and configuration -------------------------------------------
processed_count = 0            # rentals processed so far, across all blocks
key_index = 0                  # position in api_keys of the key currently in use
client = ors.Client(key=api_keys[key_index])
max_requests_perkey = 400      # rentals to process with one key before rotating
total_requests = 0             # rentals processed with the current key
batch_size = 40                # pause after every `batch_size` processed rentals
csv_file = '../data/curated/nearst_school_info.csv'


def _min_positive(values):
    """Return the smallest strictly positive entry of `values`, or NaN if there is none.

    Zeros are ignored (row 0 includes the rental's distance to itself). None
    entries are skipped as well -- presumably the routing service reports None
    when it cannot find a route; confirm against the API documentation.
    Returning NaN instead of raising keeps one unroutable rental from
    aborting a whole block of rate-limited requests.
    """
    candidates = [value for value in values if value is not None and value > 0]
    return min(candidates) if candidates else float('nan')


# Loop through each block of rentals; results are appended to csv_file per block.
# NOTE(review): re-running this cell appends to an existing csv_file, so delete
# the file first if a clean result is wanted.
for df in rent_dfs:
    # Work on a private copy so adding columns never writes into a slice of another frame
    df = df.copy()
    df['id'] = df.index + 1
    df.columns = df.columns.str.strip()

    min_distances = []
    min_durations = []

    for index, rental in df.iterrows():
        latitude, longitude = rental['latitude'], rental['longitude']

        # Use top_3_nearest to find the 3 nearest schools
        top_3_schools = finder.top_3_nearest(latitude, longitude, '../data/curated/school_location_cleaned_2023_modified.csv')

        # Rental first, then the candidate schools, so that row 0 of the
        # returned matrices describes the rental -> every location
        schools = list(zip(top_3_schools['longitude'], top_3_schools['latitude']))
        all_coordinates = [(longitude, latitude)] + schools

        # Calculate distances and durations
        # NOTE(review): the list holds one rental plus its candidate schools, so
        # the else branch is presumably never taken -- confirm.
        if len(all_coordinates) <= 3500:
            distances, durations = cal_distance_duration_batch_large(all_coordinates, client)
        else:
            distances, durations = cal_distance_duration_batch_small(all_coordinates, client)

        # Nearest school = smallest positive entry in the rental's row
        min_distances.append(_min_positive(distances[0]))
        min_durations.append(_min_positive(durations[0]))

        # Update the processed counters
        processed_count += 1
        total_requests += 1

        # Rotate to the next key once the current one has served max_requests_perkey rentals
        if total_requests >= max_requests_perkey:
            key_index = (key_index + 1) % len(api_keys)
            client = ors.Client(key=api_keys[key_index])
            total_requests = 0
            # Log only the position of the key, never the secret itself
            print(f"switch to next API key (index {key_index})")

        # After processing every 40 rows, sleep for 70 seconds
        if processed_count % batch_size == 0:
            print(f"Processed {processed_count} rentals, resting for 70 seconds...")
            time.sleep(70)

    # Assign the lists to the DataFrame columns for the current block
    df['min_distance_to_school(m)'] = min_distances
    df['min_duration_to_school(s)'] = min_durations

    result_df = df[['id', 'min_distance_to_school(m)', 'min_duration_to_school(s)']]

    # Save: write the header only when the file is created, append afterwards
    if os.path.exists(csv_file):
        result_df.to_csv(csv_file, mode='a', index=False, header=False)
    else:
        result_df.to_csv(csv_file, mode='w', index=False, header=True)

Processed 40 rentals, resting for 70 seconds...
Processed 80 rentals, resting for 70 seconds...
Processed 120 rentals, resting for 70 seconds...
Processed 160 rentals, resting for 70 seconds...
Processed 200 rentals, resting for 70 seconds...
Processed 240 rentals, resting for 70 seconds...
Processed 280 rentals, resting for 70 seconds...
Processed 320 rentals, resting for 70 seconds...
Processed 360 rentals, resting for 70 seconds...
switch to next API key: [REDACTED]
Processed 400 rentals, resting for 70 seconds...
Processed 440 rentals, resting for 70 seconds...
Processed 480 rentals, resting for 70 seconds...
Processed 520 rentals, resting for 70 seconds...
Processed 560 rentals, resting for 70 seconds...
Processed 600 rentals, resting for 70 seconds...
Processed 640 rentals, resting for 70 seconds...
Processed 680 rentals, resting for 70 seconds...
Processed 720 rentals, resting for 70 seconds...
Processed 760 rentals, resting for 70 seconds...

In [10]:
# Show the id and nearest-school columns of the last processed block
result_columns = ['id', 'min_distance_to_school(m)', 'min_duration_to_school(s)']
print(df[result_columns])

        id  min_distance_to_school(m)  min_duration_to_school(s)
2765  2766                     178.78                      20.48
2766  2767                    1166.58                     122.82
2767  2768                    1588.22                     152.86
2768  2769                    1524.62                     203.11
2769  2770                    1506.79                     189.10
...    ...                        ...                        ...
2957  2958                     801.29                     108.71
2958  2959                     677.64                      90.91
2959  2960                     374.77                      89.94
2960  2961                     532.74                      91.65
2961  2962                     780.51                     122.86

[197 rows x 3 columns]
