### This notebook calculates the driving distance from each individual property to the Melbourne CBD using an online routing service via API calls
Created by Yuecheng Wang, 12-09-2024

In [1]:
import pandas as pd 
from tqdm import tqdm

In [2]:
# import scripts for api calculation
import sys
import os

# Make the shared scripts folder importable by putting it on sys.path.
# The membership check keeps re-runs of this cell from adding duplicates.
scripts_path = os.path.abspath('../../scripts')

if sys.path.count(scripts_path) == 0:
    sys.path.append(scripts_path)

from api_function import parse_coordinate, calculate_distance_car

In [3]:
# Destination point used for every route: the Melbourne CBD.
# Source: Google search for the Melbourne CBD coordinates (37.8136° S, 144.9631° E).
# South latitude is written as a negative number; the order is [latitude, longitude].
melb_coord = [
    -37.8136,  # latitude
    144.9631,  # longitude
]

In [4]:
# Input dataset: raw listings stored under data/raw/domain.
file_path = "../../data/raw/domain/all_postcodes.csv"
domain_data = pd.read_csv(file_path)

# Routes are calculated with OSRM (https://project-osrm.org/).
# NOTE(review): this variable is not passed to calculate_distance_car anywhere
# in this notebook, and the requests logged further down go to
# http://router.project-osrm.org — confirm whether it is still needed.
api_base_url = "https://project-osrm.org"

In [5]:
# Summarise the loaded listings: column names, non-null counts and dtypes.
domain_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8002 entries, 0 to 8001
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Address                       8002 non-null   object 
 1   Cost                          8002 non-null   object 
 2   Bedrooms                      7979 non-null   float64
 3   Bathrooms                     7979 non-null   float64
 4   Coordinates                   8002 non-null   object 
 5   Closest Gov Secondary School  6907 non-null   object 
 6   Gov Secondary Distance        6907 non-null   object 
 7   Age under 20                  7802 non-null   object 
 8   Age 20-39                     7802 non-null   object 
 9   Age 40-59                     7802 non-null   object 
 10  Age 60+                       7802 non-null   object 
 11  Postcode                      8002 non-null   int64  
dtypes: float64(2), int64(1), object(9)
memory usage: 750.3+ KB


As all coordinates are present, we use all of the data. \
First, try with the first 5 rows.

In [6]:
# Work on an independent copy of the first five rows so the trial run
# below never touches domain_data itself.
sample_properties = domain_data.iloc[:5].copy()

In [7]:
# Route each sample row to the CBD (one helper call per row), then attach
# the resulting Series as a new column.
sample_distances = sample_properties.apply(
    calculate_distance_car,
    axis=1,
    des_coords=melb_coord,
)
sample_properties['CBD distance(km)'] = sample_distances

In [8]:
# Display the five sample rows, including the 'CBD distance(km)' column.
sample_properties

Unnamed: 0,Address,Cost,Bedrooms,Bathrooms,Coordinates,Closest Gov Secondary School,Gov Secondary Distance,Age under 20,Age 20-39,Age 40-59,Age 60+,Postcode,CBD distance(km)
0,"901/22-40 Wills Street, Melbourne VIC 3000",$600 per week,1.0,1.0,"[-37.8107551, 144.9570001]",University High School,1.5 km away,8%,77%,12%,3%,3000,0.9271
1,"1207/270 King Street, Melbourne VIC 3000",$720 per week,2.0,2.0,"[-37.8136918, 144.9548583]",University High School,1.9 km away,6%,83%,10%,1%,3000,0.9381
2,"5809/442 ELIZABETH STREET, Melbourne VIC 3000",$850 Per Week ( Fully Furnished),2.0,1.0,"[-37.8084101, 144.9607759]",University High School,1.3 km away,3%,90%,7%,0%,3000,0.6115
3,"2112/80 A'beckett Street, Melbourne VIC 3000",$700 per week,2.0,2.0,"[-37.8089991, 144.9610792]",University High School,1.4 km away,3%,90%,7%,0%,3000,0.5422
4,"1210/81 A'beckett Street, Melbourne VIC 3000",$650 weekly,2.0,1.0,"[-37.8092536, 144.961181]",University High School,1.4 km away,6%,79%,12%,3%,3000,0.5421


This works fine. Now do the same with the full dataset to add the CBD distance for every property.

In [9]:
tqdm.pandas()

# Routing every row through the public OSRM server took roughly 2h46m in the
# recorded run, so reuse the previously saved result when it exists and has
# one row per listing. Delete the file to force a fresh calculation.
cbd_distance_cache = '../../data/raw/domain/cbd_distance.csv'
distance_column = 'CBD distance(km)'

CBD_distance_df = None
if os.path.exists(cbd_distance_cache):
    # round_trip keeps the floats identical to the values that were written.
    cached_df = pd.read_csv(cbd_distance_cache, float_precision='round_trip')
    # The saved file has no key column, so rows can only be matched by
    # position; accept the cache only if the row count agrees.
    if distance_column in cached_df.columns and len(cached_df) == len(domain_data):
        CBD_distance_df = cached_df[[distance_column]]
        print(f"Loaded cached distances from {cbd_distance_cache}")

if CBD_distance_df is not None:
    CBD_distance = CBD_distance_df[distance_column]
else:
    # One API call per row, with a progress bar.
    # NOTE(review): the recorded run logged "400 Bad Request" for rows whose
    # source coordinate was sent as 0.0,0.0. What calculate_distance_car
    # returns for those rows is not visible here — check them before merging.
    CBD_distance = domain_data.progress_apply(calculate_distance_car, des_coords=melb_coord, axis=1)

    # Store distance as independent file for merge later
    CBD_distance_df = CBD_distance.to_frame(name=distance_column)

  3%|██▋                                                                           | 274/8002 [05:38<2:35:46,  1.21s/it]

Error fetching route data: 400 Client Error: Bad Request for url: http://router.project-osrm.org/route/v1/driving/0.0,0.0;144.9631,-37.8136?overview=false&alternatives=false&steps=false&annotations=distance


 99%|████████████████████████████████████████████████████████████████████████████ | 7910/8002 [2:44:47<01:51,  1.21s/it]

Error fetching route data: 400 Client Error: Bad Request for url: http://router.project-osrm.org/route/v1/driving/0.0,0.0;144.9631,-37.8136?overview=false&alternatives=false&steps=false&annotations=distance


100%|█████████████████████████████████████████████████████████████████████████████| 8002/8002 [2:46:46<00:00,  1.25s/it]


Once finished, store the result in a new dataset.

In [11]:
# Persist the distance column on its own. The row index is not written
# (index=False), so the rows line up with all_postcodes.csv by position only.
cbd_distance_path = '../../data/raw/domain/cbd_distance.csv'
CBD_distance_df.to_csv(cbd_distance_path, index=False)