### Jupyter Notebook for partially joining domain and distance data
- This jupyter notebook aims to retrieve distance data from the housing to the closest amenities around the area.
- It uses open route services api to calculate both walking distance and car distance 
- Note that the limit for open route services api 

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import json
import re
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import requests
import time

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Open Route Services")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.port", "6066")
    .getOrCreate()
)

24/10/02 11:49:41 WARN Utils: Your hostname, Nigels-Mac-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.186 instead (on interface en0)
24/10/02 11:49:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 11:49:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Set Up your Open Route Service Key
- insert your json file with your ORS(open route service) key into the landing folder

In [2]:
# Load the config file
with open('../data/landing/openrouteservicekey.json') as config_file:
    config = json.load(config_file)

# Access the API key
api_key = config.get('api_key')

if not api_key:
    raise ValueError("API Key not found in openrouteservicekey.json.")

**This is prior code for one to one api calls**
- No longer used
- Simply kept here for reference

In [3]:
# # Profile: foot-walking, driving-car
# def openRouteDistance(start_coords, end_coords, profile):
#     # Define the OpenRouteService API endpoint for routing
#     url = 'https://api.openrouteservice.org/v2/directions/{profile}/geojson'

#     # Set up headers with your API key
#     headers = {
#         'Authorization': api_key,
#         'Content-Type': 'application/json'
#     }

#     # Create the payload with the start and end coordinates
#     payload = {
#         "coordinates": [
#             list(start_coords),
#             list(end_coords)
#         ]
#     }

#     # Make the request
#     response = requests.post(url.format(profile=profile), json=payload, headers=headers)

#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the response JSON
#         data = response.json()
        
#         # Extract distance (in meters) from the response
#         distance = data['features'][0]['properties']['segments'][0]['distance']
#         distance_km = distance / 1000  # Convert to kilometers
        
#         return (distance_km)
#     else:
#         return None

**Current Function to get matrix one to many distances**

In [4]:
# Function to get matrix distances
def get_matrix_distances(client, source, destinations, profile='foot-walking'):
    """Returns distances from the source to all destinations."""
    distance = []
    coords = [source] + destinations  # Combine source and destination coordinates
    matrix = client.distance_matrix(
        locations=coords,
        profile=profile,
        sources=[0],  # Index of the house (source)
        destinations=list(range(1, len(coords)))  # Indexes of the amenities (destinations)
    )
    
    for destination in matrix['destinations']:
        try:
            distance.append(destination['snapped_distance'])
        except:
            distance.append(None)

    return distance # Return distances for the first source (house)

In [5]:
# Reading data
cleaned_domain_data = spark.read.parquet("../data/raw/cleaned_domain_current_listings.parquet")
VIC_amenities = spark.read.parquet("../data/raw/VIC_amenities")

                                                                                

In [6]:
# This finds all unique amenities an separate all into individual pyspark dfs
amenity_dfs = {}
unique_types = [row['amenity'] for row in VIC_amenities.select('amenity').distinct().collect()]

for type_value in unique_types:
    filtered_df = VIC_amenities.filter(VIC_amenities.amenity == type_value)
    amenity_dfs[type_value] = filtered_df

                                                                                

**Total amount of data**

In [7]:
cleaned_domain_data.count()

13328

In [8]:
cleaned_domain_data.count()/5

2665.6

# The only thing needed to be changed

In [9]:
# There is only 13328 data and everyone should do around 2666 data to have it all completed
# Just copy paste the data range for each person 
# Chin : [0,2666]
# Nigel : [2666, 5332]
# Cinque : [5332, 7998]
# Malachy : [7998, 10664]
# Jun : [10664, 13328]

data_range = [4262, 5332]

**Getting distance from Open Route Services API**

In [10]:
import openrouteservice
import pandas as pd
import sys
from openrouteservice.exceptions import ApiError
from requests.exceptions import HTTPError

# Convert to Pandas DataFrame
domain_pd_df = cleaned_domain_data.toPandas()[data_range[0]:data_range[1]]
client = openrouteservice.Client(key=api_key)

# Function to calculate distance
def calculate_distance(row, amenities_df):
    house_lat, house_lon = row['latitude'], row['longitude']
    amenities_df['Distance'] = amenities_df.apply(
        lambda x: great_circle((house_lat, house_lon), (x['latitude'], x['longitude'])).km,
        axis=1
    )
    return amenities_df.loc[amenities_df['Distance'].idxmin()]

# Function to handle API rate limits and retries
def get_matrix_distances_with_retries(client, start_coords, end_coords, profile, max_retries=3):
    for attempt in range(max_retries):
        try:
            return get_matrix_distances(client, start_coords, end_coords, profile)
        except ApiError  as e:
            if e.status_code in [502, 503, 504]:
                print(f"Bad Gateway Error ({e.status_code}). Retrying ({attempt + 1}/{max_retries})...")
                time.sleep(2)  # Wait before retrying
            elif e.status_code == 403:  # Quota exceeded (rate limiting)
                print("API quota reached. Exiting program...")
                sys.exit(1)  # Exit the program entirely if quota is reached
            else:
                raise e  # Raise the error if it's not a handled case
    print(f"Max retries reached for {profile} profile. Exiting program...")
    sys.exit(1)  # Exit the program entirely if retries are exhausted

# Find closest amenities for each house
results = []

for index, house_row in domain_pd_df.iterrows():
    start_coords = (house_row['longitude'], house_row['latitude'])
    results.append({
        'House Name': house_row['name']
    })

    # Collecting the coordinates of each amenity
    amenity_coords = []
    amenity_names = []
    for key, value in amenity_dfs.items():
        amenities_pd_df = value.toPandas()
        closest_amenity = calculate_distance(house_row, amenities_pd_df)
        end_coords = (float(closest_amenity['longitude']), float(closest_amenity['latitude']))
        amenity_coords.append(end_coords)
        amenity_names.append(closest_amenity['name'])

    start_time = time.time()
    walking_distances = get_matrix_distances_with_retries(client, 
                                                          start_coords, 
                                                          amenity_coords, 
                                                          'foot-walking', 
                                                          max_retries=3)
    end_time = time.time()
    if (end_time - start_time) < 1.5:
        time.sleep(1.5-(end_time - start_time))

    start_time = time.time()
    driving_distances = get_matrix_distances_with_retries(client, 
                                                          start_coords, 
                                                          amenity_coords, 
                                                          'driving-car', 
                                                          max_retries=3)
    end_time = time.time()
    if (end_time - start_time) < 1.5:
        time.sleep(1.5-(end_time - start_time))
    
    # Inserting data
    j = 0
    _temp_index = index - data_range[0]
    for key, value in amenity_dfs.items():
        results[_temp_index][f'{key}_name'] = amenity_names[j]
        results[_temp_index][f'{key}_walking'] = walking_distances[j]
        results[_temp_index][f'{key}_driving'] = driving_distances[j]
        j += 1
        
    print(f"Done {index}.")

# Saved to a csv file
results_df = pd.DataFrame(results)
results_df.to_csv(f"../data/raw/domain_and_distance_{data_range}.csv", index=False)

                                                                                

Done 4262.
Done 4263.
Done 4264.
Done 4265.
Done 4266.
Done 4267.
Done 4268.
Done 4269.
Done 4270.
Done 4271.
Done 4272.
Done 4273.
Done 4274.
Done 4275.
Done 4276.
Done 4277.
Done 4278.
Done 4279.
Done 4280.
Done 4281.
Done 4282.
Done 4283.
Done 4284.
Done 4285.
Done 4286.
Done 4287.
Done 4288.
Done 4289.
Done 4290.
Done 4291.
Done 4292.
Done 4293.
Done 4294.
Done 4295.
Done 4296.
Done 4297.
Done 4298.
Done 4299.
Done 4300.
Done 4301.
Done 4302.
Done 4303.
Done 4304.
Done 4305.
Done 4306.
Done 4307.
Done 4308.
Done 4309.
Done 4310.
Done 4311.
Done 4312.
Done 4313.
Done 4314.


                                                                                

Done 4315.
Done 4316.
Done 4317.
Done 4318.
Done 4319.
Done 4320.
Done 4321.
Done 4322.
Done 4323.
Done 4324.
Done 4325.
Done 4326.
Done 4327.
Done 4328.
Done 4329.
Done 4330.
Done 4331.
Done 4332.
Done 4333.
Done 4334.
Done 4335.
Done 4336.
Done 4337.
Done 4338.
Done 4339.
Done 4340.
Done 4341.
Done 4342.
Done 4343.
Done 4344.
Done 4345.
Done 4346.
Done 4347.
Done 4348.
Done 4349.
Done 4350.
Done 4351.
Done 4352.
Done 4353.
Done 4354.
Done 4355.
Done 4356.
Done 4357.
Done 4358.
Done 4359.
Done 4360.
Done 4361.
Done 4362.
Done 4363.
Done 4364.
Done 4365.
Done 4366.
Done 4367.
Done 4368.
Done 4369.
Done 4370.
Done 4371.
Done 4372.
Done 4373.
Done 4374.
Done 4375.
Done 4376.
Done 4377.
Done 4378.
Done 4379.
Done 4380.
Done 4381.
Done 4382.
Done 4383.
Done 4384.
Done 4385.
Done 4386.
Done 4387.
Done 4388.
Done 4389.
Done 4390.
Done 4391.
Done 4392.
Done 4393.
Done 4394.
Done 4395.
Done 4396.
Done 4397.
Done 4398.
Done 4399.
Done 4400.
Done 4401.
Done 4402.
Done 4403.
Done 4404.
Done 4405.

                                                                                

Done 4639.
Done 4640.
Done 4641.
Done 4642.
Done 4643.
Done 4644.
Done 4645.
Done 4646.
Done 4647.
Done 4648.
Done 4649.
Done 4650.
Done 4651.
Done 4652.
Done 4653.
Done 4654.
Done 4655.
Done 4656.
Done 4657.
Done 4658.
Done 4659.
Done 4660.
Done 4661.
Done 4662.
Done 4663.
Done 4664.
Done 4665.
Done 4666.
Done 4667.
Done 4668.
Done 4669.
Done 4670.
Done 4671.
Done 4672.
Done 4673.
Done 4674.
Done 4675.
Done 4676.
Done 4677.
Done 4678.
Done 4679.
Done 4680.
Done 4681.
Done 4682.
Done 4683.


                                                                                

Done 4684.
Done 4685.
Done 4686.
Done 4687.
Done 4688.
Done 4689.
Done 4690.
Done 4691.
Done 4692.
Done 4693.
Done 4694.
Done 4695.
Done 4696.
Done 4697.
Done 4698.
Done 4699.
Done 4700.
Done 4701.
Done 4702.
Done 4703.
Done 4704.
Done 4705.
Done 4706.
Done 4707.
Done 4708.
Done 4709.
Done 4710.
Done 4711.
Done 4712.
Done 4713.
Done 4714.
Done 4715.
Done 4716.
Done 4717.
Done 4718.
Done 4719.
Done 4720.
Done 4721.
Done 4722.
Done 4723.
Done 4724.
Done 4725.
Done 4726.
Done 4727.
Done 4728.
Done 4729.
Done 4730.
Done 4731.
Done 4732.
Done 4733.
Done 4734.
Done 4735.
Done 4736.
Done 4737.
Done 4738.


                                                                                

Done 4739.
Done 4740.
Done 4741.
Done 4742.
Done 4743.
Done 4744.
Done 4745.
Done 4746.
Done 4747.
Done 4748.
Done 4749.
Done 4750.
Done 4751.
Done 4752.
Done 4753.
Done 4754.
Done 4755.
Done 4756.
Done 4757.
Done 4758.
Done 4759.
Done 4760.
Done 4761.
Done 4762.
Done 4763.
Done 4764.
Done 4765.
Done 4766.
Done 4767.
Done 4768.
Done 4769.
Done 4770.


                                                                                

Done 4771.
Done 4772.
Done 4773.
Done 4774.
Done 4775.
Done 4776.
Done 4777.
Done 4778.
Done 4779.
Done 4780.
Done 4781.
Done 4782.
Done 4783.
Done 4784.
Done 4785.
Done 4786.
Done 4787.
Done 4788.
Done 4789.
Done 4790.
Done 4791.
Done 4792.
Done 4793.
Done 4794.
Done 4795.
Done 4796.
Done 4797.
Done 4798.
Done 4799.
Done 4800.
Done 4801.
Done 4802.
Done 4803.
Done 4804.
Done 4805.
Done 4806.
Done 4807.
Done 4808.
Done 4809.
Done 4810.
Done 4811.
Done 4812.
Done 4813.
Done 4814.
Done 4815.
Done 4816.
Done 4817.
Done 4818.
Done 4819.
Done 4820.
Done 4821.
Done 4822.
Done 4823.
Done 4824.
Done 4825.
Done 4826.
Done 4827.


AttributeError: 'ApiError' object has no attribute 'status_code'

In [None]:
break

In [11]:
# Saved to a csv file
results_df = pd.DataFrame(results)
results_df.to_csv(f"../data/raw/domain_and_distance_[4262, 4827].csv", index=False)