# School distance to houses

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pyspark.sql import functions as F  #filtering
import geopandas as gpd
import folium
from folium import GeoJsonTooltip
import openrouteservice
import time
from geopy.distance import geodesic
import numpy as np
import pandas as pd


In [4]:
# Load the cleaned Domain listings (single-part snappy parquet) into pandas
house = pd.read_parquet(
    path='../data/landing/domain_cleaned/part-00000-7dc1f449-c232-484e-b4e6-1dfbe30d6a77-c000.snappy.parquet'
)

In [6]:
# Load the raw school table used for every nearest-school lookup below
school = pd.read_csv(filepath_or_buffer='../data/raw/school.csv')

In [7]:
# Percentage share of each Education_Sector among schools whose School_Type is 'Primary'
school.loc[school['School_Type'] == 'Primary', 'Education_Sector'].value_counts(normalize=True).mul(100)

Education_Sector
Government     67.587209
Catholic       30.087209
Independent     2.325581
Name: proportion, dtype: float64

In [None]:
def find_closest_primary_school_with_progress(house, school):
    """Return the ``School_No`` of the nearest primary school for every house.

    Distances are geodesic (geopy) and are evaluated for every house/school
    pair, so the run time grows with ``len(house) * number of primary schools``.
    A ``Progress: N%`` line is printed each time a new whole percent is reached.

    Args:
        house: DataFrame with ``latitude`` and ``longitude`` columns.
        school: DataFrame with ``School_Type``, ``School_No``, ``latitude``
            and ``longitude`` columns.

    Returns:
        NumPy array with one ``School_No`` per house, in the row order of
        ``house``.
    """
    # Only rows typed exactly 'Primary' are candidates; positions restart at 0
    is_primary = school['School_Type'] == 'Primary'
    candidates = school.loc[is_primary].reset_index(drop=True)

    # (lat, lon) pairs, the order geopy's geodesic expects
    house_points = house[['latitude', 'longitude']].to_numpy()
    school_points = candidates[['latitude', 'longitude']].to_numpy()

    house_count = len(house_points)
    nearest_positions = []
    reported_percent = 0

    for done, house_point in enumerate(house_points, start=1):
        # Distance from this house to every candidate school, in kilometres
        km_to_schools = np.array(
            [geodesic(house_point, school_point).kilometers for school_point in school_points]
        )
        nearest_positions.append(np.argmin(km_to_schools))

        # Report each whole percent once
        percent_done = int(done / house_count * 100)
        if percent_done > reported_percent:
            print(f"Progress: {percent_done}%")
            reported_percent = percent_done

    # Translate candidate positions back to school numbers
    return candidates.iloc[nearest_positions]['School_No'].values

# Store the School_No of the nearest primary school for every house.
# The function returns a plain array (one entry per house, in row order),
# so the values are assigned by position rather than by index label.
house['closest_primary_school'] = find_closest_primary_school_with_progress(house, school)


## Car route distance to closest primary school

### First 1000

In [None]:
import os
import time
import openrouteservice
import numpy as np

# Read the ORS API key from the environment instead of committing it to the
# notebook. NOTE(review): the key that used to be hard-coded here has been
# exposed in source control and should be revoked.
api_key = os.environ.get('ORS_API_KEY')
if not api_key:
    raise RuntimeError("Set the ORS_API_KEY environment variable to your openrouteservice API key")
client = openrouteservice.Client(key=api_key)

# Function to calculate the car route distance with progress tracking and rate limit handling
def calculate_route_distance_for_first_1000(house, school, api_limit=1940, batch_size=40):
    """Return driving distances (km) from the first 1000 houses to their closest primary school.

    Each house is routed to the school named in its ``closest_primary_school``
    column using the notebook-level ORS ``client`` (``driving-car`` profile).
    Houses are processed in batches of ``batch_size`` with a 60 second pause
    between batches. A house whose school cannot be found, or whose request
    fails, is reported with ``print`` and left as ``None`` so that one bad row
    does not abort a long rate-limited run.

    Args:
        house: DataFrame with ``longitude``, ``latitude`` and
            ``closest_primary_school`` columns; rows are read by position.
        school: DataFrame with ``School_No``, ``longitude`` and ``latitude``
            columns.
        api_limit: Maximum number of houses to attempt in this call.
        batch_size: Number of houses attempted between pauses.

    Returns:
        List of length ``min(1000, len(house))`` holding the distance in
        kilometres, or ``None`` for houses that were not attempted or failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop could never
            make progress).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_houses = min(1000, len(house))
    distances = [None] * total_houses

    processed_houses = 0
    progress_checkpoint = 1  # next whole percent to report
    remaining_requests = api_limit

    while processed_houses < total_houses and remaining_requests > 0:
        # Never attempt more houses than the remaining request budget allows
        batch_houses = min(batch_size, total_houses - processed_houses, remaining_requests)

        for house_idx in range(processed_houses, processed_houses + batch_houses):
            row = house.iloc[house_idx]
            closest_school_id = row['closest_primary_school']

            # First school row with this number, as (longitude, latitude)
            school_matches = school.loc[
                school['School_No'] == closest_school_id, ['longitude', 'latitude']
            ].values

            if len(school_matches) == 0:
                print(f"Error calculating distance for house {house_idx}: "
                      f"school {closest_school_id} not found in school data")
            else:
                try:
                    # ORS expects (lon, lat); plain floats keep the payload JSON-serialisable
                    house_coords = (float(row['longitude']), float(row['latitude']))
                    school_coords = tuple(float(value) for value in school_matches[0])
                    route = client.directions(
                        coordinates=[house_coords, school_coords],
                        profile='driving-car',
                        format='geojson'
                    )
                    # Distance of the single route segment, in metres
                    distance_meters = route['features'][0]['properties']['segments'][0]['distance']
                    distances[house_idx] = distance_meters / 1000  # Convert to kilometers
                except Exception as e:
                    # Best effort: report the failure and leave this house as None
                    print(f"Error calculating distance for house {house_idx}: {e}")

            # Report the percentage actually reached, even when one house is worth more than 1%
            progress = (house_idx + 1) / total_houses * 100
            if progress >= progress_checkpoint:
                print(f"Progress: {int(progress)}% completed")
                progress_checkpoint = int(progress) + 1

        processed_houses += batch_houses
        remaining_requests -= batch_houses

        # Pause only when another batch will actually be sent
        if processed_houses < total_houses and remaining_requests > 0:
            print("Waiting for 1 minute to comply with the API rate limit...")
            time.sleep(60)

    return distances

# Create the 'car_route_distance_km' column, set to None for every row
house['car_route_distance_km'] = None

# Route the first 1000 houses (fewer if the frame is shorter); failed or
# unattempted houses come back as None
calculated_distances = calculate_route_distance_for_first_1000(house, school)

# Write the results into index labels 0..999 (.loc slices are label-based and
# inclusive). The function reads rows by position, so this assumes `house`
# still has the default 0..n-1 index -- TODO confirm
house.loc[:999, 'car_route_distance_km'] = calculated_distances

### next 500

In [None]:
# Read the ORS API key from the environment instead of committing it to the
# notebook (`os` is imported in the first cell). NOTE(review): the key that
# used to be hard-coded here has been exposed and should be revoked.
api_key = os.environ.get('ORS_API_KEY')
if not api_key:
    raise RuntimeError("Set the ORS_API_KEY environment variable to your openrouteservice API key")
client = openrouteservice.Client(key=api_key)

# Function to calculate the car route distance for the next 500 rows with rate limit handling
def calculate_route_distance_for_next_500(house, school, start_index=1000, end_index=1500, api_limit=1000, batch_size=40):
    """Return driving distances (km) for houses at positions ``start_index`` to ``end_index - 1``.

    Each house is routed to the school named in its ``closest_primary_school``
    column using the notebook-level ORS ``client`` (``driving-car`` profile).
    Houses are processed in batches of ``batch_size`` with a 60 second pause
    between batches. A house whose school cannot be found, or whose request
    fails, is reported with ``print`` and left as ``None`` so that one bad row
    does not abort a long rate-limited run.

    Args:
        house: DataFrame with ``longitude``, ``latitude`` and
            ``closest_primary_school`` columns; rows are read by position.
        school: DataFrame with ``School_No``, ``longitude`` and ``latitude``
            columns.
        start_index: Position of the first house to route.
        end_index: Position one past the last house to route.
        api_limit: Maximum number of houses to attempt in this call.
        batch_size: Number of houses attempted between pauses.

    Returns:
        List with one entry per house in the (clamped) range, holding the
        distance in kilometres, or ``None`` for houses that were not attempted
        or failed. Entry ``k`` belongs to the house at ``start_index + k``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop could never
            make progress).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Clamp the range to the rows that actually exist; an empty range yields []
    total_houses = max(0, min(end_index - start_index, len(house) - start_index))
    distances = [None] * total_houses

    processed_houses = 0
    progress_checkpoint = 1  # next whole percent to report
    remaining_requests = api_limit

    while processed_houses < total_houses and remaining_requests > 0:
        # Never attempt more houses than the remaining request budget allows
        batch_houses = min(batch_size, total_houses - processed_houses, remaining_requests)

        for offset in range(processed_houses, processed_houses + batch_houses):
            house_idx = start_index + offset
            row = house.iloc[house_idx]
            closest_school_id = row['closest_primary_school']

            # First school row with this number, as (longitude, latitude)
            school_matches = school.loc[
                school['School_No'] == closest_school_id, ['longitude', 'latitude']
            ].values

            if len(school_matches) == 0:
                print(f"Error calculating distance for house {house_idx}: "
                      f"school {closest_school_id} not found in school data")
            else:
                try:
                    # ORS expects (lon, lat); plain floats keep the payload JSON-serialisable
                    house_coords = (float(row['longitude']), float(row['latitude']))
                    school_coords = tuple(float(value) for value in school_matches[0])
                    route = client.directions(
                        coordinates=[house_coords, school_coords],
                        profile='driving-car',
                        format='geojson'
                    )
                    # Distance of the single route segment, in metres
                    distance_meters = route['features'][0]['properties']['segments'][0]['distance']
                    distances[offset] = distance_meters / 1000  # Convert to kilometers
                except Exception as e:
                    # Best effort: report the failure and leave this house as None
                    print(f"Error calculating distance for house {house_idx}: {e}")

            # Report the percentage actually reached, even when one house is worth more than 1%
            progress = (offset + 1) / total_houses * 100
            if progress >= progress_checkpoint:
                print(f"Progress: {int(progress)}% completed")
                progress_checkpoint = int(progress) + 1

        processed_houses += batch_houses
        remaining_requests -= batch_houses

        # Pause only when another batch will actually be sent
        if processed_houses < total_houses and remaining_requests > 0:
            print("Waiting for 1 minute to comply with the API rate limit...")
            time.sleep(60)

    return distances

# Route the houses at positions 1000..1499; failed or unattempted houses come
# back as None
calculated_distances = calculate_route_distance_for_next_500(house, school, start_index=1000, end_index=1500)

# Write the results into index labels 1000..1499 (.loc slices are label-based
# and inclusive). The function reads rows by position, so this assumes `house`
# still has the default 0..n-1 index -- TODO confirm
house.loc[1000:1499, 'car_route_distance_km'] = calculated_distances

### Next 1000

In [None]:
# Read the ORS API key from the environment instead of committing it to the
# notebook (`os` is imported in the first cell). To continue with a different
# key, update ORS_API_KEY before running this cell. NOTE(review): the key that
# used to be hard-coded here has been exposed and should be revoked.
api_key = os.environ.get('ORS_API_KEY')
if not api_key:
    raise RuntimeError("Set the ORS_API_KEY environment variable to your openrouteservice API key")
client = openrouteservice.Client(key=api_key)

# Function to calculate the car route distance for the next 1000 rows with rate limit handling
def calculate_route_distance_for_next_1000(house, school, start_index=2000, end_index=3000, api_limit=1500, batch_size=40):
    """Return driving distances (km) for houses at positions ``start_index`` to ``end_index - 1``.

    Each house is routed to the school named in its ``closest_primary_school``
    column using the notebook-level ORS ``client`` (``driving-car`` profile).
    Houses are processed in batches of ``batch_size`` with a 60 second pause
    between batches. A house whose school cannot be found, or whose request
    fails, is reported with ``print`` and left as ``None`` so that one bad row
    does not abort a long rate-limited run.

    Args:
        house: DataFrame with ``longitude``, ``latitude`` and
            ``closest_primary_school`` columns; rows are read by position.
        school: DataFrame with ``School_No``, ``longitude`` and ``latitude``
            columns.
        start_index: Position of the first house to route (defaults to 2000).
        end_index: Position one past the last house to route (defaults to 3000).
        api_limit: Maximum number of houses to attempt in this call.
        batch_size: Number of houses attempted between pauses.

    Returns:
        List with one entry per house in the (clamped) range, holding the
        distance in kilometres, or ``None`` for houses that were not attempted
        or failed. Entry ``k`` belongs to the house at ``start_index + k``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop could never
            make progress).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Clamp the range to the rows that actually exist; an empty range yields []
    total_houses = max(0, min(end_index - start_index, len(house) - start_index))
    distances = [None] * total_houses

    processed_houses = 0
    progress_checkpoint = 1  # next whole percent to report
    remaining_requests = api_limit

    while processed_houses < total_houses and remaining_requests > 0:
        # Never attempt more houses than the remaining request budget allows
        batch_houses = min(batch_size, total_houses - processed_houses, remaining_requests)

        for offset in range(processed_houses, processed_houses + batch_houses):
            house_idx = start_index + offset
            row = house.iloc[house_idx]
            closest_school_id = row['closest_primary_school']

            # First school row with this number, as (longitude, latitude)
            school_matches = school.loc[
                school['School_No'] == closest_school_id, ['longitude', 'latitude']
            ].values

            if len(school_matches) == 0:
                print(f"Error calculating distance for house {house_idx}: "
                      f"school {closest_school_id} not found in school data")
            else:
                try:
                    # ORS expects (lon, lat); plain floats keep the payload JSON-serialisable
                    house_coords = (float(row['longitude']), float(row['latitude']))
                    school_coords = tuple(float(value) for value in school_matches[0])
                    route = client.directions(
                        coordinates=[house_coords, school_coords],
                        profile='driving-car',
                        format='geojson'
                    )
                    # Distance of the single route segment, in metres
                    distance_meters = route['features'][0]['properties']['segments'][0]['distance']
                    distances[offset] = distance_meters / 1000  # Convert to kilometers
                except Exception as e:
                    # Best effort: report the failure and leave this house as None
                    print(f"Error calculating distance for house {house_idx}: {e}")

            # Report the percentage actually reached, even when one house is worth more than 1%
            progress = (offset + 1) / total_houses * 100
            if progress >= progress_checkpoint:
                print(f"Progress: {int(progress)}% completed")
                progress_checkpoint = int(progress) + 1

        processed_houses += batch_houses
        remaining_requests -= batch_houses

        # Pause only when another batch will actually be sent
        if processed_houses < total_houses and remaining_requests > 0:
            print("Waiting for 1 minute to comply with the API rate limit...")
            time.sleep(60)

    return distances

# Route the houses at positions 2000..2999; failed or unattempted houses come
# back as None.
# NOTE(review): the previous fixed-range cell stops at 1499, so rows 1500..1999
# are not covered by these cells -- presumably left to the resume cell below
calculated_distances = calculate_route_distance_for_next_1000(house, school, start_index=2000, end_index=3000)

# Write the results into index labels 2000..2999 (.loc slices are label-based
# and inclusive). The function reads rows by position, so this assumes `house`
# still has the default 0..n-1 index -- TODO confirm
house.loc[2000:2999, 'car_route_distance_km'] = calculated_distances

In [None]:
# Checkpoint the frame (without the index column) so a later session can resume from it
house.to_csv(path_or_buf='../data/curated/house_primary_school.csv', index=False)

In [None]:
# Read the ORS API key from the environment instead of committing it to the
# notebook (`os` is imported in the first cell). NOTE(review): the key that
# used to be hard-coded here has been exposed and should be revoked.
api_key = os.environ.get('ORS_API_KEY')
if not api_key:
    raise RuntimeError("Set the ORS_API_KEY environment variable to your openrouteservice API key")
client = openrouteservice.Client(key=api_key)

# Load the DataFrame when resuming
house = pd.read_csv('../data/curated/house_primary_school.csv')

# Row positions that still lack a distance. Positions (not index labels) are
# used because the routing function reads rows with .iloc.
missing_positions = np.flatnonzero(house['car_route_distance_km'].isna().to_numpy())

if missing_positions.size == 0:
    # Previously this case produced a NaN start index and crashed the cell
    print("All houses already have a car route distance; nothing left to calculate.")
else:
    # Process up to 500 rows starting at the first row without a distance
    next_start_index = int(missing_positions[0])
    next_end_index = min(next_start_index + 500, len(house))

    calculated_distances = calculate_route_distance_for_next_500(
        house, school, start_index=next_start_index, end_index=next_end_index
    )

    # None becomes NaN; where a request failed this time, keep any distance
    # that was already stored for that row instead of overwriting it.
    batch_rows = house.index[next_start_index:next_end_index]
    new_distances = pd.Series(calculated_distances, index=batch_rows, dtype='float64')
    house.loc[batch_rows, 'car_route_distance_km'] = new_distances.combine_first(
        house.loc[batch_rows, 'car_route_distance_km']
    )

    # Save the updated DataFrame again after processing
    house.to_csv('../data/curated/house_primary_school.csv', index=False)

In [None]:
# Keep only rows where neither latitude nor longitude is 0, then persist the result
house = house.loc[house['latitude'].ne(0) & house['longitude'].ne(0)]
house.to_csv(path_or_buf='../data/curated/house_primary_school.csv', index=False)

In [None]:
# Reload the curated house table (with closest primary school and route distances) from disk
house = pd.read_csv(filepath_or_buffer='../data/curated/house_primary_school.csv')

## Linear distance to closest independent school

In [8]:
# Percentage share of each Education_Sector across all schools in the table
school['Education_Sector'].value_counts(normalize=True).mul(100)

Education_Sector
Government     63.339552
Catholic       24.626866
Independent    12.033582
Name: proportion, dtype: float64

In [None]:
def find_closest_catholic_independent_school_with_progress(house, school, distance_fn=None):
    """Find the nearest Independent-sector school and its distance for every house.

    Despite the function name, only schools whose ``Education_Sector`` is
    exactly ``'Independent'`` are considered, and no ``School_Type`` filter is
    applied. Every house/school pair is evaluated, and a ``Progress: N%`` line
    is printed each time a new whole percent is reached.

    Args:
        house: DataFrame with ``latitude`` and ``longitude`` columns.
        school: DataFrame with ``Education_Sector``, ``School_No``,
            ``latitude`` and ``longitude`` columns.
        distance_fn: Optional callable ``(house_latlon, school_latlon) ->
            distance``. Defaults to the geopy geodesic distance in kilometres;
            pass a cheaper metric to trade accuracy for speed.

    Returns:
        DataFrame indexed like ``house`` with columns ``closest_school`` (the
        ``School_No``) and ``distance_to_closest_school``. Sharing the index
        with ``house`` keeps index-aligned column assignment correct even when
        ``house`` does not have a default 0..n-1 index.

    Raises:
        ValueError: If there are houses to process but no Independent schools.
    """
    if distance_fn is None:
        def measure(origin, destination):
            return geodesic(origin, destination).kilometers
    else:
        measure = distance_fn

    # Independent-sector schools only; positions restart at 0 for iloc below
    filtered_schools = school[school['Education_Sector'] == 'Independent'].reset_index(drop=True)

    # (lat, lon) pairs, the order geopy's geodesic expects
    house_coords = house[['latitude', 'longitude']].to_numpy()
    school_coords = filtered_schools[['latitude', 'longitude']].to_numpy()

    total_houses = len(house_coords)
    if total_houses > 0 and len(school_coords) == 0:
        raise ValueError("No schools with Education_Sector == 'Independent' to measure against")

    closest_school_distances = []
    closest_school_indices = []
    last_percent_shown = 0

    for i, house_coord in enumerate(house_coords):
        # Distance from this house to every candidate school
        distances = np.array([measure(house_coord, school_coord) for school_coord in school_coords])

        closest_school_index = int(np.argmin(distances))
        closest_school_indices.append(closest_school_index)
        closest_school_distances.append(distances[closest_school_index])

        # Report each whole percent once
        percent_done = int((i + 1) / total_houses * 100)
        if percent_done > last_percent_shown:
            print(f"Progress: {percent_done}%")
            last_percent_shown = percent_done

    # Translate candidate positions back to school numbers
    closest_school_names = filtered_schools.iloc[closest_school_indices]['School_No'].values

    return pd.DataFrame(
        {
            'closest_school': closest_school_names,
            'distance_to_closest_school': closest_school_distances,
        },
        index=house.index,
    )

# Find the closest Independent-sector school for each house (the function
# filters on Education_Sector == 'Independent' only)
closest_schools_df = find_closest_catholic_independent_school_with_progress(house, school)

# The result has one row per house in house order. Assign by position so the
# values cannot be misaligned (or turned into NaN) when `house` does not have
# a default 0..n-1 index.
house['closest_independent_school'] = closest_schools_df['closest_school'].to_numpy()
house['distance_to_closest_independent_school'] = closest_schools_df['distance_to_closest_school'].to_numpy()


In [None]:
# Correlation matrix between the distance to the closest primary school and the extracted price.
# NOTE(review): no cell in this notebook creates 'distance_to_closest_primary_school_km'
# (the routing cells write 'car_route_distance_km') -- confirm the column exists in the
# loaded CSV, otherwise this raises KeyError
house[['distance_to_closest_primary_school_km', 'extracted_price']].corr()

In [None]:
# Correlation matrix between the distance to the closest independent school and the extracted price
house[['distance_to_closest_independent_school', 'extracted_price']].corr()