Find each property's proximity to the nearest school using the openrouteservice API

In [13]:
import pandas as pd
import requests
from geopy.distance import geodesic
from pathlib import Path
import time
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging.
# force=True replaces any handlers already attached to the root logger; without
# it, basicConfig silently does nothing when logging was configured earlier in
# the same kernel and the messages end up in whichever file was set up first.
logging.basicConfig(filename='process_log.log', level=logging.INFO,
                    format='%(asctime)s - %(message)s', force=True)

# Load the property and school data
curated_dir = Path.cwd().parent / 'data' / 'curated'
school_df = pd.read_csv(curated_dir / 'school_location_cleaned_2023.csv')
property_df = pd.read_csv(curated_dir / 'rent_cleaned.csv')

# Drop rows with null coordinates in school_df and property_df
school_df = school_df.dropna(subset=['latitude', 'longitude'])
property_df = property_df.dropna(subset=['latitude', 'longitude'])

# Split property_df into partitions of 600 rows each (the last one may have fewer).
# NOTE(review): 600 properties x 3 route requests is presumably chosen to stay
# under the per-key daily quota - confirm against the current ORS plan.
partitions = [property_df[i:i+600] for i in range(0, len(property_df), 600)]

# Read the openrouteservice API keys (one per partition) from the environment
# instead of committing them to the notebook. Keys that were previously stored
# in this file should be treated as compromised and rotated.
import os

api_keys = [
    key.strip()
    for key in os.environ.get('ORS_SCHOOL_API_KEYS', '').split(',')
    if key.strip()
]
if not api_keys:
    raise RuntimeError(
        "No openrouteservice API keys found: set the ORS_SCHOOL_API_KEYS environment "
        "variable to a comma-separated list of keys (one per 600-property partition)."
    )

# Define the top_3_nearest function to calculate the nearest schools
def top_3_nearest(latitude, longitude, location_df):
    """Return the three rows of ``location_df`` closest to a point.

    Distances are geodesic (straight-line over the ellipsoid) and are only used
    to shortlist candidates.

    Args:
        latitude: Latitude of the reference point.
        longitude: Longitude of the reference point.
        location_df: DataFrame with ``latitude`` and ``longitude`` columns.
            It is NOT modified.

    Returns:
        A new DataFrame holding at most three rows of ``location_df``, sorted by
        ascending distance, with an extra ``distance`` column in kilometres.
        Rows with a missing coordinate get a distance of ``inf`` and therefore
        sort last.
    """
    if location_df.empty:
        # DataFrame.apply(axis=1) on an empty frame does not yield a Series of
        # distances, so short-circuit with an empty, correctly shaped result.
        return location_df.assign(distance=pd.Series(dtype='float64'))

    origin = (latitude, longitude)

    def calculate_distance(row):
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            return float('inf')
        return geodesic(origin, (row['latitude'], row['longitude'])).kilometers

    distances = location_df.apply(calculate_distance, axis=1)
    # assign() works on a copy, so the caller's frame (typically a shared
    # module-level table) never gains or overwrites a 'distance' column.
    return location_df.assign(distance=distances).sort_values(by='distance').head(3)

# Setup requests session with retry strategy
def create_session_with_retries():
    """Build a ``requests.Session`` that retries failed HTTPS requests.

    Returns:
        A session whose ``https://`` adapter retries up to five times, with
        exponential backoff (factor 1), on HTTP 429/500/502/503/504. Because
        ``raise_on_status`` is False, the retry layer itself does not raise on
        those status codes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        raise_on_status=False,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    http_session.mount('https://', https_adapter)
    return http_session

# Function to process one partition of properties with progress tracking
def process_partition(property_df_partition, api_key, partition_index, session):
    """Annotate each property with the driving time and distance to a nearby school.

    For every row, the three geodesically nearest schools are shortlisted with
    the module-level ``top_3_nearest`` against the module-level ``school_df``.
    A driving route to each is requested from openrouteservice, and the route
    with the shortest duration is recorded.

    Args:
        property_df_partition: DataFrame with ``latitude`` and ``longitude``
            columns. It is modified in place (two result columns are added).
        api_key: openrouteservice API key used for every request of this partition.
        partition_index: Number of the partition, used only in log/progress messages.
        session: Object exposing ``get(url, headers=..., timeout=...)``, e.g. a
            ``requests.Session``.

    Returns:
        ``property_df_partition`` with ``time_to_nearest_school(min)`` and
        ``distance_to_nearest_school(km)`` filled in. Rows for which no route
        could be obtained keep ``None`` in both columns.
    """
    property_df_partition['time_to_nearest_school(min)'] = None
    property_df_partition['distance_to_nearest_school(km)'] = None

    request_headers = {'Accept': 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8'}

    total_rows = len(property_df_partition)  # Total number of rows to process
    for local_index, (index, row) in enumerate(property_df_partition.iterrows(), start=1):  # local_index starts from 1
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            logging.warning(f"Skipping row {index} in partition {partition_index} due to missing latitude or longitude")
            continue

        top_schools = top_3_nearest(row['latitude'], row['longitude'], school_df)

        # None means "no successful route yet"; this keeps a failed lookup from
        # being stored as an infinite travel time/distance.
        min_duration = None
        min_distance = None
        start = f"{row['longitude']},{row['latitude']}"  # the API expects lon,lat order

        for _, school in top_schools.iterrows():
            end = f"{school['longitude']},{school['latitude']}"
            api_url = f"https://api.openrouteservice.org/v2/directions/driving-car?api_key={api_key}&start={start}&end={end}"

            try:
                # The timeout stops a stalled connection from hanging the whole run.
                response = session.get(api_url, headers=request_headers, timeout=30)
                response.raise_for_status()
                data = response.json()
                travel_info = data['features'][0]['properties']['segments'][0]
                duration = travel_info['duration'] / 60  # Convert from seconds to minutes
                distance = travel_info['distance'] / 1000  # Convert from meters to kilometers

                if min_duration is None or duration < min_duration:
                    min_duration = duration
                    min_distance = distance
            except requests.exceptions.RequestException as e:
                # The request URL (and therefore the key) can appear in the
                # exception text, so scrub it before it reaches the log file.
                message = str(e)
                if api_key:
                    message = message.replace(api_key, '<redacted>')
                logging.error(f"Error on row {index} in partition {partition_index}: {message}")
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # A response without the expected route structure must not
                # abort the remaining properties.
                logging.error(f"Unexpected response on row {index} in partition {partition_index}: {e!r}")
            finally:
                # Pause after every attempt, including failed ones, so errors do
                # not speed up the request rate.
                # NOTE(review): 1.51 s presumably targets a 40 requests/minute limit - confirm.
                time.sleep(1.51)

        # Record the nearest travel time and distance
        if min_duration is None:
            logging.warning(f"No route found for row {index} in partition {partition_index}")
        else:
            property_df_partition.at[index, 'time_to_nearest_school(min)'] = min_duration
            property_df_partition.at[index, 'distance_to_nearest_school(km)'] = min_distance

        # Progress tracking: Show the current entry out of total for this partition
        print(f"Partition {partition_index}: Processing property {local_index}/{total_rows}", end='\r')

    return property_df_partition

# Process each partition with a different API key and track progress.
# Check the key count up front: running out of keys part-way through would
# otherwise raise IndexError after hours of API calls and discard all results.
if len(partitions) > len(api_keys):
    raise ValueError(
        f"{len(partitions)} partitions but only {len(api_keys)} API keys; "
        "provide one key per partition"
    )

processed_partitions = []
# The context manager closes the session's pooled connections when the run ends.
with create_session_with_retries() as session:
    for i, partition in enumerate(partitions):
        logging.info(f"Processing partition {i+1} with API key {i+1}")
        # Work on a copy so the slices of property_df are not modified.
        processed_partition = process_partition(partition.copy(), api_keys[i], i+1, session)
        processed_partitions.append(processed_partition)
        logging.info(f"Finished processing partition {i+1}")

# Concatenate once at the end; pd.concat([]) raises, so fall back to an empty frame.
final_df = pd.concat(processed_partitions) if processed_partitions else pd.DataFrame()

# Save the final DataFrame to a CSV file
final_df.to_csv(Path.cwd().parent / 'data' / 'curated' / 'property_school.csv', index=False)

logging.info("Processing complete and data saved to property_school.csv")
print("Processing proximity complete and data saved to property_school.csv")


Processing proximity complete and data saved to property_school.csv


In [11]:
# Size of the last (possibly shorter) partition. Index from the end so the
# check also works when there are not exactly five partitions.
print(len(partitions[-1]) if partitions else 0)

562


Find each property's proximity to the nearest train station using the openrouteservice API

In [16]:
import pandas as pd
import requests
from geopy.distance import geodesic
from pathlib import Path
import time
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging.
# force=True is required here: once logging has been configured earlier in the
# kernel, a plain basicConfig call is a no-op and
# process_log_train.log would never be written.
logging.basicConfig(filename='process_log_train.log', level=logging.INFO,
                    format='%(asctime)s - %(message)s', force=True)

# Load the property and train station data
curated_dir = Path.cwd().parent / 'data' / 'curated'
train_df = pd.read_csv(curated_dir / 'train_station_cleaned.csv')
property_df = pd.read_csv(curated_dir / 'rent_cleaned.csv')

# Drop rows with null coordinates in train_df and property_df
train_df = train_df.dropna(subset=['latitude', 'longitude'])
property_df = property_df.dropna(subset=['latitude', 'longitude'])

# Split property_df into partitions of 600 rows each (the last one may have fewer).
# NOTE(review): 600 properties x 3 route requests is presumably chosen to stay
# under the per-key daily quota - confirm against the current ORS plan.
partitions = [property_df[i:i+600] for i in range(0, len(property_df), 600)]

# Read the openrouteservice API keys (one per partition) from the environment
# instead of committing them to the notebook. Keys that were previously stored
# in this file should be treated as compromised and rotated.
import os

api_keys = [
    key.strip()
    for key in os.environ.get('ORS_TRAIN_API_KEYS', '').split(',')
    if key.strip()
]
if not api_keys:
    raise RuntimeError(
        "No openrouteservice API keys found: set the ORS_TRAIN_API_KEYS environment "
        "variable to a comma-separated list of keys (one per 600-property partition)."
    )

# Define the top_3_nearest function to calculate the nearest train stations
def top_3_nearest(latitude, longitude, location_df):
    """Shortlist the three rows of ``location_df`` nearest to a point.

    Args:
        latitude: Latitude of the reference point.
        longitude: Longitude of the reference point.
        location_df: DataFrame with ``latitude`` and ``longitude`` columns.
            It is left untouched.

    Returns:
        A new DataFrame of at most three rows, ordered by ascending geodesic
        distance, with that distance (in kilometres) in a ``distance`` column.
        Rows lacking a coordinate are given ``inf`` and so rank last.
    """
    if location_df.empty:
        # apply(axis=1) on an empty frame does not produce a distance Series;
        # return an empty result that still carries the 'distance' column.
        return location_df.assign(distance=pd.Series(dtype='float64'))

    reference_point = (latitude, longitude)

    def calculate_distance(row):
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            return float('inf')
        return geodesic(reference_point, (row['latitude'], row['longitude'])).kilometers

    distance_km = location_df.apply(calculate_distance, axis=1)
    # Build the result on a copy rather than writing a 'distance' column into
    # the caller's (usually module-level) table on every call.
    ranked = location_df.assign(distance=distance_km).sort_values(by='distance')
    return ranked.head(3)

# Setup requests session with retry strategy
def create_session_with_retries():
    """Create a ``requests.Session`` with automatic retries on HTTPS.

    Returns:
        A session whose ``https://`` transport retries a request up to five
        times, backing off exponentially (factor 1), when the server answers
        429, 500, 502, 503 or 504. ``raise_on_status=False`` means the retry
        layer does not raise for those status codes.
    """
    retry_settings = {
        'total': 5,
        'backoff_factor': 1,
        'status_forcelist': [429, 500, 502, 503, 504],
        'raise_on_status': False,
    }
    client = requests.Session()
    client.mount('https://', HTTPAdapter(max_retries=Retry(**retry_settings)))
    return client

# Function to process one partition of properties with progress tracking
def process_partition(property_df_partition, api_key, partition_index, session):
    """Annotate each property with the driving time and distance to a nearby train station.

    For every row, the three geodesically nearest stations are shortlisted with
    the module-level ``top_3_nearest`` against the module-level ``train_df``.
    A driving route to each is requested from openrouteservice, and the route
    with the shortest duration is recorded.

    Args:
        property_df_partition: DataFrame with ``latitude`` and ``longitude``
            columns. It is modified in place (two result columns are added).
        api_key: openrouteservice API key used for every request of this partition.
        partition_index: Number of the partition, used only in log/progress messages.
        session: Object exposing ``get(url, headers=..., timeout=...)``, e.g. a
            ``requests.Session``.

    Returns:
        ``property_df_partition`` with ``time_to_nearest_train(min)`` and
        ``distance_to_nearest_train(km)`` filled in. Rows for which no route
        could be obtained keep ``None`` in both columns.
    """
    property_df_partition['time_to_nearest_train(min)'] = None
    property_df_partition['distance_to_nearest_train(km)'] = None

    request_headers = {'Accept': 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8'}

    total_rows = len(property_df_partition)  # Total number of rows to process
    for local_index, (index, row) in enumerate(property_df_partition.iterrows(), start=1):  # local_index starts from 1
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            logging.warning(f"Skipping row {index} in partition {partition_index} due to missing latitude or longitude")
            continue

        top_trains = top_3_nearest(row['latitude'], row['longitude'], train_df)

        # None means "no successful route yet"; this keeps a failed lookup from
        # being stored as an infinite travel time/distance.
        min_duration = None
        min_distance = None
        start = f"{row['longitude']},{row['latitude']}"  # the API expects lon,lat order

        for _, train in top_trains.iterrows():
            end = f"{train['longitude']},{train['latitude']}"
            api_url = f"https://api.openrouteservice.org/v2/directions/driving-car?api_key={api_key}&start={start}&end={end}"

            try:
                # The timeout stops a stalled connection from hanging the whole run.
                response = session.get(api_url, headers=request_headers, timeout=30)
                response.raise_for_status()
                data = response.json()
                travel_info = data['features'][0]['properties']['segments'][0]
                duration = travel_info['duration'] / 60  # Convert from seconds to minutes
                distance = travel_info['distance'] / 1000  # Convert from meters to kilometers

                if min_duration is None or duration < min_duration:
                    min_duration = duration
                    min_distance = distance
            except requests.exceptions.RequestException as e:
                # The request URL (and therefore the key) can appear in the
                # exception text, so scrub it before it reaches the log file.
                message = str(e)
                if api_key:
                    message = message.replace(api_key, '<redacted>')
                logging.error(f"Error on row {index} in partition {partition_index}: {message}")
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # A response without the expected route structure must not
                # abort the remaining properties.
                logging.error(f"Unexpected response on row {index} in partition {partition_index}: {e!r}")
            finally:
                # Pause after every attempt, including failed ones, so errors do
                # not speed up the request rate.
                # NOTE(review): 1.51 s presumably targets a 40 requests/minute limit - confirm.
                time.sleep(1.51)

        # Record the nearest travel time and distance
        if min_duration is None:
            logging.warning(f"No route found for row {index} in partition {partition_index}")
        else:
            property_df_partition.at[index, 'time_to_nearest_train(min)'] = min_duration
            property_df_partition.at[index, 'distance_to_nearest_train(km)'] = min_distance

        # Progress tracking: Show the current entry out of total for this partition
        print(f"Partition {partition_index}: Processing property {local_index}/{total_rows}", end='\r')

    return property_df_partition

# Process each partition with a different API key and track progress.
# Check the key count up front: running out of keys part-way through would
# otherwise raise IndexError after hours of API calls and discard all results.
if len(partitions) > len(api_keys):
    raise ValueError(
        f"{len(partitions)} partitions but only {len(api_keys)} API keys; "
        "provide one key per partition"
    )

processed_partitions = []
# The context manager closes the session's pooled connections when the run ends.
with create_session_with_retries() as session:
    for i, partition in enumerate(partitions):
        logging.info(f"Processing partition {i+1} with API key {i+1}")
        # Work on a copy so the slices of property_df are not modified.
        processed_partition = process_partition(partition.copy(), api_keys[i], i+1, session)
        processed_partitions.append(processed_partition)
        logging.info(f"Finished processing partition {i+1}")

# Concatenate once at the end; pd.concat([]) raises, so fall back to an empty frame.
final_df = pd.concat(processed_partitions) if processed_partitions else pd.DataFrame()

# Save the final DataFrame to a CSV file
final_df.to_csv(Path.cwd().parent / 'data' / 'curated' / 'property_train.csv', index=False)

logging.info("Processing complete and data saved to property_train.csv")
print("Processing proximity complete and data saved to property_train.csv")


Processing proximity complete and data saved to property_train.csv


In [17]:
import pandas as pd
from pathlib import Path

# Load the property_school and property_train data
curated_dir = Path.cwd().parent / 'data' / 'curated'
property_school_df = pd.read_csv(curated_dir / 'property_school.csv')
property_train_df = pd.read_csv(curated_dir / 'property_train.csv')

# The two files are joined purely by row position below, so they must contain
# the same number of rows; otherwise concat would silently pad the shorter one
# with NaN and attach school values to the wrong properties.
if len(property_school_df) != len(property_train_df):
    raise ValueError(
        f"Row count mismatch: property_school.csv has {len(property_school_df)} rows, "
        f"property_train.csv has {len(property_train_df)} rows"
    )

# Extract 'distance_to_nearest_school(km)' and 'time_to_nearest_school(min)' columns from property_school_df
school_columns = property_school_df[['distance_to_nearest_school(km)', 'time_to_nearest_school(min)']]

# Rename the 'time_to_nearest_train(min)' and 'distance_to_nearest_train(km)' columns in the property_train dataframe
# (reassigned rather than renamed in place, so re-running the cell is harmless)
property_train_df = property_train_df.rename(columns={
    'time_to_nearest_train(min)': 'time_to_nearest_train_station(min)',
    'distance_to_nearest_train(km)': 'distance_to_nearest_train_station(km)'
})

# Merge the school columns into the property_train dataframe (row-aligned)
merged_df = pd.concat([property_train_df, school_columns], axis=1)

# Save the merged dataframe to a new CSV file
merged_df_path = curated_dir / 'rent_with_proximity.csv'
merged_df.to_csv(merged_df_path, index=False)

print(f"Merged data with renamed columns saved to {merged_df_path}")

Merged data with renamed columns saved to c:\Users\29557\Documents\GitHub\project-2-group-real-estate-industry-project-7\data\curated\rent_with_proximity.csv


In [20]:
import pandas as pd
from pathlib import Path

# Load the property_school and property_train data
curated_dir = Path.cwd().parent / 'data' / 'curated'
property_school_df = pd.read_csv(curated_dir / 'property_school.csv')
property_train_df = pd.read_csv(curated_dir / 'property_train.csv')

# The two files are joined purely by row position below, so they must contain
# the same number of rows; otherwise concat would silently pad the shorter one
# with NaN and attach school values to the wrong properties.
if len(property_school_df) != len(property_train_df):
    raise ValueError(
        f"Row count mismatch: property_school.csv has {len(property_school_df)} rows, "
        f"property_train.csv has {len(property_train_df)} rows"
    )

# Extract 'distance_to_nearest_school(km)' and 'time_to_nearest_school(min)' columns from property_school_df
school_columns = property_school_df[['distance_to_nearest_school(km)', 'time_to_nearest_school(min)']]

# Rename the 'time_to_nearest_train(min)' and 'distance_to_nearest_train(km)' columns in the property_train dataframe
# (reassigned rather than renamed in place, so re-running the cell is harmless)
property_train_df = property_train_df.rename(columns={
    'time_to_nearest_train(min)': 'time_to_nearest_train_station(min)',
    'distance_to_nearest_train(km)': 'distance_to_nearest_train_station(km)'
})

# Merge the school columns into the property_train dataframe (row-aligned)
merged_df = pd.concat([property_train_df, school_columns], axis=1)

# Round the distance and time columns to 2 decimal places
proximity_columns = [
    'distance_to_nearest_school(km)',
    'time_to_nearest_school(min)',
    'distance_to_nearest_train_station(km)',
    'time_to_nearest_train_station(min)',
]
for column in proximity_columns:
    merged_df[column] = merged_df[column].round(2)

# Save the final dataframe to a new CSV file
merged_df_path = curated_dir / 'rent_with_proximity.csv'
merged_df.to_csv(merged_df_path, index=False)

print(f"Merged data with rounded values saved to {merged_df_path}")


Merged data with rounded values saved to c:\Users\29557\Documents\GitHub\project-2-group-real-estate-industry-project-7\data\curated\rent_with_proximity.csv
