In [1]:
import json
import os
import csv
import pandas as pd
import numpy as np
from openrouteservice import Client
from sklearn.neighbors import BallTree
import time
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Read Files

In [2]:
# Stops table (later cells read its 'stop_lat', 'stop_lon', 'stop_type' and 'stop_name' columns)
# and the curated property table (later cells read 'latitude', 'longitude' and 'property_url').
stations_df = pd.read_csv("../../data/raw/stops_data/stops_datavic_mapped.csv")
property_df = pd.read_csv("../../data/curated/properties_data3.csv")

In [3]:
# Point-of-interest tables. Later cells read 'lat'/'lon' from each of them, 'name' from those
# used for nearest-facility lookups, and 'amenity' from psf_df, school2_df and entertainments_df.
top_five_university =  pd.read_csv('../../data/landing/osm_data/top_five_university_main_campus.csv')
entertainments_df = pd.read_csv('../../data/raw/osm_data/entertainments.csv')
hospital_df = pd.read_csv('../../data/raw/osm_data/hospital.csv')
park_df = pd.read_csv('../../data/raw/osm_data/park.csv')
psf_df = pd.read_csv('../../data/raw/osm_data/psf.csv')  # filtered later into police and fire stations
school1_df = pd.read_csv('../../data/raw/osm_data/school1.csv')  # counted later as primary/secondary schools
school2_df = pd.read_csv('../../data/raw/osm_data/school2.csv')  # split later into kindergartens vs. everything else
shop_df = pd.read_csv('../../data/raw/osm_data/shop.csv')

# Preprocessing

In [4]:
# Report how many coordinate values are missing in each frame before any distance work.
for heading, frame, coord_columns in (
    ("Property DataFrame:", property_df, ['latitude', 'longitude']),
    ("\nStations DataFrame:", stations_df, ['stop_lat', 'stop_lon']),
):
    print(heading)
    print(frame[coord_columns].isnull().sum())

Property DataFrame:
latitude     0
longitude    0
dtype: int64

Stations DataFrame:
stop_lat    0
stop_lon    0
dtype: int64


In [5]:
# Cast every coordinate column of both frames to float, column by column and in place.
for frame, coord_columns in (
    (property_df, ('latitude', 'longitude')),
    (stations_df, ('stop_lat', 'stop_lon')),
):
    for column in coord_columns:
        frame[column] = frame[column].astype(float)

In [6]:
# Select the rows whose 'stop_type' is 'train station', after giving the stop coordinates
# the same column names ('latitude', 'longitude') that the property frame uses.
is_train_station = stations_df['stop_type'] == 'train station'
stations_df.rename(columns={'stop_lat': 'latitude', 'stop_lon': 'longitude'}, inplace=True)
train_stations_df = stations_df[is_train_station]

In [7]:
# Count rows whose 'property_url' repeats an earlier row (the first occurrence is not counted)
# and print the total. The message names the column that is actually checked.
_duplicates = property_df['property_url'].duplicated(keep="first").sum()
print(f"Number of duplicate 'property_url' values: {_duplicates}")

Number of duplicate 'name' values: 0


## Feature Engineering

### Distance and Duration to the Closest Train Station

In [8]:

# The routing API key is read from the ORS_API_KEY environment variable so that no credential
# is stored in the notebook. If a key exceeds its quota, export a different key and re-run this cell.
# NOTE(review): keys that were previously committed in this cell remain in version history and
# should be revoked.
ors_api_key = os.environ.get('ORS_API_KEY')
if not ors_api_key:
    raise RuntimeError("Set the ORS_API_KEY environment variable to an openrouteservice API key.")
client = Client(key=ors_api_key)

In [9]:
# (latitude, longitude) pairs in radians for properties and train stations;
# these arrays are what the haversine BallTree cells below are fed.
properties_coords = np.radians(property_df[['latitude', 'longitude']].to_numpy())
train_stations_coords = np.radians(train_stations_df[['latitude', 'longitude']].to_numpy())

In [10]:
# Build a BallTree using the Haversine distance metric to find the nearest train station for each property.
# Convert the distances from radians to meters, then assign the nearest station index and distance to the properties.
# Add coordinates of both the property and the nearest station, along with the station's name, to the DataFrame.

tree = BallTree(train_stations_coords, metric='haversine')
distances, indices = tree.query(properties_coords, k=1)

# Metres; compute_nearest_distance_and_name reads this name as a global.
earth_radius = 6371000
distances_m = distances.flatten() * earth_radius

# Positions into train_stations_df (not index labels), hence the .iloc lookup below.
nearest_station_indices = indices.flatten()

property_df['nearest_station_index'] = nearest_station_indices
property_df['haversine_distance'] = distances_m

# One positional lookup for all properties instead of one .iloc call per row.
nearest_stations = train_stations_df.iloc[nearest_station_indices]

# Each cell holds a [longitude, latitude] list; wrapping in a Series keeps one list per row.
property_df['property_coords'] = pd.Series(
    property_df[['longitude', 'latitude']].to_numpy().tolist(),
    index=property_df.index,
)
property_df['station_coords'] = pd.Series(
    nearest_stations[['longitude', 'latitude']].to_numpy().tolist(),
    index=property_df.index,
)

# Assigned positionally: row k of nearest_stations belongs to row k of property_df.
property_df['nearest_station_name'] = nearest_stations['stop_name'].to_numpy()

In [11]:
# Plain Python lists of [longitude, latitude] points: origins are the properties,
# destinations are their nearest stations, aligned row by row.
origins, destinations = (
    property_df[column].tolist() for column in ('property_coords', 'station_coords')
)

In [12]:
# Initialization & limitation
# NOTE(review): 2500 and 50 presumably mirror the routing API's per-request matrix limits —
# confirm against the provider's current quota.
max_elements = 2500  # not referenced by any other visible cell
max_locations = 50  # origin/destination pairs per request (50 sources x 50 destinations = 2500 matrix cells)
results = []  # one dict per property, appended by the batching loop
num_properties = len(origins)
chunk_size = max_locations  # step used when slicing origins/destinations into batches

In [13]:
# Process properties in chunks, sending batches of origins and destinations to the API to get distance and duration.
# For each chunk, the distance matrix is calculated using the driving-car, and results are stored.
# Only the diagonal of each matrix is used: entry [j][j] pairs property j with its own nearest station.
for i in range(0, num_properties, chunk_size):
    # origins and destinations come from the same frame, so identical slices have identical lengths.
    origin_chunk = origins[i:i+chunk_size]
    destination_chunk = destinations[i:i+chunk_size]
    pair_count = len(origin_chunk)

    print(f"Processing properties {i} to {i + pair_count - 1}")

    try:
        matrix = client.distance_matrix(
            locations=origin_chunk + destination_chunk,
            sources=list(range(pair_count)),
            destinations=list(range(pair_count, pair_count + len(destination_chunk))),
            profile='driving-car',
            metrics=['distance', 'duration'],
            resolve_locations=False,
            units='m'
        )

        distances = matrix['distances']
        durations = matrix['durations']

        # Extract the whole batch before touching `results`, so a malformed response cannot
        # leave partial rows behind that the fallback below would then duplicate.
        batch_pairs = [(distances[j][j], durations[j][j]) for j in range(pair_count)]

    except Exception as e:
        # Best effort: keep going and record the batch as missing rather than abort the run.
        print(f"Error processing batch {i} to {i + pair_count - 1}: {e}")
        batch_pairs = [(None, None)] * pair_count

    # Exactly one row per property in the batch, whether the request succeeded or not.
    results.extend(
        {
            'property_index': property_df.index[i + j],
            'route_distance_m': distance,
            'route_duration_s': duration
        }
        for j, (distance, duration) in enumerate(batch_pairs)
    )

    # Pause between requests.
    time.sleep(1)

Processing properties 0 to 49
Processing properties 50 to 99
Processing properties 100 to 149
Processing properties 150 to 199
Processing properties 200 to 249
Processing properties 250 to 299
Processing properties 300 to 349
Processing properties 350 to 399
Processing properties 400 to 449
Processing properties 450 to 499
Processing properties 500 to 549
Processing properties 550 to 599
Processing properties 600 to 649
Processing properties 650 to 699
Processing properties 700 to 749
Processing properties 750 to 799
Processing properties 800 to 849
Processing properties 850 to 899
Processing properties 900 to 949
Processing properties 950 to 999
Processing properties 1000 to 1049
Processing properties 1050 to 1099
Processing properties 1100 to 1149
Processing properties 1150 to 1199
Processing properties 1200 to 1249
Processing properties 1250 to 1299
Processing properties 1300 to 1349
Processing properties 1350 to 1399
Processing properties 1400 to 1449
Processing properties 1450 to 

In [14]:
# Turn the collected routing rows into a frame, then attach them to the properties by matching
# each property's index label against the 'property_index' column.
df_results = pd.DataFrame(results)
property_df = pd.merge(
    property_df,
    df_results,
    left_index=True,
    right_on='property_index',
)

### Distance to the Closest Hospital/Fire Station/Police Station/University and College/Shop

In [15]:
def compute_nearest_distance_and_name(property_coords_rad, external_coords, external_names):
    """Find, for every property, the single closest external location.

    Parameters
    ----------
    property_coords_rad : array of coordinate pairs already in radians (queried against the tree).
    external_coords : array of coordinate pairs in degrees; converted to radians here.
    external_names : array indexable by an integer array, aligned row-for-row with `external_coords`.

    Returns
    -------
    (distances, names) : the haversine distance to the closest external location, scaled by the
        notebook-level `earth_radius` (set to 6371000 in an earlier cell), and the name of that location.
    """
    haversine_tree = BallTree(np.deg2rad(external_coords), metric='haversine')
    angular_distance, nearest_position = haversine_tree.query(property_coords_rad, k=1)
    # k=1 yields one column per result; flatten to one value per property.
    nearest_position = nearest_position.flatten()
    scaled_distance = angular_distance.flatten() * earth_radius
    return scaled_distance, external_names[nearest_position]

In [16]:
def compute_all_distances(property_coords_rad, external_coords_rad, external_names, sphere_radius_m=6371000):
    """Haversine distance from every property to every external location, in input order.

    Column j of the returned distance matrix is always the distance to external location j
    (the one named `external_names[j]`). The previous implementation returned the k=5 nearest
    neighbours sorted by distance, so column j held the j-th *closest* location's distance and
    did not match the name it was later labelled with. It also required exactly five locations.

    Parameters
    ----------
    property_coords_rad : array-like, shape (n, 2); (latitude, longitude) in radians.
    external_coords_rad : array-like, shape (m, 2); (latitude, longitude) in radians.
    external_names : array-like, shape (m,); names aligned with `external_coords_rad`.
    sphere_radius_m : radius used to convert angles to lengths; the default 6371000 gives metres.

    Returns
    -------
    distances : ndarray, shape (n, m); distances[i, j] is from property i to external location j,
        in the unit of `sphere_radius_m` (metres by default, not kilometres).
    names : ndarray, shape (n, m); names[i, j] == external_names[j], i.e. the label of distances[i, j].
    """
    property_coords_rad = np.asarray(property_coords_rad, dtype=float).reshape(-1, 2)
    external_coords_rad = np.asarray(external_coords_rad, dtype=float).reshape(-1, 2)
    external_names = np.asarray(external_names)

    # Column vectors for properties, row vectors for external locations -> broadcast to (n, m).
    prop_lat = property_coords_rad[:, 0][:, np.newaxis]
    prop_lon = property_coords_rad[:, 1][:, np.newaxis]
    ext_lat = external_coords_rad[:, 0][np.newaxis, :]
    ext_lon = external_coords_rad[:, 1][np.newaxis, :]

    # Haversine formula; the clip guards arcsin against rounding slightly outside [0, 1].
    half_chord_sq = (
        np.sin((ext_lat - prop_lat) / 2.0) ** 2
        + np.cos(prop_lat) * np.cos(ext_lat) * np.sin((ext_lon - prop_lon) / 2.0) ** 2
    )
    central_angle = 2.0 * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0.0, 1.0)))

    distances = central_angle * sphere_radius_m
    names = np.tile(external_names, (property_coords_rad.shape[0], 1))
    return distances, names

In [17]:
# Split psf_df by amenity into police and fire stations; every school2_df row that is not a
# kindergarten is kept as the university/college set.
psf_amenity = psf_df['amenity']
police_stations_df = psf_df[psf_amenity == 'police']
fire_stations_df = psf_df[psf_amenity == 'fire_station']
university_college_df = school2_df[~(school2_df['amenity'] == "kindergarten")]

In [18]:
# Hospital: distance to, and name of, the closest hospital for every property.
hospital_names = hospital_df['name'].values
hospital_coords = hospital_df[['lat', 'lon']].values
nearest_hospital = compute_nearest_distance_and_name(properties_coords, hospital_coords, hospital_names)
property_df['distance_to_hospital'] = nearest_hospital[0]
property_df['nearest_hospital_name'] = nearest_hospital[1]

In [19]:
# Police station: distance to, and name of, the closest police station for every property.
police_station_names = police_stations_df['name'].values
police_station_coords = police_stations_df[['lat', 'lon']].values
nearest_police_station = compute_nearest_distance_and_name(
    properties_coords, police_station_coords, police_station_names)
property_df['distance_to_police_station'] = nearest_police_station[0]
property_df['nearest_police_station_name'] = nearest_police_station[1]

In [20]:
# Fire station: distance to, and name of, the closest fire station for every property.
fire_station_names = fire_stations_df['name'].values
fire_station_coords = fire_stations_df[['lat', 'lon']].values
nearest_fire_station = compute_nearest_distance_and_name(
    properties_coords, fire_station_coords, fire_station_names)
property_df['distance_to_fire_station'] = nearest_fire_station[0]
property_df['nearest_fire_station_name'] = nearest_fire_station[1]

In [21]:
# College/University: distance to, and name of, the closest non-kindergarten school2 entry.
university_college_names = university_college_df['name'].values
university_college_coords = university_college_df[['lat', 'lon']].values
nearest_university_college = compute_nearest_distance_and_name(
    properties_coords, university_college_coords, university_college_names)
property_df['distance_to_university_college'] = nearest_university_college[0]
property_df['nearest_university_college_name'] = nearest_university_college[1]

In [22]:
# Top-five universities: one 'distance_to_<name>' column per row of top_five_university,
# labelled from the lower-cased name with spaces replaced by underscores.
# NOTE(review): column j is labelled with external_names[j]; this is only meaningful if
# compute_all_distances returns its columns in that same order — verify.
# Despite the variable name, the values are scaled by an Earth radius of 6371000, i.e. metres.
external_names = top_five_university['name'].values
external_coords = np.deg2rad(top_five_university[['lat', 'lon']].values)
distances_km, names = compute_all_distances(properties_coords, external_coords, external_names)
distance_columns = ['distance_to_' + name.replace(" ", "_").lower() for name in external_names]
distance_df = pd.DataFrame(data=distances_km, index=property_df.index, columns=distance_columns)
property_df = pd.concat([property_df, distance_df], axis=1)

In [23]:
# Shop: distance to, and name of, the closest shop for every property.
shop_names = shop_df['name'].values
shop_coords = shop_df[['lat', 'lon']].values
nearest_shop = compute_nearest_distance_and_name(properties_coords, shop_coords, shop_names)
property_df['distance_to_shop'] = nearest_shop[0]
property_df['nearest_shop_name'] = nearest_shop[1]

In [24]:
# Spot check: display the computed shop distance for the row labelled 803.
# NOTE(review): 803 is an arbitrary-looking label; presumably a manual sanity check — confirm or remove.
property_df.loc[803, 'distance_to_shop']

98.71307387977045

### Count of Nearby Kindergartens, Schools, Restaurants, and Entertainment within Defined Radius

In [25]:
def compute_facility_counts(property_df, properties_coords, facility_df, facility_name, radius=1000):
    """Count, for every property, the facilities lying within `radius` metres of it.

    Parameters
    ----------
    property_df : DataFrame that receives the new count column (modified in place).
    properties_coords : array of property coordinates in radians, one row per row of `property_df`.
    facility_df : DataFrame with 'lat' and 'lon' columns in degrees.
    facility_name : label used in the new column name.
    radius : search radius in metres (great-circle distance).

    Returns
    -------
    The same `property_df` object, with the column
    '<facility_name>_count_within_<radius>m' added or overwritten.
    """
    earth_radius_m = 6371000  # converts the metre radius into radians for the haversine metric
    count_column = f'{facility_name}_count_within_{radius}m'

    if len(facility_df) == 0:
        # A tree cannot be built from zero points; with no facilities every count is zero.
        property_df[count_column] = 0
        return property_df

    facility_coords = np.radians(facility_df[['lat', 'lon']].values)
    tree = BallTree(facility_coords, metric='haversine')
    # count_only=True returns the number of neighbours directly instead of index arrays.
    property_df[count_column] = tree.query_radius(
        properties_coords, r=radius / earth_radius_m, count_only=True
    )

    return property_df

In [26]:
# Subsets used for the radius counts: kindergartens from school2_df, and two amenity
# groups (restaurant/bar, cinema/theatre) from entertainments_df.
entertainment_amenity = entertainments_df['amenity']
kindergarten_df = school2_df[school2_df['amenity'] == "kindergarten"]
restaurant_bar_df = entertainments_df[entertainment_amenity.isin(['restaurant', 'bar'])]
cinema_theatre_df = entertainments_df[entertainment_amenity.isin(['cinema', 'theatre'])]

In [27]:
# Kindergartens within 1000 m, then primary/secondary schools within 3000 m.
for facility_df, facility_name, radius in (
    (kindergarten_df, 'kindergarten', 1000),
    (school1_df, 'secondary_primary_school', 3000),
):
    properties_df = compute_facility_counts(
        property_df, properties_coords, facility_df, facility_name, radius=radius)

In [28]:
# Restaurants/bars within 1000 m, cinemas/theatres within 3000 m, shops within 1000 m.
for facility_df, facility_name, radius in (
    (restaurant_bar_df, 'restaurant_bar', 1000),
    (cinema_theatre_df, 'cinema_theatre', 3000),
    (shop_df, 'shop', 1000),
):
    properties_df = compute_facility_counts(
        property_df, properties_coords, facility_df, facility_name, radius=radius)

In [29]:
# Parks within 1000 m of each property.
properties_df = compute_facility_counts(
    property_df=property_df,
    properties_coords=properties_coords,
    facility_df=park_df,
    facility_name='park',
    radius=1000,
)

In [30]:
# Display the column labels of the enriched frame. properties_df is the object returned by
# compute_facility_counts, which adds its column to the frame it was given and returns that frame.
properties_df.columns

Index(['property_url', 'name', 'cost_text', 'latitude', 'longitude',
       'bed_info', 'bath_info', 'parking', 'date_available', 'desc',
       'post_code', 'region', 'matched_region_x',
       'Children enrolled in a preschool or preschool program (no.)',
       'Estimated resident population (no.)', 'Land area (ha)',
       'Median monthly household mortgage payment ($)',
       'Median price of established house transfers ($)',
       'Median total income (excl. Government pensions and allowances) ($)',
       'Median weekly household rental payment ($)', 'Number of jobs',
       'Working age population (aged 15-64 years) (%)', 'matched_region_y',
       'avg_crime_count', 'region_encoded', 'property_type_House',
       'property_type_New Apartments / Off the Plan',
       'property_type_New House & Land', 'property_type_Studio',
       'property_type_Terrace', 'property_type_Townhouse',
       'property_type_Villa', 'nearest_station_index', 'haversine_distance',
       'property_c

# Save

In [31]:
# Keep only rows that have no missing value in any column.
# NOTE(review): this also removes properties whose routing batch failed (their route_distance_m /
# route_duration_s were recorded as None) and, presumably, properties whose nearest facility has
# no 'name' value — confirm that dropping those rows is intended.
properties_df = properties_df.dropna()

In [32]:
# Write the enriched properties to the curated data folder; the index is not written.
properties_df.to_csv('../../data/curated/properties_data4_new.csv', index=False)