In [1]:
import json
import pandas as pd
import os
import numpy as np
import csv
import pandas as pd
import numpy as np
from openrouteservice import Client
from sklearn.neighbors import BallTree
import time
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Read Files

In [2]:
def read_json_file(json_file):
    directory = "../../data/landing/osm_data/"
    file_path = os.path.join(directory, json_file)
    with open(file_path, 'r') as file:
        data = json.load(file)

    return data

In [3]:
school2_json_data = read_json_file("school2_data.json")
shop_json_data = read_json_file("shop_data.json")
park_json_data = read_json_file("park_data.json")
hospital_json_data = read_json_file("hospital_data.json")
entertainments_json_data = read_json_file("entertainments_data.json")
psf_json_data = read_json_file("public_safety_facilities_data.json")
school1_csv_data = pd.read_csv('../../data/landing/osm_data/school_data.csv')

# External Data Preprocessing

### Nodes and Ways Management

In [4]:
def calculate_centroid(node_ids, nodes_df):
    nodes_subset = nodes_df[nodes_df['id'].isin(node_ids)]
    
    centroid_lat = nodes_subset['lat'].mean()
    centroid_lon = nodes_subset['lon'].mean()
    
    return centroid_lat, centroid_lon

In [5]:
def process_osm_data(json_data):

    elements_data = json_data['elements']
    df = pd.DataFrame(elements_data)

    nodes_df = df[df['type'] == 'node']
    ways_df = df[df['type'] == 'way']

    ways_df.loc[:, 'lat'], ways_df.loc[:, 'lon'] = zip(*ways_df['nodes'].apply(lambda nodes: calculate_centroid(nodes, nodes_df)))

    nodes_in_ways = ways_df['nodes'].explode().unique()
    nodes_df_filtered = nodes_df[~nodes_df['id'].isin(nodes_in_ways)]

    combined_df = pd.concat([nodes_df_filtered, ways_df], ignore_index=True, sort=False)

    return combined_df

In [6]:
osm_data = {
    'school1': school1_csv_data,
    'school2': process_osm_data(school2_json_data),
    'shop': process_osm_data(shop_json_data),
    'park': process_osm_data(park_json_data),
    'hospital': process_osm_data(hospital_json_data),
    'entertainments': process_osm_data(entertainments_json_data),
    'psf': process_osm_data(psf_json_data)

}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ways_df.loc[:, 'lat'], ways_df.loc[:, 'lon'] = zip(*ways_df['nodes'].apply(lambda nodes: calculate_centroid(nodes, nodes_df)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ways_df.loc[:, 'lat'], ways_df.loc[:, 'lon'] = zip(*ways_df['nodes'].apply(lambda nodes: calculate_centroid(nodes, nodes_df)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

### Data Inspection and Cleaning

In [7]:
def extract_tags_columns(df, amenity_key='amenity', name_key='name'):
    df['amenity'] = df['tags'].apply(lambda x: x.get(amenity_key) if isinstance(x, dict) else None)
    df['name'] = df['tags'].apply(lambda x: x.get(name_key) if isinstance(x, dict) else None)
    return df

In [8]:
features_summary = pd.DataFrame(columns=['DataFrame', '# instances', 'Features'])
for name, df in osm_data.items():
    features = ', '.join([f"{col}({str(dtype)})" for col, dtype in df.dtypes.items()])
    num_instance = len(df)
    features_summary = pd.concat([features_summary, pd.DataFrame({'DataFrame': [name],  '# instances': [num_instance], 'Features': [features]})], ignore_index=True)

In [9]:
school1_null_summary = pd.DataFrame(columns=['Column', 'Null Count'])
school1_null_values = osm_data['school1'].isnull().sum()

for column, null_count in school1_null_values.items():
    school1_null_summary = pd.concat([school1_null_summary, 
                                pd.DataFrame({
                                            'Column': [column], 
                                            'Null Count': [null_count]})], 
                                ignore_index=True)
    
school1_null_summary

Unnamed: 0,Column,Null Count
0,Education_Sector,0
1,Entity_Type,0
2,School_No,0
3,School_Name,0
4,School_Type,0
5,School_Status,0
6,Address_Line_1,0
7,Address_Line_2,2291
8,Address_Town,0
9,Address_State,0


In [10]:
osm_data['school1'] = osm_data['school1'].dropna(subset=['X', 'Y'])


In [11]:
standard_columns = osm_data['school2'].columns.tolist()

null_summary = pd.DataFrame(columns=standard_columns)

for name, df in osm_data.items():
    if name == 'school1':
        continue 
    null_values = df.isnull().sum() 
    null_summary.loc[name] = null_values.values

null_summary

Unnamed: 0,type,id,lat,lon,tags,nodes
school2,0,0,0,0,0,325
shop,0,0,0,0,0,978
park,0,0,0,0,0,73
hospital,0,0,0,0,0,45
entertainments,0,0,0,0,0,4148
psf,0,0,0,0,0,383


In [12]:
columns_to_drop = ['type', 'tags', 'nodes']

In [13]:
osm_data['shop'] = extract_tags_columns(osm_data['shop'], amenity_key='shop')
osm_data['shop'].drop(columns=columns_to_drop, inplace=True, errors='ignore') 
osm_data['park'] = extract_tags_columns(osm_data['park'], amenity_key='leisure')
osm_data['park'].drop(columns=columns_to_drop, inplace=True, errors='ignore') 

In [14]:
for name, df in osm_data.items():
    if name == 'school1' or name == 'park' or  name == 'shop':
        continue 
    osm_data[name] = extract_tags_columns(df)
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore') 

In [15]:
osm_data['school1'] = osm_data['school1'][['School_No', 'Y', 'X', 'School_Type', 'School_Name']].rename(columns={
    'School_Name': 'name',
    'School_No': 'id',
    'School_Type': 'amenity',
    'X': 'lon',
    'Y': 'lat'
})

In [16]:
# The University of Melbourne Parkville Campus
new_row = {
    'id': 22818856,
    'lat': -37.8019134,
    'lon': 144.9597002,
    'amenity': 'university',
    'name': 'The University of Melbourne, Parkville Campus'
}
new_row_df = pd.DataFrame([new_row])
osm_data['school2'] = pd.concat([osm_data['school2'], new_row_df], ignore_index=True)

In [17]:
standard_columns = osm_data['school2'].columns.tolist()

null_summary = pd.DataFrame(columns=standard_columns)

for name, df in osm_data.items():
    null_values = df.isnull().sum() 
    null_summary.loc[name] = null_values.values

null_summary

Unnamed: 0,id,lat,lon,amenity,name
school1,0,0,0,0,0
school2,0,0,0,0,168
shop,0,0,0,0,59
park,0,0,0,0,5870
hospital,0,0,0,0,4
entertainments,0,0,0,0,197
psf,0,0,0,0,80


In [18]:
instance_counts = {
    "Dataset": ["entertainments", "hospital", "park", "fire and police stations", "school", "shop"],
    "Number of Rows": [
        osm_data['entertainments'].shape[0], 
        osm_data['hospital'].shape[0], 
        osm_data['park'].shape[0], 
        osm_data['psf'].shape[0], 
        osm_data['school1'].shape[0] + osm_data['school2'].shape[0], 
        osm_data['shop'].shape[0],

    ]
}

instance_counts_df = pd.DataFrame(instance_counts)
instance_counts_df

Unnamed: 0,Dataset,Number of Rows
0,entertainments,5271
1,hospital,284
2,park,11881
3,fire and police stations,1259
4,school,3421
5,shop,1764


### Save files

In [19]:
base_path = '../../data/raw'
osm_data_folder = os.path.join(base_path, 'osm_data')

if not os.path.exists(osm_data_folder):
    os.makedirs(osm_data_folder)

In [20]:
for name, df in osm_data.items():
    file_path = os.path.join(osm_data_folder, f'{name}.csv').replace('\\', '/')
    df.to_csv(file_path, index=False) 
    print(f"{name} files have been saved to {file_path}")

school1 files have been saved to ../../data/raw/osm_data/school1.csv
school2 files have been saved to ../../data/raw/osm_data/school2.csv
shop files have been saved to ../../data/raw/osm_data/shop.csv
park files have been saved to ../../data/raw/osm_data/park.csv
hospital files have been saved to ../../data/raw/osm_data/hospital.csv


entertainments files have been saved to ../../data/raw/osm_data/entertainments.csv
psf files have been saved to ../../data/raw/osm_data/psf.csv
