In [277]:
import json
import pandas as pd
import os
import numpy as np


# Read Files

In [278]:
def read_json_file(json_file, directory="../data/landing/osm_data/"):
    """Load one JSON document from the landing directory.

    Parameters
    ----------
    json_file : str
        File name (not a full path) of the JSON document to read.
    directory : str, optional
        Folder that contains ``json_file``. Defaults to the OSM landing
        folder used throughout this notebook.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.

    Raises
    ------
    FileNotFoundError
        If the file does not exist in ``directory``.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    file_path = os.path.join(directory, json_file)
    # Be explicit about UTF-8: without it, open() uses the locale's preferred
    # encoding, which can corrupt non-ASCII text on some platforms.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [279]:
# Read every JSON export in one pass; the assignment targets line up
# one-to-one (and in the same order) with the file names listed below.
(
    school2_json_data,
    shop_json_data,
    park_json_data,
    hospital_json_data,
    entertainments_json_data,
    psf_json_data,
) = map(
    read_json_file,
    (
        "school2_data.json",
        "shop_data.json",
        "park_data.json",
        "hospital_data.json",
        "entertainments_data.json",
        "public_safety_facilities_data.json",
    ),
)

In [280]:
# The schools register sits in the same landing folder as the JSON exports
# (the previous path contained a stray double slash).
school1_csv_data = pd.read_csv('../data/landing/osm_data/school_data.csv')

# Preprocessing

### Nodes and Ways Management

In [281]:
def calculate_centroid(node_ids, nodes_df):
    """Return the mean latitude and longitude of the referenced nodes.

    Parameters
    ----------
    node_ids : list-like
        Ids to look up in ``nodes_df['id']``. Membership is what matters, so
        an id listed more than once does not weigh more.
    nodes_df : pandas.DataFrame
        Frame with at least the columns ``id``, ``lat`` and ``lon``.

    Returns
    -------
    tuple
        ``(mean_lat, mean_lon)``; both are NaN when no id matches.
    """
    is_member = nodes_df['id'].isin(node_ids)
    member_lat = nodes_df.loc[is_member, 'lat']
    member_lon = nodes_df.loc[is_member, 'lon']
    return member_lat.mean(), member_lon.mean()


In [282]:
def process_osm_data(json_data):
    """Flatten an OSM-style JSON payload into one row per standalone node or way.

    Each way gets the mean ``lat``/``lon`` of the nodes it references, and
    nodes that are referenced by any way are dropped from the result so they
    are not counted twice.

    Parameters
    ----------
    json_data : dict
        Parsed JSON with an ``'elements'`` list. Each element is expected to
        carry ``'type'`` and ``'id'``; nodes carry ``'lat'``/``'lon'`` and
        ways carry a ``'nodes'`` list of node ids.

    Returns
    -------
    pandas.DataFrame
        Standalone nodes followed by ways, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``'elements'`` is missing, or the elements lack the ``'type'``
        column.
    """
    df = pd.DataFrame(json_data['elements'])
    if df.empty:
        # Nothing to process; avoid a KeyError on the missing 'type' column.
        return df

    nodes_df = df[df['type'] == 'node']
    # Copy so the centroid columns are written to an independent frame rather
    # than to a slice of `df` (which triggers SettingWithCopyWarning).
    ways_df = df[df['type'] == 'way'].copy()

    if ways_df.empty or 'nodes' not in ways_df.columns:
        # No ways to collapse: every node stands on its own.
        return nodes_df.reset_index(drop=True)

    id_dtype = nodes_df['id'].dtype
    # One row per (way, referenced node id). Duplicates are removed so a node
    # id listed twice in the same way counts once, matching a membership test.
    members = (
        ways_df['nodes']
        .explode()
        .rename('node_id')
        .rename_axis('way_row')
        .reset_index()
        .dropna(subset=['node_id'])
        .drop_duplicates()
        .assign(node_id=lambda frame: frame['node_id'].astype(id_dtype))
    )

    # A single join + group-by replaces scanning all nodes once per way.
    centroids = (
        members
        .merge(nodes_df[['id', 'lat', 'lon']], left_on='node_id', right_on='id')
        .groupby('way_row')[['lat', 'lon']]
        .mean()
        .reindex(ways_df.index)  # ways without any matching node become NaN
    )
    ways_df['lat'] = centroids['lat'].to_numpy()
    ways_df['lon'] = centroids['lon'].to_numpy()

    nodes_df_filtered = nodes_df[~nodes_df['id'].isin(members['node_id'])]

    return pd.concat([nodes_df_filtered, ways_df], ignore_index=True, sort=False)


In [283]:
# Collect every dataset under a short label. The schools CSV is already
# tabular; each JSON export is flattened by process_osm_data first.
osm_data = {
    'school1': school1_csv_data,
    **{
        label: process_osm_data(payload)
        for label, payload in (
            ('school2', school2_json_data),
            ('shop', shop_json_data),
            ('park', park_json_data),
            ('hospital', hospital_json_data),
            ('entertainments', entertainments_json_data),
            ('psf', psf_json_data),
        )
    },
}

### Data Inspection and Cleaning

In [284]:
# One record per dataset: its label, row count, and "column(dtype)" listing.
# The frame is built once from the records instead of concatenating onto an
# initially empty frame on every iteration.
features_summary = pd.DataFrame(
    [
        {
            'DataFrame': name,
            '# instances': len(df),
            'Features': ', '.join(f"{col}({str(dtype)})" for col, dtype in df.dtypes.items()),
        }
        for name, df in osm_data.items()
    ],
    columns=['DataFrame', '# instances', 'Features'],
)


In [285]:
# Show the per-dataset overview (label, row count, columns with dtypes).
features_summary

Unnamed: 0,DataFrame,# instances,Features
0,school1,2302,"Education_Sector(object), Entity_Type(int64), School_No(int64), School_Name(object), School_Type(object), School_Status(object), Address_Line_1(object), Address_Line_2(object), Address_Town(object), Address_State(object), Address_Postcode(int64), Postal_Address_Line_1(object), Postal_Address_Line_2(object), Postal_Town(object), Postal_State(object), Postal_Postcode(int64), Full_Phone_No(object), LGA_ID(int64), LGA_Name(object), X(float64), Y(float64)"
1,school2,1112,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"
2,shop,1756,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"
3,park,11768,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"
4,hospital,285,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"
5,entertainments,5252,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"
6,psf,1257,"type(object), id(int64), lat(float64), lon(float64), tags(object), nodes(object)"


In [286]:
# Count of missing values per column of the schools register.
school1_null_values = osm_data['school1'].isnull().sum()

# Turn the Series into a two-column table in one step rather than growing a
# frame with pd.concat once per column.
school1_null_summary = (
    school1_null_values
    .rename_axis('Column')
    .reset_index(name='Null Count')
)

school1_null_summary

Unnamed: 0,Column,Null Count
0,Education_Sector,0
1,Entity_Type,0
2,School_No,0
3,School_Name,0
4,School_Type,0
5,School_Status,0
6,Address_Line_1,0
7,Address_Line_2,2291
8,Address_Town,0
9,Address_State,0


In [287]:
# Keep only the school rows where both the X and the Y value are present.
osm_data['school1'] = osm_data['school1'].loc[
    lambda frame: frame[['X', 'Y']].notna().all(axis=1)
]


In [288]:
standard_columns = osm_data['school2'].columns.tolist()

# Missing-value counts for every dataset except 'school1', one row per dataset.
# Counts stay labelled by column name and are aligned via reindex, so the
# table no longer depends on every frame having identical column order.
null_summary = (
    pd.DataFrame({
        name: df.isnull().sum()
        for name, df in osm_data.items()
        if name != 'school1'
    })
    .T
    .reindex(columns=standard_columns)
)

null_summary

Unnamed: 0,type,id,lat,lon,tags,nodes
school2,0,0,0,0,0,327
shop,0,0,0,0,0,970
park,0,0,0,0,0,73
hospital,0,0,0,0,0,47
entertainments,0,0,0,0,0,4125
psf,0,0,0,0,0,383


In [289]:
def extract_tags_columns(df, amenity_key='amenity', name_key='name'):
    """Add ``amenity`` and ``name`` columns taken from each row's ``tags`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tags`` column. It is modified in place.
    amenity_key : str, optional
        Key inside ``tags`` whose value fills the ``amenity`` column.
    name_key : str, optional
        Key inside ``tags`` whose value fills the ``name`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the two columns set.
        Rows whose ``tags`` is not a dict, or lacks the key, get ``None``.
    """
    def _picker(tag_key):
        # Build a per-row extractor bound to one tag key.
        def pick(tags):
            if isinstance(tags, dict):
                return tags.get(tag_key)
            return None
        return pick

    for column, tag_key in (('amenity', amenity_key), ('name', name_key)):
        df[column] = df['tags'].apply(_picker(tag_key))
    return df

In [290]:
# Columns removed from each OSM frame after extract_tags_columns has run.
columns_to_drop = ['type', 'tags', 'nodes']

In [291]:
# Shops and parks read their category from the 'shop' and 'leisure' tags
# respectively instead of the default 'amenity' tag.
# The 'tags' guard keeps the cell re-runnable: once 'tags' has been dropped,
# calling extract_tags_columns again would raise KeyError.
for name, category_key in (('shop', 'shop'), ('park', 'leisure')):
    if 'tags' in osm_data[name].columns:
        osm_data[name] = (
            extract_tags_columns(osm_data[name], amenity_key=category_key)
            .drop(columns=columns_to_drop, errors='ignore')
        )

In [292]:
# Apply the default 'amenity'/'name' extraction to every remaining OSM frame.
# 'school1', 'park' and 'shop' are skipped here, as is any frame whose 'tags'
# column is already gone (which makes the cell safe to re-run).
for name, df in list(osm_data.items()):
    if name in ('school1', 'park', 'shop') or 'tags' not in df.columns:
        continue
    # Store the trimmed frame explicitly instead of relying on an in-place
    # drop through an alias of the dict value.
    osm_data[name] = extract_tags_columns(df).drop(columns=columns_to_drop, errors='ignore')

In [293]:
# Project the schools register onto the (id, lat, lon, amenity, name) layout.
# The mapping's key order is the column order of the result.
school1_column_map = {
    'School_No': 'id',
    'Y': 'lat',
    'X': 'lon',
    'School_Type': 'amenity',
    'School_Name': 'name',
}
osm_data['school1'] = (
    osm_data['school1']
    .loc[:, list(school1_column_map)]
    .rename(columns=school1_column_map)
)

In [294]:
# Inspect the cleaned 'school2' frame (columns: id, lat, lon, amenity, name).
osm_data['school2']

Unnamed: 0,id,lat,lon,amenity,name
0,148544339,-37.874197,145.148782,kindergarten,Syndal Pre-School
1,191834621,-37.869111,145.164286,kindergarten,Tally Ho Preschool
2,207718805,-37.897999,145.113743,kindergarten,St Johns Pre-School
3,246969693,-37.931641,145.200097,kindergarten,Waverley Foothills Preschool
4,247169615,-37.772253,144.966290,kindergarten,Brunswick Crèche & Day Nursery
...,...,...,...,...,...
1107,1308989656,-34.234051,142.164272,kindergarten,Shine Brighty Kindergarten Irymple
1108,1310990420,-37.922863,145.166057,kindergarten,Wellington Preschool
1109,1312032451,-37.953506,145.176856,kindergarten,Harrisfield Kindergarten
1110,1312937142,-37.559339,149.760336,kindergarten,Mallacoota Pre-School Kindergarten


In [295]:
# Inspect the renamed 'school1' frame (columns: id, lat, lon, amenity, name).
osm_data['school1']

Unnamed: 0,id,lat,lon,amenity,name
0,1,-38.61771,146.66660,Primary,Alberton Primary School
1,3,-38.38628,142.59039,Primary,Allansford and District Primary School
2,4,-37.08450,143.47565,Primary,Avoca Primary School
3,8,-36.90137,145.23472,Primary,Avenel Primary School
4,12,-37.74268,145.21398,Primary,Warrandyte Primary School
...,...,...,...,...,...
2297,2244,-37.64875,145.08148,Secondary,Plenty River College
2298,2245,-37.53046,144.90520,Primary,Holy Cross Catholic Primary School
2299,2246,-37.97324,145.31589,Primary,Sidrah Gardens School
2300,2247,-37.88319,145.29327,Secondary,Mountain District Community College


In [299]:
standard_columns = osm_data['school2'].columns.tolist()

# Missing-value counts for every dataset after cleaning, one row per dataset.
# Counts stay labelled by column name and are aligned via reindex, so the
# table does not depend on every frame sharing the same column order.
null_summary = (
    pd.DataFrame({name: df.isnull().sum() for name, df in osm_data.items()})
    .T
    .reindex(columns=standard_columns)
)

null_summary

Unnamed: 0,id,lat,lon,amenity,name
school1,0,0,0,0,0
school2,0,0,0,0,168
shop,0,0,0,0,60
park,0,0,0,0,5809
hospital,0,0,0,0,3
entertainments,0,0,0,0,196
psf,0,0,0,0,80


In [None]:
# FIXME: `data_summary` is not defined by any visible cell, so evaluating it
# raises NameError and stops "Restart & Run All" before the save step.
# Left commented out until the intended summary is identified.
# data_summary

# Save Files

In [297]:
base_path = '../data/raw'
osm_data_folder = os.path.join(base_path, 'osm_data')

# exist_ok=True creates the folder (and any missing parents) when needed and
# is a no-op otherwise, without a separate existence check that could race.
os.makedirs(osm_data_folder, exist_ok=True)


In [298]:
# Write each dataset to <osm_data_folder>/<label>.csv without the index
# column, and report where it went.
for name in osm_data:
    file_path = os.path.join(osm_data_folder, f'{name}.csv')
    osm_data[name].to_csv(file_path, index=False)
    print(f"{name} files have been saved to {file_path}")



school1 files have been saved to ../data/raw/osm_data/school1.csv
school2 files have been saved to ../data/raw/osm_data/school2.csv
shop files have been saved to ../data/raw/osm_data/shop.csv
park files have been saved to ../data/raw/osm_data/park.csv
hospital files have been saved to ../data/raw/osm_data/hospital.csv
entertainments files have been saved to ../data/raw/osm_data/entertainments.csv
psf files have been saved to ../data/raw/osm_data/psf.csv
