# School and Hospital external dataset
# This notebook scrapes hospital and school data for every suburb, to be matched on the SA2 code.

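API key: [redacted] — never commit credentials to the notebook; the key that was previously written here should be revoked and a replacement kept outside version control.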

In [None]:
import requests
import pandas as pd
import time
import os

# Google Maps API
# The key is read from the environment so that no credential is stored in the
# notebook. Any key that was previously committed here should be revoked.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not API_KEY:
    print("Warning: GOOGLE_MAPS_API_KEY is not set; Places API requests need a key.")

# Endpoint used for every search request
places_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

# API daily call limit
MAX_DAILY_CALLS = 6000
# Running count of requests issued in this session (updated by get_places_data)
CALLS_MADE = 0

# Latitude and longitude range of Victoria
VIC_LAT_MIN = -39.159
VIC_LAT_MAX = -33.981
VIC_LNG_MIN = 140.961
VIC_LNG_MAX = 149.976

# Gridding Victoria's regions: spacing, in degrees, between neighbouring grid points
GRID_STEP = 0.05
# Checkpoint file listing the grid points that have already been queried
processed_locations_file = 'processed_locations.txt'

# Restore the checkpoint left by an earlier run; start empty when there is none
processed_locations = set()
if os.path.exists(processed_locations_file):
    with open(processed_locations_file, 'r') as checkpoint:
        processed_locations.update(checkpoint.read().splitlines())

# Functions to search for hospitals and schools
def get_places_data(keyword, location, radius=5000):
    """Run one Nearby Search request for a place type around a point.

    Args:
        keyword: Value sent as the request's ``type`` parameter
            (the callers in this file pass ``'hospital'`` and ``'school'``).
        location: ``"lat,lng"`` string used as the search centre.
        radius: Value sent as the request's ``radius`` parameter.

    Returns:
        The list found under the response's ``results`` key (possibly empty),
        or ``None`` when the daily call budget is used up, the request fails,
        or the API reports a non-success status.
    """
    global CALLS_MADE
    if CALLS_MADE >= MAX_DAILY_CALLS:
        print("Reach the daily API call limit and suspend the operation.")
        return None

    params = {
        'location': location,
        'radius': radius,
        'type': keyword,
        'key': API_KEY
    }

    # A timeout keeps a stalled connection from hanging the whole crawl, and
    # network failures are reported as None like any other failed lookup.
    try:
        response = requests.get(places_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None
    CALLS_MADE += 1

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        payload = response.json()
    except ValueError:
        print("Error: response body is not valid JSON")
        return None

    # The API signals quota/authorisation problems with HTTP 200 and a
    # non-OK "status" field; treating those as "no results" would silently
    # drop data, so they are reported as failures instead.
    status = payload.get('status')
    if status not in (None, 'OK', 'ZERO_RESULTS'):
        print(f"Error: {status} {payload.get('error_message', '')}")
        return None

    # NOTE(review): only the first page of results is read; the API documents
    # a next_page_token for further pages — confirm whether one page is enough.
    return payload.get('results', [])

# Convert results to DataFrame
def places_to_dataframe(places_data):
    """Flatten raw place records into a table.

    Args:
        places_data: Iterable of place dicts, each providing ``name``,
            ``geometry.location.lat``/``lng`` and ``place_id``; ``vicinity``
            is optional. May be ``None`` or empty.

    Returns:
        A DataFrame with columns ``name``, ``address``, ``lat``, ``lng`` and
        ``place_id`` (one row per record), or a completely empty DataFrame
        when there is nothing to convert.
    """
    if places_data:
        rows = [
            {
                'name': entry['name'],
                # 'vicinity' may be missing, in which case the address is None
                'address': entry.get('vicinity'),
                'lat': entry['geometry']['location']['lat'],
                'lng': entry['geometry']['location']['lng'],
                'place_id': entry['place_id'],
            }
            for entry in places_data
        ]
        return pd.DataFrame(rows)

    return pd.DataFrame()

# Generate a Victorian-wide latitude/longitude grid
def generate_grid():
    """Build the list of ``"lat,lng"`` grid points covering the bounding box.

    Points start at the minimum latitude/longitude and advance in steps of
    ``GRID_STEP``. Latitude varies slowest: all longitudes are listed for the
    first latitude, then for the next, and so on.

    Returns:
        A list of ``"lat,lng"`` strings.
    """
    lat_count = int((VIC_LAT_MAX - VIC_LAT_MIN) / GRID_STEP) + 1
    lng_count = int((VIC_LNG_MAX - VIC_LNG_MIN) / GRID_STEP) + 1

    lat_values = [VIC_LAT_MIN + row * GRID_STEP for row in range(lat_count)]
    lng_values = [VIC_LNG_MIN + col * GRID_STEP for col in range(lng_count)]

    return [
        f"{lat_value},{lng_value}"
        for lat_value in lat_values
        for lng_value in lng_values
    ]

# Batch fetch data and control the number of calls per day
def fetch_places_for_victoria(grid_locations):
    """Query hospitals and schools for every grid point not yet processed.

    Grid points already present in ``processed_locations`` are skipped. A
    point is added to the checkpoint only when both lookups succeeded, so a
    point whose request failed or was refused by the daily limit is retried
    on the next run instead of being silently lost.

    Args:
        grid_locations: Iterable of ``"lat,lng"`` strings.

    Returns:
        The DataFrame produced by ``places_to_dataframe`` from all records
        collected during this call.
    """
    all_places = []

    for location in grid_locations:
        if location in processed_locations:
            continue  # Skip processed grids

        if CALLS_MADE >= MAX_DAILY_CALLS:
            print(f"Maximum number of calls per day {MAX_DAILY_CALLS} has been reached, continue crawling data tomorrow.")
            break

        try:
            # Access to hospital data
            hospitals = get_places_data('hospital', location)
            # Access to school data
            schools = get_places_data('school', location)
        except requests.RequestException as exc:
            # Stop here but keep what was gathered so far; this grid point
            # stays unmarked and will be retried on the next run.
            print(f"Error: {exc}")
            break

        if hospitals:
            all_places.extend(hospitals)
        if schools:
            all_places.extend(schools)

        # Mark the grid as processed only when neither lookup failed
        # (None means failure; an empty list is a valid "nothing found").
        if hospitals is not None and schools is not None:
            processed_locations.add(location)

        # Control the frequency of requests to prevent being flow-limited
        time.sleep(2)

    # Save processed grids
    with open(processed_locations_file, 'w') as file:
        file.write('\n'.join(processed_locations))

    return places_to_dataframe(all_places)

# Generate grid latitude and longitude for Victoria
grid_locations = generate_grid()

# Collect the places for every grid point that still needs processing
df_all_places = fetch_places_for_victoria(grid_locations)

output_folder = '../data/landing/External_data/school_hospital_data'
os.makedirs(output_folder, exist_ok=True)

output_path = os.path.join(output_folder, 'hospitals_and_schools_victoria.csv')

# The crawl is resumable across runs, so results from earlier runs must be
# kept: merge with the existing file instead of overwriting it.
if os.path.exists(output_path):
    try:
        previous_places = pd.read_csv(output_path)
    except pd.errors.EmptyDataError:
        # An earlier run wrote a file without any columns
        previous_places = pd.DataFrame()
    non_empty = [frame for frame in (previous_places, df_all_places) if not frame.empty]
    df_all_places = pd.concat(non_empty, ignore_index=True) if non_empty else previous_places

# Neighbouring search circles overlap, so one place can be returned for
# several grid points; keep a single row per place_id.
if 'place_id' in df_all_places.columns:
    df_all_places = df_all_places.drop_duplicates(subset='place_id', keep='first')

df_all_places.to_csv(output_path, index=False)