In [1]:
import os
import glob
import asyncio
import nest_asyncio
import aiohttp

from functions import get_ID_for_download, parse_data, save_to_csv

# Allow nested event loops
nest_asyncio.apply()

In [2]:
# Define folders
queryfolder = "./query/"
idfolder = "./IDS/"
OUTPUT_FOLDER = "./earthquake_data/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Generate ID files from catalogues
catalogue = get_ID_for_download(queryfolder, idfolder)
print(f"‚úÖ Catalogue prepared with {len(catalogue)} earthquake entries.")

‚úÖ Catalogue prepared with 38684 earthquake entries.


In [3]:
error_count = 0
error_list = []

async def fetch_data(session, public_id):
    """
    Fetch and parse one earthquake event.

    Input:
    session : aiohttp.ClientSession
        The active HTTP session used for making asynchronous requests.
    public_id : str
        The unique GeoNet event ID to be fetched.

    Output:
    dict or None
        Parsed data dictionary for a single earthquake event (from `parse_data()`),
        or None if an error occurs or if the request fails.
    
    """
    
    global error_count

    # Set up event URL
    url = f"https://www.geonet.org.nz/earthquake/technical/{public_id}"
    
    try:
        # Asynchronously send a GET request to the GeoNet event page
        async with session.get(url) as response:
            
            # If the request fails (e.g., 404 or 500), record the error and skip this event
            if response.status != 200:
                error_count += 1
                error_list.append(f"{error_count}. Error fetching {public_id}: HTTP {response.status}")
                return None

            # If successful, extract the HTML content               
            html = await response.text()
            
           # Parse the event‚Äôs data (defined in your `functions.py`)
            return parse_data(html, public_id)

    
    # Handle any other errors (e.g., connection timeouts, invalid responses)            
    except Exception as e:
        error_count += 1
        error_list.append(f"{error_count}. Error fetching {public_id}: {e}")
        return None



async def gather_data(public_ids):
    """
    Fetch and parse multiple GeoNet events concurrently.

    Input
    public_ids : list of str
        List of GeoNet public event IDs to fetch.

    Output
    list of dict
        A list of parsed event dictionaries (same format as from `fetch_data()`),
        with `None` for events that failed to fetch or parse. 
        
    """
    
    # Create a single session for all requests to avoid repeated connection setup    
    async with aiohttp.ClientSession() as session:
        
        # Create a list of coroutine tasks, one per event ID
        tasks = [fetch_data(session, pub_id) for pub_id in public_ids]
        
        # Run all the tasks concurrently and wait for all to finish
        return await asyncio.gather(*tasks)


In [5]:
async def main():
    """
    Main asynchronous driver function.
    Loops through all `.dat` files generated by `get_ID_for_download()`,
    reads the event IDs from each file, fetches the corresponding GeoNet technical pages,
    parses the data, and saves the results into `.csv` files.
    
    """
    
    id_files = glob.glob(os.path.join(idfolder, "*.dat"))
    print(f"üìÇ Found {len(id_files)} ID files to process.")

    for id_file in id_files:
        basename = os.path.basename(id_file).replace("_ID.dat", "")
        output_file = f"{basename}_earthquake_data.csv"

        # Read IDs
        with open(id_file, 'r') as f:
            public_ids = [line.strip() for line in f.readlines() if line.strip()]

        print(f"\n‚öôÔ∏è Fetching {len(public_ids)} events for {basename}...")

        # Fetch data
        data_list = await gather_data(public_ids)

        # Save results
        save_to_csv(data_list, filename=output_file, output_folder=OUTPUT_FOLDER)

        print(f"‚úÖ Finished {basename}, saved {len([d for d in data_list if d])} entries.")

    if error_list:
        print("\n‚ö†Ô∏è Some errors occurred:")
        for err in error_list:
            print(err)
    else:
        print("\nAll done with no errors!")


IndentationError: unindent does not match any outer indentation level (<string>, line 10)

In [12]:
await main()

üìÇ Found 4 ID files to process.

‚öôÔ∏è Fetching 11151 events for NZ_2024_01-06...
‚úÖ Finished NZ_2024_01-06, saved 11151 entries.

‚öôÔ∏è Fetching 6216 events for NZ_2025_07-12...
‚úÖ Finished NZ_2025_07-12, saved 6216 entries.

‚öôÔ∏è Fetching 11059 events for NZ_2025_01-06...
‚úÖ Finished NZ_2025_01-06, saved 11059 entries.

‚öôÔ∏è Fetching 10258 events for NZ_2024_07-12...
‚úÖ Finished NZ_2024_07-12, saved 10258 entries.

üéâ All done with no errors!
