In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Define the download directory (Change this to your desired directory)
data_dir = 'aemo_data'  # Change this path if needed
os.makedirs(data_dir, exist_ok=True)  # Ensure the directory exists

# Base URL for the report page
base_url = "https://www.nemweb.com.au/REPORTS/CURRENT/HighImpactOutages/"

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Date range for filtering files
start_year, start_month = 2021, 8  # Aug 2021
end_year, end_month = 2024, 8      # Aug 2024

# Skip these specific files
skip_files = [
    "7_days_High_Impact_Outages_20230116.csv",
    "7_days_High_Impact_Outages_22021128.csv"
]

# Regex pattern to extract dates from filenames
date_pattern = re.compile(r"High_Impact_Outages_(\d{4})(\d{2})(\d{2})\.csv")

# Fetch the webpage with all the file links
response = requests.get(base_url, headers=headers)
if response.status_code != 200:
    print(f"Failed to fetch webpage: {response.status_code}")
    exit()

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, "html.parser")

# Find all CSV links on the page
for link in soup.find_all("a", href=True):
    file_name = link["href"]

    # Ensure it's a CSV file
    if not file_name.endswith(".csv"):
        continue

    # Skip the files in the skip list
    if file_name in skip_files:
        print(f"Skipping {file_name} (in skip list)")
        continue

    # Extract YYYYMMDD from the filename using regex
    match = date_pattern.search(file_name)
    if match:
        year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3))

        # Check if the file's date is within the specified range
        if (start_year < year < end_year) or (year == start_year and month >= start_month) or (year == end_year and month <= end_month):
            file_url = urljoin(base_url, file_name)
            
            # Use os.path.basename to get the filename without leading slashes
            file_name = os.path.basename(file_name)  # Ensure no leading slashes in filename
            file_path = os.path.join(data_dir, file_name)

            # Print debug to verify the correct file path
            print(f"Saving file to: {file_path}")

            # Skip download if the file already exists
            if os.path.exists(file_path):
                print(f"Skipping {file_name} - already exists")
                continue

            # Download the file
            try:
                print(f"Downloading: {file_name}")
                file_response = requests.get(file_url, headers=headers, stream=True)
                file_response.raise_for_status()  # Raise an error if the request fails

                # Save the file to the download directory
                with open(file_path, "wb") as file:
                    for chunk in file_response.iter_content(chunk_size=1024):
                        file.write(chunk)

            except requests.exceptions.RequestException as e:
                print(f"Failed to download {file_name}: {e}")

            time.sleep(1)  # Pause to avoid overwhelming the server

print("Filtered CSV files have been downloaded.")


Saving file to: aemo_data\7_days_High_Impact_Outages_20230116.csv
Skipping 7_days_High_Impact_Outages_20230116.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20210830.csv
Skipping High_Impact_Outages_20210830.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20210906.csv
Skipping High_Impact_Outages_20210906.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20210913.csv
Skipping High_Impact_Outages_20210913.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20210920.csv
Skipping High_Impact_Outages_20210920.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20210927.csv
Skipping High_Impact_Outages_20210927.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20211005.csv
Skipping High_Impact_Outages_20211005.csv - already exists
Saving file to: aemo_data\High_Impact_Outages_20211011.csv
Skipping High_Impact_Outages_20211011.csv - already exists
Saving file to: aemo_data\High_Impact_Outa