In [1]:
import requests
import os
from datetime import datetime, timedelta
import pandas as pd

# Base URL and output directory
base_url = "https://files.airnowtech.org/airnow/"
filtered_dir = "FilteredData1"
os.makedirs(filtered_dir, exist_ok=True)

# Target location and parameters
TARGET_LOCATION = "Los Angeles - N. Mai"
TARGET_PARAMETERS = {"PM2.5", "PM10"}

def download_and_filter_day(date):
    year = date.strftime("%Y")
    date_str = date.strftime("%Y%m%d")
    data_dict = {}

    print(f"\n📆 Processing date: {date.date()}")

    for hour in range(24):
        hour_str = f"{hour:02d}"
        file_name = f"HourlyData_{date_str}{hour_str}.dat"
        file_url = f"{base_url}{year}/{date_str}/{file_name}"

        print(f"⏳ Downloading hour: {hour_str}:00 from {file_name}...")
        response = requests.get(file_url)

        if response.status_code == 200:
            lines = response.text.strip().split('\n')
            found = False

            for line in lines:
                parts = line.strip().split('|')
                if len(parts) >= 9:
                    date_val, time, sitecode, location, timezone, parameter, unit, value, agency = [p.strip() for p in parts]

                    if location == TARGET_LOCATION and parameter.upper() in TARGET_PARAMETERS:
                        key = (date_val, time, sitecode)  # unique per time
                        if key not in data_dict:
                            data_dict[key] = {
                                "Date": date_val,
                                "Time": time,
                                "SiteCode": sitecode,
                                "Location": location,
                                "Timezone": timezone,
                                "Unit": unit,
                                "Agency": agency,
                                "PM2.5": None,
                                "PM10": None
                            }

                        if parameter.upper() == "PM2.5":
                            data_dict[key]["PM2.5"] = float(value)
                        elif parameter.upper() == "PM10":
                            data_dict[key]["PM10"] = float(value)

                        found = True

            if found:
                print(f"✅ Relevant data found for {hour_str}:00")
            else:
                print(f"⚠ No matching records in this hour.")

        elif response.status_code == 404:
            print(f"⛔ File not found: {file_name}")
        else:
            print(f"⚠ Error {response.status_code} while fetching: {file_url}")

    all_data = list(data_dict.values())

    if all_data:
        filtered_file_name = f"Filtered_HourlyData_{date_str}.csv"
        print(f"\n📝 Writing combined data for {date_str} to CSV...")
        df = pd.DataFrame(all_data)
        df.to_csv(os.path.join(filtered_dir, filtered_file_name), index=False)
        print(f"✅ Saved full-day data for {date_str}: {filtered_file_name}")
    else:
        print(f"⚠ No matching data found for {date.date()}")

    return all_data


def calculate_weekly_monthly_averages(all_data):
    all_rows = []
    for daily_data in all_data:
        all_rows.extend(daily_data)
    
    df_all = pd.DataFrame(all_rows)

    if df_all.empty:
        print("⚠ No data to calculate averages.")
        return df_all

    df_all['Datetime'] = pd.to_datetime(df_all['Date'] + ' ' + df_all['Time'])
    df_all['Week'] = df_all['Datetime'].dt.isocalendar().week
    df_all['Month'] = df_all['Datetime'].dt.month

    weekly_avg = df_all.groupby('Week')[['PM2.5', 'PM10']].mean().reset_index()
    monthly_avg = df_all.groupby('Month')[['PM2.5', 'PM10']].mean().reset_index()

    df_all = df_all.merge(weekly_avg, on='Week', suffixes=('', '_Weekly_Avg'))
    df_all = df_all.merge(monthly_avg, on='Month', suffixes=('', '_Monthly_Avg'))

    return df_all

def generate_and_download_files(start_date, end_date):
    all_data = []
    current_date = start_date
    while current_date <= end_date:
        daily_data = download_and_filter_day(current_date)
        all_data.append(daily_data)
        current_date += timedelta(days=1)

    print("\n✅ All daily CSVs created.")

    combined_data = calculate_weekly_monthly_averages(all_data)
    if not combined_data.empty:
        combined_file_name = "Combined_Daily_Data.csv"
        combined_data.to_csv(os.path.join(filtered_dir, combined_file_name), index=False)
        print(f"✅ Combined daily data saved to {combined_file_name}")
    else:
        print("❌ No combined data to save.")

# Example usage:
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)  # change this to a bigger range when testing is done

print(f"📥 Starting filtered download from {start_date.date()} to {end_date.date()}")
generate_and_download_files(start_date, end_date)
print("\n✅ All filtered downloads complete.")


📥 Starting filtered download from 2024-01-01 to 2024-12-31

📆 Processing date: 2024-01-01
⏳ Downloading hour: 00:00 from HourlyData_2024010100.dat...
✅ Relevant data found for 00:00
⏳ Downloading hour: 01:00 from HourlyData_2024010101.dat...
✅ Relevant data found for 01:00
⏳ Downloading hour: 02:00 from HourlyData_2024010102.dat...
✅ Relevant data found for 02:00
⏳ Downloading hour: 03:00 from HourlyData_2024010103.dat...
✅ Relevant data found for 03:00
⏳ Downloading hour: 04:00 from HourlyData_2024010104.dat...
✅ Relevant data found for 04:00
⏳ Downloading hour: 05:00 from HourlyData_2024010105.dat...
✅ Relevant data found for 05:00
⏳ Downloading hour: 06:00 from HourlyData_2024010106.dat...
✅ Relevant data found for 06:00
⏳ Downloading hour: 07:00 from HourlyData_2024010107.dat...
✅ Relevant data found for 07:00
⏳ Downloading hour: 08:00 from HourlyData_2024010108.dat...
✅ Relevant data found for 08:00
⏳ Downloading hour: 09:00 from HourlyData_2024010109.dat...
✅ Relevant data found 

  df_all['Datetime'] = pd.to_datetime(df_all['Date'] + ' ' + df_all['Time'])


✅ Combined daily data saved to Combined_Daily_Data.csv

✅ All filtered downloads complete.
