# AQI and Weather Data Collection

This notebook fetches historical air pollution data from the OpenWeatherMap API and hourly weather data from the Open-Meteo API for a specific location and time range, then saves the combined result to a CSV file.

## Import Required Libraries

In [5]:
import requests
import csv
from datetime import datetime

## Configuration

Load the OpenWeatherMap API key from the environment (a `.env` file), and set the location coordinates (Karachi), the date range for data collection, and the output file name.

In [6]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# --- CONFIGURATION ---
# OpenWeatherMap API key, read from the environment so it is never hard-coded
# in the notebook. Fail fast with a clear message when it is missing or empty.
API_KEY = os.getenv("OPENWEATHER_API_KEY")
if not API_KEY:
    raise ValueError("OPENWEATHER_API_KEY not found in environment variables. Please set it in your .env file.")

# Latitude/longitude sent to both APIs (Karachi, per the notebook text).
LAT = 24.8607
LON = 67.0011
# Start and end of the collection window; get_data() parses these with the
# format "%Y-%m-%d %H:%M:%S" and also uses their date part for Open-Meteo.
START_DATE = "2026-01-26 00:00:00"
END_DATE =   "2026-02-04 23:00:00"
# Path of the CSV file written by get_data().
FILENAME = "hourly_aqi_weather_data.csv"


## Data Fetching Function

This function fetches air pollution history from OpenWeatherMap and hourly weather data from Open-Meteo (free, no API key needed), joins the two on the hourly Unix timestamp, and writes the combined dataset to a CSV file.

In [7]:
def _fetch_json(url, params, label):
    """Send a GET request and return the decoded JSON body.

    A timeout is applied so a stalled connection cannot hang the notebook
    forever. A non-success HTTP status is reported on stdout, but the body is
    still decoded and returned so the caller's "missing key means no data"
    handling keeps working.

    Args:
        url: Endpoint to query.
        params: Query-string parameters for the request.
        label: Short human-readable name of the data source, used in the
            error message.

    Returns:
        The response body decoded from JSON.
    """
    response = requests.get(url, params=params, timeout=30)
    if not response.ok:
        print(f"[ERROR] {label} request failed with HTTP {response.status_code}: {response.reason}")
    return response.json()


def get_data():
    """Fetch hourly air pollution and weather data and save them to a CSV file.

    Steps:
      1. Query OpenWeatherMap's air pollution history for LAT/LON between
         START_DATE and END_DATE.
      2. Query Open-Meteo's archive API for hourly weather over the same dates.
      3. Join the two on the Unix timestamp of each pollution record; weather
         columns are None when no weather record exists for that timestamp.
      4. Write the rows to FILENAME, or print "No data found." when there are
         no pollution records.

    Progress and timing information is printed to stdout. Returns None.

    Raises:
        ValueError: If START_DATE or END_DATE does not match
            "%Y-%m-%d %H:%M:%S".
        KeyError: If a pollution record or the weather payload lacks an
            expected field.
        requests.exceptions.RequestException: On connection failures or
            timeouts.
    """
    import time
    start_time = time.time()

    # Output column name -> Open-Meteo hourly variable name. This one table
    # drives the request, the index and the CSV row so they cannot drift apart.
    weather_fields = {
        'temp_c': 'temperature_2m',
        'humidity_pct': 'relative_humidity_2m',
        'pressure_hpa': 'pressure_msl',
        'wind_speed_kmh': 'wind_speed_10m',
        'wind_dir_deg': 'wind_direction_10m',
        'rain_mm': 'rain',
        'solar_rad_wm2': 'direct_radiation',
    }
    pollutant_names = ('pm2_5', 'pm10', 'co', 'no2', 'so2', 'o3')

    # Convert dates to Unix timestamps.
    # NOTE: the date strings are naive, so .timestamp() interprets them in the
    # time zone of the machine running the notebook.
    start_ts = int(datetime.strptime(START_DATE, "%Y-%m-%d %H:%M:%S").timestamp())
    end_ts = int(datetime.strptime(END_DATE, "%Y-%m-%d %H:%M:%S").timestamp())
    print(f"[DEBUG] Time range: {START_DATE} to {END_DATE}")
    print(f"[DEBUG] Unix timestamps: {start_ts} to {end_ts}")

    # 1. Fetch Air Pollution History
    print("[DEBUG] Fetching air pollution history...")
    pollution_start = time.time()
    # HTTPS, because the API key travels in the query string.
    pollution_url = "https://api.openweathermap.org/data/2.5/air_pollution/history"
    p_params = {'lat': LAT, 'lon': LON, 'start': start_ts, 'end': end_ts, 'appid': API_KEY}
    p_res = _fetch_json(pollution_url, p_params, "Air pollution")
    pollution_list = p_res.get('list', [])
    print(f"[DEBUG] Pollution data fetched in {time.time() - pollution_start:.2f}s")
    print(f"[DEBUG] Total pollution records: {len(pollution_list)}")

    # 2. Fetch Weather History from Open-Meteo (free API, no key required)
    print("[DEBUG] Fetching weather data from Open-Meteo...")
    weather_start = time.time()

    # Open-Meteo API endpoint for historical weather data
    weather_url = "https://archive-api.open-meteo.com/v1/archive"

    weather_params = {
        'latitude': LAT,
        'longitude': LON,
        # Open-Meteo expects plain dates (YYYY-MM-DD).
        'start_date': START_DATE.split()[0],
        'end_date': END_DATE.split()[0],
        'hourly': ','.join(weather_fields.values()),
        'timezone': 'auto',
        # Ask for epoch seconds instead of location-local ISO strings.
        # Parsing offset-less local strings with the machine's time zone would
        # shift the join key whenever the machine is not in the location's
        # time zone, silently pairing pollution rows with the wrong hour.
        'timeformat': 'unixtime',
    }

    weather_response = _fetch_json(weather_url, weather_params, "Weather")
    print(f"[DEBUG] Weather data fetched in {time.time() - weather_start:.2f}s")

    # Create a dictionary mapping Unix timestamps to weather data
    weather_data = {}
    if 'hourly' in weather_response:
        hourly = weather_response['hourly']
        for i, epoch in enumerate(hourly['time']):
            weather_data[int(epoch)] = {
                column: hourly[source][i]
                for column, source in weather_fields.items()
            }

    print(f"[DEBUG] Weather data indexed: {len(weather_data)} hourly records")

    # 3. Combine pollution and weather data
    combined_data = []
    print("[DEBUG] Combining pollution and weather data...")

    for idx, p in enumerate(pollution_list):
        ts = p['dt']
        # Calendar columns are expressed in the machine's local time zone.
        dt_obj = datetime.fromtimestamp(ts)

        if idx % 100 == 0:  # Print progress every 100 records
            print(f"[DEBUG] Processing record {idx+1}/{len(pollution_list)} ({dt_obj})")

        # Weather for this exact hour; empty dict -> None in every weather column
        weather = weather_data.get(ts, {})
        components = p['components']

        # NOTE(review): OpenWeatherMap appears to document main.aqi as its own
        # 1-5 index rather than the US EPA scale - confirm before relying on
        # the "epa_aqi" column name downstream.
        row = {
            "timestamp": ts,
            "year": dt_obj.year,
            "month": dt_obj.month,
            "day": dt_obj.day,
            "hour": dt_obj.hour,
            "epa_aqi": p['main']['aqi'],
            **{name: components[name] for name in pollutant_names},
            **{column: weather.get(column) for column in weather_fields},
        }
        combined_data.append(row)

    print("[DEBUG] Data combination complete. Writing to CSV...")

    # 4. Write to CSV
    if combined_data:
        csv_start = time.time()
        keys = combined_data[0].keys()
        # Explicit encoding keeps the file identical across platforms.
        with open(FILENAME, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(combined_data)
        print(f"[DEBUG] CSV write completed in {time.time() - csv_start:.2f}s")
        print(f"Success! Data saved to {FILENAME}")
        print(f"[DEBUG] Total records written: {len(combined_data)}")
    else:
        print("No data found.")

    total_time = time.time() - start_time
    print(f"[DEBUG] Total execution time: {total_time:.2f}s ({total_time/60:.2f} minutes)")

## Run Data Collection

Execute the function to fetch and save the data.

In [8]:
# Run the fetch/merge/export pipeline; it prints progress and writes FILENAME
# when at least one pollution record is returned.
get_data()

[DEBUG] Time range: 2026-01-26 00:00:00 to 2026-02-04 23:00:00
[DEBUG] Unix timestamps: 1769367600 to 1770228000
[DEBUG] Fetching air pollution history...
[DEBUG] Pollution data fetched in 1.58s
[DEBUG] Total pollution records: 216
[DEBUG] Fetching weather data from Open-Meteo...
[DEBUG] Weather data fetched in 0.81s
[DEBUG] Weather data indexed: 240 hourly records
[DEBUG] Combining pollution and weather data...
[DEBUG] Processing record 1/216 (2026-01-26 00:00:00)
[DEBUG] Processing record 101/216 (2026-01-31 04:00:00)
[DEBUG] Processing record 201/216 (2026-02-04 08:00:00)
[DEBUG] Data combination complete. Writing to CSV...
[DEBUG] CSV write completed in 0.00s
Success! Data saved to hourly_aqi_weather_data.csv
[DEBUG] Total records written: 216
[DEBUG] Total execution time: 2.40s (0.04 minutes)
