# Data Acquisition: Open-Meteo Historical Weather API

This notebook downloads historical weather data from Open-Meteo API for rapid prototyping and pipeline validation.

## Objectives
1. Download historical weather data for selected stations
2. Validate data pipeline before scaling to NOAA ISD
3. Test data loading and preprocessing functions
4. Create sample dataset for development

In [1]:
# Imports
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import json

# Setup project path - this adds project root to Python path
# First, we need to temporarily add a path to import the utility
# We'll use a simple approach that works from any notebook location
current_dir = Path(os.getcwd()).resolve()

# Find project root by looking for src/ and notebooks/ directories
if (current_dir / 'src').exists() and (current_dir / 'notebooks').exists():
    project_root = current_dir
elif current_dir.name in ['01_data_acquisition', '02_data_preprocessing', '03_baselines', 
                           '04_gnn_models', '05_training', '06_analysis', '07_evaluation', '08_documentation']:
    project_root = current_dir.parent.parent
elif current_dir.name == 'notebooks':
    project_root = current_dir.parent
else:
    # Walk up to find project root
    for parent in current_dir.parents:
        if (parent / 'src').exists() and (parent / 'notebooks').exists():
            project_root = parent
            break
    else:
        project_root = current_dir

# Add project root to Python path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Now we can import from src
from src.utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

print(f"Project root: {project_root}")
print(f"Raw data directory: {RAW_DATA_DIR}")
print(f"Processed data directory: {PROCESSED_DATA_DIR}")

Project root: C:\Users\Kata\Desktop\earth-sgnn
Raw data directory: C:\Users\Kata\Desktop\earth-sgnn\data\raw
Processed data directory: C:\Users\Kata\Desktop\earth-sgnn\data\processed


## 1. Define Station Locations

We'll start with a small set of stations for prototyping. These are major cities with good data coverage.

In [2]:
# Sample stations for prototyping (major cities)
STATIONS = {
    'New York': {'lat': 40.7128, 'lon': -74.0060, 'elevation': 10},
    'Los Angeles': {'lat': 34.0522, 'lon': -118.2437, 'elevation': 71},
    'Chicago': {'lat': 41.8781, 'lon': -87.6298, 'elevation': 182},
    'Houston': {'lat': 29.7604, 'lon': -95.3698, 'elevation': 13},
    'Phoenix': {'lat': 33.4484, 'lon': -112.0740, 'elevation': 331},
    'Philadelphia': {'lat': 39.9526, 'lon': -75.1652, 'elevation': 12},
    'San Antonio': {'lat': 29.4241, 'lon': -98.4936, 'elevation': 198},
    'San Diego': {'lat': 32.7157, 'lon': -117.1611, 'elevation': 19},
    'Dallas': {'lat': 32.7767, 'lon': -96.7970, 'elevation': 131},
    'San Jose': {'lat': 37.3382, 'lon': -121.8863, 'elevation': 26}
}

print(f"Number of stations: {len(STATIONS)}")
print("\nStation locations:")
for name, info in STATIONS.items():
    print(f"  {name}: ({info['lat']:.4f}, {info['lon']:.4f}), elevation: {info['elevation']}m")

Number of stations: 10

Station locations:
  New York: (40.7128, -74.0060), elevation: 10m
  Los Angeles: (34.0522, -118.2437), elevation: 71m
  Chicago: (41.8781, -87.6298), elevation: 182m
  Houston: (29.7604, -95.3698), elevation: 13m
  Phoenix: (33.4484, -112.0740), elevation: 331m
  Philadelphia: (39.9526, -75.1652), elevation: 12m
  San Antonio: (29.4241, -98.4936), elevation: 198m
  San Diego: (32.7157, -117.1611), elevation: 19m
  Dallas: (32.7767, -96.7970), elevation: 131m
  San Jose: (37.3382, -121.8863), elevation: 26m


## 2. Define Date Range

We'll download data for the past 2 years for prototyping.

In [3]:
# Date range for data download
# Note: Open-Meteo archive API typically has data up to a few days ago
# We'll use yesterday as end date to avoid future date issues
END_DATE = datetime.now() - timedelta(days=1)  # Yesterday (to ensure data is available)
START_DATE = END_DATE - timedelta(days=730)  # 2 years of data

print(f"Start date: {START_DATE.strftime('%Y-%m-%d')}")
print(f"End date: {END_DATE.strftime('%Y-%m-%d')}")
print(f"Total days: {(END_DATE - START_DATE).days}")

Start date: 2024-02-05
End date: 2026-02-04
Total days: 730


## 3. Open-Meteo API Function

Function to download historical weather data from Open-Meteo.

In [4]:
def download_openmeteo_data(latitude, longitude, start_date, end_date, station_name=None):
    """
    Download historical weather data from Open-Meteo API.
    
    Parameters:
    -----------
    latitude : float
        Latitude of the location
    longitude : float
        Longitude of the location
    start_date : datetime
        Start date for data
    end_date : datetime
        End date for data
    station_name : str, optional
        Name of the station (for logging)
    
    Returns:
    --------
    pd.DataFrame
        DataFrame with weather data
    """
    
    # Open-Meteo API endpoint
    url = "https://archive-api.open-meteo.com/v1/archive"
    
    # Parameters
    # Note: hourly must be a comma-separated string, not a list
    hourly_variables = [
        "temperature_2m",
        "relative_humidity_2m",
        "dewpoint_2m",
        "wind_speed_10m",
        "wind_direction_10m",
        "surface_pressure"
    ]
    
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": end_date.strftime("%Y-%m-%d"),
        "hourly": ",".join(hourly_variables),  # Join list into comma-separated string
        "timezone": "UTC"
    }
    
    response = None
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        # Check if hourly data exists
        if 'hourly' not in data:
            print(f"Warning: No hourly data in response for {station_name or f'({latitude}, {longitude})'}")
            return None
        
        # Convert to DataFrame
        df = pd.DataFrame(data['hourly'])
        
        if len(df) == 0:
            print(f"Warning: Empty data for {station_name or f'({latitude}, {longitude})'}")
            return None
        
        # Convert time to datetime
        df['time'] = pd.to_datetime(df['time'])
        
        # Add station metadata
        df['latitude'] = latitude
        df['longitude'] = longitude
        if station_name:
            df['station_name'] = station_name
        
        # Rename columns for consistency
        column_mapping = {
            'time': 'timestamp',
            'temperature_2m': 'temperature_2m',
            'relative_humidity_2m': 'relative_humidity_2m',
            'dewpoint_2m': 'dewpoint_2m',
            'wind_speed_10m': 'wind_speed_10m',
            'wind_direction_10m': 'wind_direction_10m',
            'surface_pressure': 'surface_pressure'
        }
        df = df.rename(columns=column_mapping)
        
        return df
        
    except requests.exceptions.HTTPError as e:
        # Try to get error details from response
        error_msg = str(e)
        response_url = ""
        try:
            if e.response is not None:
                if hasattr(e.response, 'json'):
                    try:
                        error_data = e.response.json()
                        error_msg = error_data.get('reason', error_data.get('error', str(e)))
                    except:
                        pass
                if hasattr(e.response, 'url'):
                    response_url = e.response.url
        except:
            pass
        print(f"HTTP Error for {station_name or f'({latitude}, {longitude})'}: {error_msg}")
        if response_url:
            print(f"  URL: {response_url}")
        elif response and hasattr(response, 'url'):
            print(f"  URL: {response.url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Request error for {station_name or f'({latitude}, {longitude})'}: {e}")
        return None
    except KeyError as e:
        print(f"Error parsing response for {station_name or f'({latitude}, {longitude})'}: {e}")
        print(f"  Response keys: {list(data.keys()) if 'data' in locals() else 'N/A'}")
        return None
    except Exception as e:
        print(f"Unexpected error for {station_name or f'({latitude}, {longitude})'}: {e}")
        return None

## 4. Download Data for All Stations

Download data for all stations with progress tracking.

In [5]:
# Download data for all stations
all_data = []
failed_stations = []

print("Downloading data from Open-Meteo API...")
print(f"Total stations: {len(STATIONS)}")
print("-" * 50)

for station_name, station_info in tqdm(STATIONS.items(), desc="Stations"):
    lat = station_info['lat']
    lon = station_info['lon']
    
    df = download_openmeteo_data(
        latitude=lat,
        longitude=lon,
        start_date=START_DATE,
        end_date=END_DATE,
        station_name=station_name
    )
    
    if df is not None and len(df) > 0:
        all_data.append(df)
        print(f"✓ {station_name}: {len(df)} records")
    else:
        failed_stations.append(station_name)
        print(f"✗ {station_name}: Failed to download")
    
    # Rate limiting: be respectful to the API
    time.sleep(0.5)  # 0.5 second delay between requests

print("-" * 50)
print(f"Successfully downloaded: {len(all_data)}/{len(STATIONS)} stations")
if failed_stations:
    print(f"Failed stations: {failed_stations}")

Downloading data from Open-Meteo API...
Total stations: 10
--------------------------------------------------


Stations:   0%|          | 0/10 [00:00<?, ?it/s]

✓ New York: 17544 records


Stations:  10%|█         | 1/10 [00:02<00:18,  2.07s/it]

✓ Los Angeles: 17544 records


Stations:  20%|██        | 2/10 [00:04<00:16,  2.03s/it]

✓ Chicago: 17544 records


Stations:  30%|███       | 3/10 [00:06<00:14,  2.02s/it]

✓ Houston: 17544 records


Stations:  40%|████      | 4/10 [00:07<00:11,  1.98s/it]

✓ Phoenix: 17544 records


Stations:  50%|█████     | 5/10 [00:09<00:09,  1.94s/it]

✓ Philadelphia: 17544 records


Stations:  60%|██████    | 6/10 [00:14<00:10,  2.74s/it]

✓ San Antonio: 17544 records


Stations:  70%|███████   | 7/10 [00:16<00:07,  2.49s/it]

✓ San Diego: 17544 records


Stations:  80%|████████  | 8/10 [00:18<00:04,  2.33s/it]

✓ Dallas: 17544 records


Stations:  90%|█████████ | 9/10 [00:20<00:02,  2.23s/it]

✓ San Jose: 17544 records


Stations: 100%|██████████| 10/10 [00:22<00:00,  2.20s/it]

--------------------------------------------------
Successfully downloaded: 10/10 stations





## 5. Combine and Validate Data

Combine all station data and perform basic validation.

In [6]:
# Combine all data
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    
    print(f"Total records: {len(combined_df):,}")
    print(f"Date range: {combined_df['timestamp'].min()} to {combined_df['timestamp'].max()}")
    print(f"Number of stations: {combined_df['station_name'].nunique()}")
    print(f"\nStations: {', '.join(combined_df['station_name'].unique())}")
    
    print("\n" + "=" * 50)
    print("Data Summary")
    print("=" * 50)
    print(combined_df.info())
    
    print("\n" + "=" * 50)
    print("Missing Values")
    print("=" * 50)
    missing = combined_df.isnull().sum()
    missing_pct = (missing / len(combined_df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing Percentage': missing_pct
    })
    print(missing_df[missing_df['Missing Count'] > 0])
    
    print("\n" + "=" * 50)
    print("Statistical Summary")
    print("=" * 50)
    print(combined_df.describe())
else:
    print("No data downloaded. Please check the API connection and station locations.")

Total records: 175,440
Date range: 2024-02-05 00:00:00 to 2026-02-04 23:00:00
Number of stations: 10

Stations: New York, Los Angeles, Chicago, Houston, Phoenix, Philadelphia, San Antonio, San Diego, Dallas, San Jose

Data Summary
<class 'pandas.DataFrame'>
RangeIndex: 175440 entries, 0 to 175439
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   timestamp             175440 non-null  datetime64[us]
 1   temperature_2m        175440 non-null  float64       
 2   relative_humidity_2m  175440 non-null  int64         
 3   dewpoint_2m           175440 non-null  float64       
 4   wind_speed_10m        175440 non-null  float64       
 5   wind_direction_10m    175440 non-null  int64         
 6   surface_pressure      175440 non-null  float64       
 7   latitude              175440 non-null  float64       
 8   longitude             175440 non-null  float64       
 9   station_name   

## 6. Save Data

Save the downloaded data for future use.

In [7]:
if all_data:
    # Save raw data
    raw_data_file = RAW_DATA_DIR / "openmeteo_raw_data.parquet"
    combined_df.to_parquet(raw_data_file, index=False)
    print(f"Raw data saved to: {raw_data_file}")
    
    # Also save as CSV for easy inspection
    csv_file = RAW_DATA_DIR / "openmeteo_raw_data.csv"
    combined_df.to_csv(csv_file, index=False)
    print(f"CSV backup saved to: {csv_file}")
    
    # Save station metadata
    station_metadata = []
    for name, info in STATIONS.items():
        if name in combined_df['station_name'].values:
            station_metadata.append({
                'station_name': name,
                'latitude': info['lat'],
                'longitude': info['lon'],
                'elevation': info['elevation']
            })
    
    metadata_df = pd.DataFrame(station_metadata)
    metadata_file = RAW_DATA_DIR / "openmeteo_station_metadata.csv"
    metadata_df.to_csv(metadata_file, index=False)
    print(f"Station metadata saved to: {metadata_file}")
    
    print("\n✓ Data download complete!")

Raw data saved to: C:\Users\Kata\Desktop\earth-sgnn\data\raw\openmeteo_raw_data.parquet
CSV backup saved to: C:\Users\Kata\Desktop\earth-sgnn\data\raw\openmeteo_raw_data.csv
Station metadata saved to: C:\Users\Kata\Desktop\earth-sgnn\data\raw\openmeteo_station_metadata.csv

✓ Data download complete!
