# Wind Data Loading and Preprocessing - 500hPa (Downsampled)

This notebook loads and preprocesses ERA5 wind data at the 500 hPa pressure level with **10x downsampling** (every 10th grid point is kept along both latitude and longitude; no averaging is applied) for efficient graph-based wind interpolation. The downsampling reduces the graph from ~1M nodes to ~10K nodes while retaining the large-scale spatial patterns.

**Key Features:**
- 10x downsampling in both lat/lon directions (721×1440 → 73×144 grid)
- Consistent node indexing for graph-based methods
- Wind-speed magnitudes normalized to zero mean and unit standard deviation
- Aeolus satellite track for training data
- Sparse adjacency matrix with geodesic edge weights

In [1]:
# =============================================================================
# IMPORT LIBRARIES AND CONFIGURATION
# =============================================================================

from netCDF4 import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from scipy import sparse
from datetime import datetime, timedelta
from skyfield.api import load, EarthSatellite, wgs84, utc

# -----------------------------------------------------------------------------
# Notebook-wide settings; later cells read these names directly.
# -----------------------------------------------------------------------------
DATA_FILE = '../8176c14c59fd8dc32a74a89b926cb7fd.nc'   # NetCDF file opened by the loading cell
PRESSURE_LEVEL = 500     # nominal level in hPa; used in labels and output file names
PRESSURE_INDEX = 0       # index along the pressure_level axis assumed to hold PRESSURE_LEVEL
DOWNSAMPLE_FACTOR = 10   # stride applied along both the latitude and longitude axes

# Echo the settings so the rendered notebook records what was used.
print(
    "Configuration:",
    f"  Pressure Level: {PRESSURE_LEVEL} hPa",
    f"  Downsampling Factor: {DOWNSAMPLE_FACTOR}x",
    "  Expected grid reduction: ~100x fewer nodes",
    sep="\n",
)

Configuration:
  Pressure Level: 500 hPa
  Downsampling Factor: 10x
  Expected grid reduction: ~100x fewer nodes


In [2]:
# =============================================================================
# LOAD NETCDF WIND DATA
# =============================================================================

# Load the NetCDF dataset
dataset = Dataset(DATA_FILE, mode="r")
print("Available variables:", list(dataset.variables.keys()))

# Load coordinate arrays
lat = dataset.variables["latitude"][:]      # shape (721,)
lon = dataset.variables["longitude"][:]     # shape (1440,)

# Verify that PRESSURE_INDEX really selects PRESSURE_LEVEL. Every output of
# this notebook is labelled with PRESSURE_LEVEL, so an unchecked index would
# silently mislabel the data if the file stores its levels in another order.
# NOTE(review): assumes the "pressure_level" coordinate is expressed in hPa,
# like PRESSURE_LEVEL -- confirm against the file's metadata.
pressure_levels = np.asarray(dataset.variables["pressure_level"][:], dtype=float).ravel()
if not -pressure_levels.size <= PRESSURE_INDEX < pressure_levels.size:
    raise IndexError(
        f"PRESSURE_INDEX={PRESSURE_INDEX} is out of range for the "
        f"{pressure_levels.size} pressure level(s) in {DATA_FILE}"
    )
if not np.isclose(pressure_levels[PRESSURE_INDEX], PRESSURE_LEVEL):
    raise ValueError(
        f"PRESSURE_INDEX={PRESSURE_INDEX} selects level "
        f"{pressure_levels[PRESSURE_INDEX]:g}, not {PRESSURE_LEVEL} hPa; "
        f"available levels: {pressure_levels.tolist()}"
    )

# Load wind components (eastward and northward)
# Dimensions: (valid_time=12, pressure_level=3, latitude=721, longitude=1440)
u = dataset.variables["u"][:]   # eastward wind
v = dataset.variables["v"][:]   # northward wind

# Extract wind components at the first time step (index 0) and the
# configured pressure level
u_500 = u[0, PRESSURE_INDEX, :, :]  # shape (721, 1440)
v_500 = v[0, PRESSURE_INDEX, :, :]  # shape (721, 1440)

print(f"\nData shapes:")
print(f"  Latitude: {lat.shape}")
print(f"  Longitude: {lon.shape}")
print(f"  U component: {u_500.shape}")
print(f"  V component: {v_500.shape}")
print(f"  Original grid size: {lat.shape[0]} × {lon.shape[0]} = {lat.shape[0] * lon.shape[0]:,} points")

Available variables: ['number', 'valid_time', 'pressure_level', 'latitude', 'longitude', 'expver', 'u', 'v']

Data shapes:
  Latitude: (721,)
  Longitude: (1440,)
  U component: (721, 1440)
  V component: (721, 1440)
  Original grid size: 721 × 1440 = 1,038,240 points

Data shapes:
  Latitude: (721,)
  Longitude: (1440,)
  U component: (721, 1440)
  V component: (721, 1440)
  Original grid size: 721 × 1440 = 1,038,240 points


In [3]:
# =============================================================================
# APPLY DOWNSAMPLING
# =============================================================================

def downsample_grid_data(lat, lon, u_data, v_data, factor=10):
    """
    Subsample a regular lat/lon grid and its wind fields by striding.

    Every ``factor``-th point is kept along both axes, starting from index 0.
    No averaging or filtering is applied. A short summary is printed.

    Parameters:
    -----------
    lat : array
        Latitude coordinates
    lon : array
        Longitude coordinates
    u_data : array
        U wind component data (lat, lon)
    v_data : array
        V wind component data (lat, lon)
    factor : int
        Downsampling factor (positive integer stride)

    Returns:
    --------
    lat_down, lon_down, u_down, v_down : downsampled arrays

    Raises:
    -------
    ValueError
        If ``factor`` is not a positive integer, or if the leading two
        dimensions of ``u_data`` / ``v_data`` do not match
        ``(len(lat), len(lon))``.
    """
    step = int(factor)
    if step != factor or step < 1:
        raise ValueError(f"factor must be a positive integer, got {factor!r}")

    # A field that disagrees with the coordinate axes would be sliced without
    # complaint and yield data that no longer lines up with lat/lon.
    expected_shape = (len(lat), len(lon))
    for label, field in (("u_data", u_data), ("v_data", v_data)):
        if tuple(np.shape(field)[:2]) != expected_shape:
            raise ValueError(
                f"{label} has shape {np.shape(field)}, expected leading dimensions {expected_shape}"
            )

    # Downsample coordinates
    lat_down = lat[::step]
    lon_down = lon[::step]

    # Downsample data using the same indices
    u_down = u_data[::step, ::step]
    v_down = v_data[::step, ::step]

    def _spacing(axis):
        # Report the magnitude of the spacing: a descending axis (e.g. latitude
        # running north to south) would otherwise print a negative resolution.
        if len(axis) < 2:
            return "n/a"
        return f"{abs(float(axis[1] - axis[0])):.2f}°"

    n_orig = len(lat) * len(lon)
    n_down = len(lat_down) * len(lon_down)
    reduction = f"{n_orig / n_down:.1f}x smaller" if n_down else "n/a"

    print(f"Downsampling Results:")
    print(f"  Original grid: {len(lat)} × {len(lon)} = {n_orig:,} points")
    print(f"  Downsampled grid: {len(lat_down)} × {len(lon_down)} = {n_down:,} points")
    print(f"  Reduction factor: {reduction}")
    print(f"  Resolution: {_spacing(lat_down)} lat × {_spacing(lon_down)} lon")

    return lat_down, lon_down, u_down, v_down

# Run the strided downsampler on the selected wind fields
downsampled = downsample_grid_data(lat, lon, u_500, v_500, factor=DOWNSAMPLE_FACTOR)
lat_processed, lon_processed, u_500_processed, v_500_processed = downsampled

# Wind speed magnitude on the reduced grid
wind_speed_processed = np.sqrt(u_500_processed**2 + v_500_processed**2)

# Persist coordinates, both components and the speed in one .npz archive
np.savez(
    f'wind_data_{PRESSURE_LEVEL}hPa_downsampled.npz',
    lon=lon_processed,
    lat=lat_processed,
    u=u_500_processed,
    v=v_500_processed,
    wind_speed=wind_speed_processed,
)

Downsampling Results:
  Original grid: 721 × 1440 = 1,038,240 points
  Downsampled grid: 73 × 144 = 10,512 points
  Reduction factor: 98.8x smaller
  Resolution: -2.50° lat × 2.50° lon


In [4]:
# =============================================================================
# GENERATE AEOLUS SATELLITE TRACK
# =============================================================================

# Aeolus orbit described by a two-line element set (TLE)
# NOTE(review): the epoch field of line1 (21153.73585495) appears to be
# mid-2021, while the track below is sampled in January 2019 -- confirm that
# propagating this far from the epoch is acceptable for the intended use.
line1 = "1 43600U 18066A   21153.73585495  .00031128  00000-0  12124-3 0  9990"
line2 = "2 43600  96.7150 160.8035 0006915  90.4181 269.7884 15.87015039160910"

ts = load.timescale()
aeolus = EarthSatellite(line1, line2, "AEOLUS", ts)

# One sample per minute across a 24 h window opening 2019-01-01 09:00 UTC;
# both endpoints are included.
start = datetime(2019, 1, 1, 9, tzinfo=utc)
stop = start + timedelta(hours=24)
step = timedelta(minutes=1)
times = [start + k * step for k in range((stop - start) // step + 1)]

# Propagate the orbit and read off the sub-satellite latitude / longitude
geocentric = aeolus.at(ts.from_datetimes(times))
lat_angle, lon_angle = wgs84.latlon_of(geocentric)

# Plain degree arrays; longitude wrapped into [0, 360)
sat_lat = lat_angle.degrees
sat_lon = lon_angle.degrees % 360

raw_track = pd.DataFrame({"time": times, "lat": sat_lat, "lon": sat_lon})

print(
    f"Generated Aeolus track:",
    f"  Duration: 24 hours",
    f"  Sampling: 1 minute intervals",
    f"  Total points: {len(times)}",
    sep="\n",
)

# Snap to downsampled ERA5 grid
def snap_to_grid(lat_val, lon_val, era5_lat, era5_lon):
    """
    Return the grid coordinates closest to a single (lat, lon) point.

    Latitude is matched by plain absolute difference. Longitude is matched by
    circular (mod 360) distance, so a point just west of the 0° meridian
    (e.g. 359°) snaps to 0° rather than to the last grid column.

    Parameters:
    -----------
    lat_val, lon_val : float
        Point to snap, in degrees
    era5_lat, era5_lon : array
        1-D grid coordinate arrays, in degrees

    Returns:
    --------
    (lat, lon) : the matching entries of era5_lat and era5_lon
    """
    i = np.abs(era5_lat - lat_val).argmin()

    # Shortest angular separation around the circle, in [0, 180]
    dlon = np.abs(era5_lon - lon_val) % 360.0
    j = np.minimum(dlon, 360.0 - dlon).argmin()

    return era5_lat[i], era5_lon[j]

# Snap every track sample onto the downsampled grid, preserving track order
snapped = []
for phi, lam in zip(sat_lat, sat_lon):
    snapped.append(snap_to_grid(phi, lam, lat_processed, lon_processed))
snap_lat, snap_lon = zip(*snapped)

snapped_track = pd.DataFrame({"time": times, "lat": snap_lat, "lon": snap_lon})

# Count distinct (lat, lon) grid points visited by the track
unique_locations = set(zip(snap_lat, snap_lon))

print(f"\nSnapped to downsampled grid:")
print(f"  Grid resolution: {len(lat_processed)} × {len(lon_processed)}")
print(f"  Unique training locations: {len(unique_locations)}")

# Show sample
print("\nSample track points:")
print(snapped_track.head())

Generated Aeolus track:
  Duration: 24 hours
  Sampling: 1 minute intervals
  Total points: 1441

Snapped to downsampled grid:
  Grid resolution: 73 × 144
  Unique training locations: 1424

Sample track points:
                       time   lat    lon
0 2019-01-01 09:00:00+00:00  40.0  165.0
1 2019-01-01 09:01:00+00:00  42.5  165.0
2 2019-01-01 09:02:00+00:00  47.5  162.5
3 2019-01-01 09:03:00+00:00  52.5  160.0
4 2019-01-01 09:04:00+00:00  57.5  160.0


In [5]:
# =============================================================================
# BUILD SPHERE GRID GRAPH
# =============================================================================

# Utilities for spherical coordinates and graph construction
def deg2rad(x):
    """Convert an angle (scalar or array-like) from degrees to radians."""
    return np.radians(x)

def sph2cart(lat_deg, lon_deg, r=1.0):
    """
    Map geographic coordinates in degrees to Cartesian (x, y, z) on a sphere
    of radius r. The components are stacked along a new trailing axis.
    """
    phi = np.deg2rad(lat_deg)   # latitude in radians
    lam = np.deg2rad(lon_deg)   # longitude in radians

    # Distance from the polar (z) axis, shared by the x and y components
    rho = r * np.cos(phi)
    components = [rho * np.cos(lam), rho * np.sin(lam), r * np.sin(phi)]
    return np.stack(components, axis=-1)

def great_circle_distance(lat1_deg, lon1_deg, lat2_deg, lon2_deg, R=1.0):
    """
    Great-circle distance using the haversine formula.

    Parameters:
    -----------
    lat1_deg, lon1_deg : float or array
        First point(s), in degrees
    lat2_deg, lon2_deg : float or array
        Second point(s), in degrees
    R : float
        Sphere radius; the result is in the same units as R

    Returns:
    --------
    Distance(s) along the sphere surface, R times the central angle in radians
    """
    lat1, lon1 = np.deg2rad(lat1_deg), np.deg2rad(lon1_deg)
    lat2, lon2 = np.deg2rad(lat2_deg), np.deg2rad(lon2_deg)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2

    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp before the square roots.
    a = np.clip(a, 0.0, 1.0)

    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return R * c

def grid_index(i, j, n_lat, n_lon):
    """Row-major node id of grid cell (i, j); n_lat is accepted but not used."""
    return n_lon * i + j

def inverse_grid_index(node_id, n_lat, n_lon):
    """Recover the (i, j) grid cell of a row-major node id; n_lat is accepted but not used."""
    row, col = divmod(node_id, n_lon)
    return row, col

def build_sphere_grid_graph(lat, lon, connectivity=4, weight="geodesic", radius=1.0):
    """
    Build a mesh-graph over a regular lat-lon grid on the sphere.

    Nodes are numbered row-major (node id = i * len(lon) + j). Longitude is
    treated as periodic, so the last column is linked to the first; latitude
    is not periodic, so the first and last rows only have neighbours on one
    side.

    Parameters:
    -----------
    lat : array
        1-D latitude coordinates (degrees)
    lon : array
        1-D longitude coordinates (degrees)
    connectivity : int
        4 for north/south/east/west neighbours, 8 to also link the four
        diagonal neighbours
    weight : str
        "geodesic" weights each edge by the great-circle distance between its
        endpoints; any other value gives every edge weight 1.0
    radius : float
        Sphere radius used for geodesic edge weights

    Returns:
    -------
    G : networkx.Graph
        Nodes [0..N_lat*N_lon-1] with attributes: lat, lon (degrees), xyz: (x,y,z) on unit sphere
    A : scipy.sparse.csr_matrix
        Weighted adjacency (symmetric)

    Raises:
    -------
    ValueError
        If connectivity is neither 4 nor 8.
    """
    # Neighbour offsets (di, dj) in grid-index space
    nbrs_4 = [(0, 1), (0, -1), (1, 0), (-1, 0)]
    if connectivity == 4:
        offsets = nbrs_4
    elif connectivity == 8:
        offsets = nbrs_4 + [(1, 1), (1, -1), (-1, 1), (-1, -1)]
    else:
        raise ValueError(f"connectivity must be 4 or 8, got {connectivity!r}")

    n_lat = len(lat)
    n_lon = len(lon)
    n_nodes = n_lat * n_lon

    # Precompute per-node attributes; xyz is always on the unit sphere,
    # `radius` only scales the edge weights.
    Lon_grid, Lat_grid = np.meshgrid(lon, lat)
    xyz = sph2cart(Lat_grid.ravel(), Lon_grid.ravel(), r=1.0)

    # Directed edges keyed by (source, target). Keying by the node pair keeps
    # one entry per pair: on very narrow grids the east and west neighbours
    # can be the same node, and duplicate COO entries would be summed.
    edge_weights = {}

    for i in range(n_lat):
        for j in range(n_lon):
            nid = grid_index(i, j, n_lat, n_lon)
            for di, dj in offsets:
                ii = i + di
                if not 0 <= ii < n_lat:
                    continue  # no wrap-around across the first/last latitude row
                jj = (j + dj) % n_lon  # periodic in longitude
                nid2 = grid_index(ii, jj, n_lat, n_lon)

                # Skip self-loops (single-column grid) and repeated pairs
                if nid2 == nid or (nid, nid2) in edge_weights:
                    continue

                if weight == "geodesic":
                    w = great_circle_distance(lat[i], lon[j], lat[ii], lon[jj], R=radius)
                else:
                    w = 1.0

                edge_weights[(nid, nid2)] = w

    rows = [pair[0] for pair in edge_weights]
    cols = [pair[1] for pair in edge_weights]
    data = list(edge_weights.values())

    # Create symmetric adjacency matrix
    A = sparse.coo_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
    A = ((A + A.T) * 0.5).tocsr()

    # Build networkx graph from adjacency
    G = nx.from_scipy_sparse_array(A)

    # Attach node attributes
    node_attrs = {}
    for nid in range(n_nodes):
        i, j = inverse_grid_index(nid, n_lat, n_lon)
        node_attrs[nid] = {
            "lat": float(lat[i]),
            "lon": float(lon[j]),
            "xyz": tuple(xyz[nid]),
        }
    nx.set_node_attributes(G, node_attrs)

    return G, A

def nearest_node_indices_for_track(track_lat, track_lon, lat, lon):
    """
    Given a track of points (lat, lon), return nearest grid (i,j) for each.

    Latitude is matched by absolute difference. Longitude is matched by
    circular (mod 360) distance, so track points just west of the 0° meridian
    map to the first grid column instead of the last one.

    Parameters:
    -----------
    track_lat, track_lon : array-like or scalar
        Track coordinates in degrees
    lat, lon : array-like
        1-D grid coordinate arrays in degrees

    Returns:
    --------
    idx_ij : ndarray, shape (n_points, 2)
        Row (latitude) and column (longitude) index of the nearest grid cell
    node_ids : ndarray, shape (n_points,)
        Row-major node id, i * len(lon) + j
    """
    lat = np.asarray(lat)
    lon = np.asarray(lon)
    # atleast_1d lets a single point be passed as plain scalars
    track_lat = np.atleast_1d(np.asarray(track_lat))
    track_lon = np.atleast_1d(np.asarray(track_lon)) % 360.0

    # nearest latitude by absolute difference
    i_idx = np.abs(track_lat[:, None] - lat[None, :]).argmin(axis=1)

    # nearest longitude by shortest angular separation around the circle
    dlon = np.abs(track_lon[:, None] - lon[None, :]) % 360.0
    j_idx = np.minimum(dlon, 360.0 - dlon).argmin(axis=1)

    node_ids = i_idx * len(lon) + j_idx
    idx_ij = np.stack([i_idx, j_idx], axis=1)
    return idx_ij, node_ids

In [6]:
# Assemble the 4-neighbour, geodesic-weighted mesh on the downsampled grid
G, A = build_sphere_grid_graph(
    lat_processed,
    lon_processed,
    connectivity=4,
    weight="geodesic",
    radius=1.0,
)

# Translate the snapped Aeolus positions into grid (i, j) pairs and node ids
idx_ij, node_ids = nearest_node_indices_for_track(
    snapped_track["lat"].values,
    snapped_track["lon"].values,
    lat_processed,
    lon_processed,
)

# Get 3D coordinates for track visualization (one row per track sample)
xyz_track = np.array([G.nodes[node]["xyz"] for node in node_ids])

print(
    f"Graph Construction Results:",
    f"  Number of nodes: {G.number_of_nodes():,}",
    f"  Number of edges: {G.number_of_edges():,}",
    f"  Adjacency matrix shape: {A.shape}",
    f"  Matrix density: {A.nnz / (A.shape[0] * A.shape[1]):.6f}",
    f"  Aeolus track points: {len(node_ids):,}",
    f"  Unique training nodes: {len(np.unique(node_ids)):,}",
    sep="\n",
)

Graph Construction Results:
  Number of nodes: 10,512
  Number of edges: 20,880
  Adjacency matrix shape: (10512, 10512)
  Matrix density: 0.000378
  Aeolus track points: 1,441
  Unique training nodes: 1,424


In [7]:
# =============================================================================
# PREPARE FINAL DATASETS
# =============================================================================

print("Preparing final datasets for training...")

# Use the downsampled wind data
lat_grid = lat_processed
lon_grid = lon_processed
u_component = u_500_processed
v_component = v_500_processed

# Create adjacency matrix A (already computed above as sparse CSR)
A = A.tocsr()  # Ensure CSR format

# Create node indices (consistent with graph construction)
n_lat, n_lon = len(lat_grid), len(lon_grid)
n_nodes = n_lat * n_lon
X = np.arange(n_nodes)  # Node indices [0, 1, 2, ..., n_nodes-1]

# Fail early if the pieces saved together below do not describe the same grid
if A.shape != (n_nodes, n_nodes):
    raise ValueError(f"Adjacency shape {A.shape} does not match {n_nodes} grid nodes")
for _label, _field in (("u_component", u_component), ("v_component", v_component)):
    if np.shape(_field) != (n_lat, n_lon):
        raise ValueError(f"{_label} has shape {np.shape(_field)}, expected {(n_lat, n_lon)}")

# Flatten the (lat, lon) fields in row-major order so that entry
# node_id = i * n_lon + j matches the node numbering used by the graph.
# Masked entries (if any) become NaN instead of exposing the raw fill value.
u_flat_consistent = np.ma.filled(np.ma.asarray(u_component, dtype=np.float64), np.nan).reshape(-1)
v_flat_consistent = np.ma.filled(np.ma.asarray(v_component, dtype=np.float64), np.nan).reshape(-1)

# Wind speed targets with the same node indexing (computed in float64)
y = np.sqrt(u_flat_consistent**2 + v_flat_consistent**2)

# Coordinate lookup: column 0 = latitude, column 1 = longitude of each node
coord_mapping = np.column_stack([
    np.repeat(np.asarray(lat_grid, dtype=np.float64), n_lon),
    np.tile(np.asarray(lon_grid, dtype=np.float64), n_lat),
])

# Apply normalization (zero mean, unit variance)
# NOTE(review): mean/std are taken over all nodes, not only the training
# nodes -- confirm this is intended for the downstream evaluation.
y_raw = y.copy()  # Keep original for reference
y_mean = np.mean(y)
y_std = np.std(y)
if not np.isfinite(y_std) or y_std == 0:
    # Dividing by a zero/NaN std would write an all-NaN/inf target vector
    raise ValueError(f"Cannot normalize wind speeds: std={y_std}")
y = (y - y_mean) / y_std  # Normalize

print(f"Wind speed normalization:")
print(f"  Original: mean={y_mean:.3f} m/s, std={y_std:.3f} m/s")
print(f"  Normalized: mean={np.mean(y):.6f}, std={np.std(y):.6f}")
print(f"  Range: [{np.min(y):.3f}, {np.max(y):.3f}]")

# Create training locations (Aeolus satellite track node indices)
unique_train_nodes = np.unique(node_ids)
X_train = unique_train_nodes  # Training node indices
y_train = y[X_train]  # Corresponding normalized wind speeds

print(f"\nDataset Statistics:")
print(f"  Total nodes: {len(X):,}")
print(f"  Training nodes: {len(X_train):,}")
print(f"  Training coverage: {len(X_train)/len(X)*100:.3f}%")

# Save the prepared datasets; the CSR matrix is stored as its raw
# data / indices / indptr / shape components
np.savez(f'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz',
         A_data=A.data,
         A_indices=A.indices,
         A_indptr=A.indptr,
         A_shape=A.shape,
         X=X,
         y=y,  # Normalized wind speeds
         y_mean=y_mean,  # Normalization parameters
         y_std=y_std,
         X_train=X_train,
         y_train=y_train,
         coord_mapping=coord_mapping,
         u_component=u_flat_consistent,
         v_component=v_flat_consistent,
         downsample_factor=DOWNSAMPLE_FACTOR,
         pressure_level=PRESSURE_LEVEL)

print(f"\n✅ All datasets saved to 'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz'")
print(f"✅ Downsampling: {DOWNSAMPLE_FACTOR}x reduction")
print(f"✅ Grid size: {n_lat} × {n_lon} = {n_lat * n_lon:,} nodes")
print(f"✅ Training points: {len(X_train):,}")
print(f"✅ Data normalized: mean=0, std=1")

Preparing final datasets for training...
Wind speed normalization:
  Original: mean=4.096 m/s, std=2.860 m/s
  Normalized: mean=-0.000000, std=1.000000
  Range: [-1.432, 3.164]

Dataset Statistics:
  Total nodes: 10,512
  Training nodes: 1,424
  Training coverage: 13.546%

✅ All datasets saved to 'wind_data_processed_500hPa_wide.npz'
✅ Downsampling: 10x reduction
✅ Grid size: 73 × 144 = 10,512 nodes
✅ Training points: 1,424
✅ Data normalized: mean=0, std=1

✅ All datasets saved to 'wind_data_processed_500hPa_wide.npz'
✅ Downsampling: 10x reduction
✅ Grid size: 73 × 144 = 10,512 nodes
✅ Training points: 1,424
✅ Data normalized: mean=0, std=1
