# Wind Data Loading and Preprocessing - 1000hPa (Downsampled)

This notebook loads and preprocesses ERA5 wind data at 1000hPa pressure level with **10x downsampling** for efficient graph-based wind interpolation. The downsampling reduces computational load from ~1M nodes to ~10K nodes while maintaining essential spatial patterns.

**Key Features:**
- 10x downsampling in both lat/lon directions (721×1440 → 73×144 grid)
- Consistent node indexing for graph-based methods
- Normalized wind speeds (mean=0, std=1) 
- Aeolus satellite track for training data
- Sparse adjacency matrix with geodesic (great-circle) edge weights, computed on the unit sphere (i.e. in radians)

In [1]:
# =============================================================================
# IMPORT LIBRARIES AND CONFIGURATION
# =============================================================================

from netCDF4 import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from scipy import sparse
from datetime import datetime, timedelta
from skyfield.api import load, EarthSatellite, wgs84, utc

# --- Tunable settings --------------------------------------------------------
# Location of the input NetCDF file (relative path).
DATA_FILE = '../8176c14c59fd8dc32a74a89b926cb7fd.nc'
# Stride applied along both the latitude and the longitude axis.
DOWNSAMPLE_FACTOR = 10
# Pressure level of interest, in hPa ...
PRESSURE_LEVEL = 1000
# ... and the position of that level on the dataset's pressure axis.
PRESSURE_INDEX = 2

# Echo the settings so the rendered notebook records what was run.
print(
    "\n".join(
        [
            "Configuration:",
            f"  Pressure Level: {PRESSURE_LEVEL} hPa",
            f"  Downsampling Factor: {DOWNSAMPLE_FACTOR}x",
            "  Expected grid reduction: ~100x fewer nodes",
        ]
    )
)

Configuration:
  Pressure Level: 1000 hPa
  Downsampling Factor: 10x
  Expected grid reduction: ~100x fewer nodes


In [2]:
# =============================================================================
# LOAD NETCDF DATA, DOWNSAMPLE, GENERATE TRACK, BUILD GRAPH & PREPARE DATASETS
# =============================================================================

# Load NetCDF dataset.
# The context manager closes the file handle once the arrays below have been
# read into memory; `dataset` must not be used for further reads afterwards.
with Dataset(DATA_FILE, mode="r") as dataset:
    print("Available variables:", list(dataset.variables.keys()))

    # Load coordinates and wind data ([:] reads each variable into an array)
    lat = dataset.variables["latitude"][:]
    lon = dataset.variables["longitude"][:]
    u = dataset.variables["u"][:]
    v = dataset.variables["v"][:]

    # Sanity-check that PRESSURE_INDEX really selects PRESSURE_LEVEL.
    # NOTE(review): assumes the `pressure_level` axis is expressed in hPa — confirm
    # against the variable's units attribute. Only a warning is printed so that a
    # unit mismatch cannot abort the run.
    if "pressure_level" in dataset.variables:
        level_at_index = float(dataset.variables["pressure_level"][PRESSURE_INDEX])
        if not np.isclose(level_at_index, PRESSURE_LEVEL):
            print(
                f"WARNING: PRESSURE_INDEX={PRESSURE_INDEX} selects level "
                f"{level_at_index:g}, but PRESSURE_LEVEL={PRESSURE_LEVEL}"
            )

# Extract the wind components at the configured level index
# (index 0 along the leading axis — presumably the first valid_time; verify).
u_1000 = u[0, PRESSURE_INDEX, :, :]
v_1000 = v[0, PRESSURE_INDEX, :, :]

print(f"Original data: {lat.shape[0]}×{lon.shape[0]} = {lat.shape[0]*lon.shape[0]:,} points")

# Downsample by keeping every DOWNSAMPLE_FACTOR-th sample along both axes
lat_processed = lat[::DOWNSAMPLE_FACTOR]
lon_processed = lon[::DOWNSAMPLE_FACTOR]
u_1000_processed = u_1000[::DOWNSAMPLE_FACTOR, ::DOWNSAMPLE_FACTOR]
v_1000_processed = v_1000[::DOWNSAMPLE_FACTOR, ::DOWNSAMPLE_FACTOR]

print(f"Downsampled: {lat_processed.shape[0]}×{lon_processed.shape[0]} = {len(lat_processed)*len(lon_processed):,} points")
print(f"Reduction: {(len(lat)*len(lon))/(len(lat_processed)*len(lon_processed)):.1f}x smaller")
# abs(): the latitude axis can be stored in descending order, which would
# otherwise make the reported grid spacing negative.
print(f"Resolution: {abs(lat_processed[1]-lat_processed[0]):.2f}° × {abs(lon_processed[1]-lon_processed[0]):.2f}°")

# Aeolus orbit, described by a two-line element set (TLE).
# NOTE(review): the TLE epoch field "21153.73585495" denotes day 153 of 2021,
# whereas the track below is propagated for 2019-01-01 — confirm that this gap
# between element epoch and propagation date is acceptable for the use case.
line1 = "1 43600U 18066A   21153.73585495  .00031128  00000-0  12124-3 0  9990"
line2 = "2 43600  96.7150 160.8035 0006915  90.4181 269.7884 15.87015039160910"

ts = load.timescale()
aeolus = EarthSatellite(line1, line2, name="AEOLUS", ts=ts)

# One sample per minute over a full day (24 * 60 = 1440 samples),
# starting at 2019-01-01 09:00 UTC.
start = datetime(2019, 1, 1, 9, tzinfo=utc)
times = [start + timedelta(minutes=minute) for minute in range(24 * 60)]

# Propagate the orbit, then express each position as WGS84 latitude/longitude
geocentric = aeolus.at(ts.from_datetimes(times))
sat_lat, sat_lon = (angle.degrees for angle in wgs84.latlon_of(geocentric))
# Fold longitudes into [0, 360) before they are compared with the grid axis
sat_lon = sat_lon % 360

# Snap satellite track to downsampled grid
def snap_to_grid(lat_val, lon_val, era5_lat, era5_lon):
    """Return the grid coordinates closest to a single (lat, lon) point.

    Parameters
    ----------
    lat_val, lon_val : float
        Query position in degrees.
    era5_lat, era5_lon : array-like supporting arithmetic and indexing
        1-D latitude and longitude axes of the grid, in degrees.

    Returns
    -------
    tuple
        ``(era5_lat[i], era5_lon[j])`` for the nearest latitude index ``i``
        and the nearest longitude index ``j``.

    Notes
    -----
    Longitude separation is measured around the circle, so a query at 359.5°
    snaps to a grid column at 0° rather than to the last column of the axis.
    """
    i = np.abs(era5_lat - lat_val).argmin()
    # Longitude is periodic: take the shorter of the two ways round the circle
    dlon = np.abs(era5_lon - lon_val) % 360.0
    j = np.minimum(dlon, 360.0 - dlon).argmin()
    return era5_lat[i], era5_lon[j]

# Snap every track sample to its nearest grid coordinate pair
snapped = [
    snap_to_grid(point_lat, point_lon, lat_processed, lon_processed)
    for point_lat, point_lon in zip(sat_lat, sat_lon)
]
# Split the list of (lat, lon) pairs into two parallel tuples
snap_lat = tuple(pair[0] for pair in snapped)
snap_lon = tuple(pair[1] for pair in snapped)
snapped_track = pd.DataFrame({"time": times, "lat": snap_lat, "lon": snap_lon})

# Several consecutive samples can land on the same grid cell, hence the set()
print(f"Satellite track: {len(times)} points → {len(set(zip(snap_lat, snap_lon)))} unique grid locations")

# Graph construction utilities
def deg2rad(x):
    """Convert an angle (scalar or array) from degrees to radians."""
    in_radians = np.radians(x)
    return in_radians

def great_circle_distance(lat1_deg, lon1_deg, lat2_deg, lon2_deg, R=1.0):
    """Great-circle distance between two points on a sphere (haversine formula).

    Parameters
    ----------
    lat1_deg, lon1_deg, lat2_deg, lon2_deg : float or array-like
        Coordinates of the two points in degrees; arrays are broadcast.
    R : float, default 1.0
        Sphere radius. With the default the result is the central angle in
        radians.

    Returns
    -------
    float or ndarray
        ``R`` times the central angle between the points.
    """
    lat1, lon1 = np.deg2rad(lat1_deg), np.deg2rad(lon1_deg)
    lat2, lon2 = np.deg2rad(lat2_deg), np.deg2rad(lon2_deg)
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points), which would make sqrt(1 - a) return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return R * c

def grid_index(i, j, n_lat, n_lon):
    """Flatten a (row ``i``, column ``j``) grid position into a row-major node id.

    ``n_lat`` is not used by the computation; only ``n_lon`` (the row length)
    enters the result.
    """
    row_offset = n_lon * i
    return row_offset + j

def build_sphere_grid_graph(lat, lon):
    """Build a 4-connected lat/lon grid graph with geodesic edge weights.

    Nodes are numbered row-major via ``grid_index`` (``i * n_lon + j``). Each
    node is linked to its east/west neighbours (periodic in longitude) and to
    its north/south neighbours (no wrap across the first or last latitude row).

    Parameters
    ----------
    lat, lon : sequence of float
        1-D latitude and longitude axes in degrees.

    Returns
    -------
    tuple
        ``(G, A)`` where ``A`` is the symmetric CSR adjacency matrix of shape
        ``(n_lat * n_lon, n_lat * n_lon)`` and ``G`` is the networkx graph built
        from it. Each weight is ``great_circle_distance`` between the two
        endpoints with its default radius.

    Notes
    -----
    Every undirected edge is recorded exactly once. Emitting the same edge from
    both endpoints would create duplicate COO entries, which are summed on
    conversion to CSR and would therefore double every stored weight.
    """
    n_lat, n_lon = len(lat), len(lon)

    # Visiting only the "forward" neighbours (east, south) reaches each
    # undirected edge from one endpoint; west/north links are the same edges.
    forward_neighbors = [(0, 1), (1, 0)]

    # Keyed by the ordered node pair so that a pair reachable twice (e.g. a
    # two-column periodic grid) is still stored only once.
    edge_weights = {}
    for i in range(n_lat):
        for j in range(n_lon):
            nid = grid_index(i, j, n_lat, n_lon)
            for di, dj in forward_neighbors:
                ii = i + di
                jj = (j + dj) % n_lon  # periodic in longitude
                if ii >= n_lat:
                    continue  # no neighbour beyond the last latitude row
                nid2 = grid_index(ii, jj, n_lat, n_lon)
                if nid2 == nid:
                    continue  # single-column grid: the wrap points back at itself
                key = (nid, nid2) if nid < nid2 else (nid2, nid)
                if key in edge_weights:
                    continue
                edge_weights[key] = great_circle_distance(lat[i], lon[j], lat[ii], lon[jj])

    # Mirror each edge so the adjacency matrix is symmetric
    rows, cols, data = [], [], []
    for (node_a, node_b), w in edge_weights.items():
        rows.extend([node_a, node_b])
        cols.extend([node_b, node_a])
        data.extend([w, w])

    A = sparse.coo_matrix((data, (rows, cols)), shape=(n_lat*n_lon, n_lat*n_lon))
    A = A.tocsr()
    # Drop entries whose weight is exactly zero (coincident endpoints)
    A.eliminate_zeros()
    return nx.from_scipy_sparse_array(A), A

def nearest_node_indices_for_track(track_lat, track_lon, lat, lon):
    """Map satellite track points to graph node indices.

    Parameters
    ----------
    track_lat, track_lon : float or array-like
        Track positions in degrees. Longitudes are folded into [0, 360).
    lat, lon : array-like
        1-D latitude and longitude axes of the grid, in degrees.

    Returns
    -------
    ndarray of int
        One row-major node id (``i * len(lon) + j``) per track point, where
        ``i``/``j`` index the nearest grid latitude/longitude.

    Notes
    -----
    Longitude separation is measured around the circle, so a point just below
    360° maps to a grid column at 0° when that column is the closer one.
    """
    lat, lon = np.asarray(lat), np.asarray(lon)
    # atleast_1d lets a single scalar point be passed as well as a sequence
    track_lat = np.atleast_1d(np.asarray(track_lat))
    track_lon = np.atleast_1d(np.asarray(track_lon)) % 360.0

    i_idx = np.abs(track_lat[:, None] - lat[None, :]).argmin(axis=1)

    # Periodic longitude: compare using the shorter arc between the two angles
    dlon = np.abs(track_lon[:, None] - lon[None, :]) % 360.0
    j_idx = np.minimum(dlon, 360.0 - dlon).argmin(axis=1)

    return i_idx * len(lon) + j_idx

# Assemble the grid graph: G is the networkx graph, A its sparse adjacency matrix
G, A = build_sphere_grid_graph(lat=lat_processed, lon=lon_processed)

# Translate each snapped track sample into its flat node id
node_ids = nearest_node_indices_for_track(
    track_lat=snapped_track["lat"].to_numpy(),
    track_lon=snapped_track["lon"].to_numpy(),
    lat=lat_processed,
    lon=lon_processed,
)

# Report graph size and the fraction of stored entries in A
print(f"Graph constructed: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")
print(f"Matrix density: {A.nnz / (A.shape[0] * A.shape[1]):.6f}")

# Prepare final datasets with consistent indexing.
# Node ids are row-major: grid cell (i, j) maps to node i * n_lon + j, so a plain
# row-major reshape of an (n_lat, n_lon) array yields values in node-id order.
n_lat, n_lon = len(lat_processed), len(lon_processed)
X = np.arange(n_lat * n_lon)

# Wind speed magnitude per node; masked (missing) samples become NaN explicitly
u_grid = np.ma.filled(u_1000_processed, np.nan)
v_grid = np.ma.filled(v_1000_processed, np.nan)
y = np.sqrt(u_grid**2 + v_grid**2).astype(np.float64).reshape(-1)

# (lat, lon) of every node: each latitude repeated across its row, the
# longitude axis repeated once per row
coord_mapping = np.column_stack([
    np.repeat(np.asarray(lat_processed, dtype=np.float64), n_lon),
    np.tile(np.asarray(lon_processed, dtype=np.float64), n_lat),
])

# Normalize wind speeds (zero mean, unit variance).
# NOTE(review): the statistics are taken over all nodes, not only the training
# nodes — confirm that this is intended for the downstream evaluation.
y_mean = np.mean(y)
y_std = np.std(y)
if not y_std > 0:
    # Covers both a constant field (std == 0) and NaN input (std is NaN)
    raise ValueError(f"Cannot normalize wind speeds: standard deviation is {y_std}")
y = (y - y_mean) / y_std

# Create training dataset from satellite track (each visited node counted once)
X_train = np.unique(node_ids)
y_train = y[X_train]

print(f"Wind speed normalization: mean={y_mean:.3f} m/s, std={y_std:.3f} m/s")
print(f"Final dataset: {len(X):,} nodes, {len(X_train):,} training ({len(X_train)/len(X)*100:.3f}%)")

# Save u,v components with consistent indexing.
# A row-major reshape places grid cell (i, j) at position i * n_lon + j, which is
# the node numbering used for the graph; masked (missing) samples become NaN.
u_flat = np.ma.filled(u_1000_processed, np.nan).astype(np.float64).reshape(-1)
v_flat = np.ma.filled(v_1000_processed, np.nan).astype(np.float64).reshape(-1)

# Write all processed arrays into one .npz archive. The CSR adjacency matrix is
# stored as its raw parts (data, indices, indptr, shape).
np.savez(
    f'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz',
    **{
        "A_data": A.data,
        "A_indices": A.indices,
        "A_indptr": A.indptr,
        "A_shape": A.shape,
        "X": X,
        "y": y,  # Normalized wind speeds
        "y_mean": y_mean,
        "y_std": y_std,
        "X_train": X_train,
        "y_train": y_train,
        "coord_mapping": coord_mapping,
        "u_component": u_flat,
        "v_component": v_flat,
        "downsample_factor": DOWNSAMPLE_FACTOR,
        "pressure_level": PRESSURE_LEVEL,
    },
)

# Closing summary of what was produced
print(
    "\n".join(
        [
            f"\n✅ SUCCESS: All datasets saved to 'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz'",
            f"✅ Downsampling: {DOWNSAMPLE_FACTOR}x reduction ({len(lat)*len(lon):,} → {n_lat*n_lon:,} nodes)",
            f"✅ Graph: {n_lat}×{n_lon} grid, 4-connected with geodesic weights",
            f"✅ Training: {len(X_train):,} nodes from Aeolus satellite track",
            "✅ Data: normalized wind speeds (mean=0, std=1)",
        ]
    )
)

Available variables: ['number', 'valid_time', 'pressure_level', 'latitude', 'longitude', 'expver', 'u', 'v']
Original data: 721×1440 = 1,038,240 points
Downsampled: 73×144 = 10,512 points
Reduction: 98.8x smaller
Resolution: -2.50° × 2.50°
Satellite track: 1440 points → 1423 unique grid locations
Original data: 721×1440 = 1,038,240 points
Downsampled: 73×144 = 10,512 points
Reduction: 98.8x smaller
Resolution: -2.50° × 2.50°
Satellite track: 1440 points → 1423 unique grid locations
Graph constructed: 10,512 nodes, 20,880 edges
Matrix density: 0.000378
Wind speed normalization: mean=9.359 m/s, std=7.824 m/s
Final dataset: 10,512 nodes, 1,423 training (13.537%)
Graph constructed: 10,512 nodes, 20,880 edges
Matrix density: 0.000378
Wind speed normalization: mean=9.359 m/s, std=7.824 m/s
Final dataset: 10,512 nodes, 1,423 training (13.537%)

✅ SUCCESS: All datasets saved to 'wind_data_processed_1000hPa_wide.npz'
✅ Downsampling: 10x reduction (1,038,240 → 10,512 nodes)
✅ Graph: 73×144 grid,