# Wind Data Loading and Preprocessing - 800hPa (Downsampled)

This notebook loads and preprocesses ERA5 wind data at 800hPa pressure level with **10x downsampling** for efficient graph-based wind interpolation. The downsampling reduces computational load from ~1M nodes to ~10K nodes while maintaining essential spatial patterns.

**Key Features:**
- 10x downsampling in both lat/lon directions (721×1440 → 73×144 grid)
- Consistent node indexing for graph-based methods
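- Normalized wind speeds (mean=0, std=1)
- Simulated Aeolus satellite ground track (propagated from a TLE) that selects the training nodes
- Sparse adjacency matrix over the 4-neighbour lat/lon grid, with great-circle (geodesic) distances as edge weights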

In [1]:
# =============================================================================
# IMPORT LIBRARIES AND CONFIGURATION
# =============================================================================

from netCDF4 import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from scipy import sparse
from datetime import datetime, timedelta
from skyfield.api import load, EarthSatellite, wgs84, utc

# Configuration parameters
DOWNSAMPLE_FACTOR = 10      # Keep every 10th grid point along both latitude and longitude
PRESSURE_LEVEL = 800        # hPa pressure level (used for labels and output file names)
PRESSURE_INDEX = 1          # Index along the pressure_level axis; assumed to be 800hPa — TODO confirm against the file
DATA_FILE = '../8176c14c59fd8dc32a74a89b926cb7fd.nc'

# Striding both axes by DOWNSAMPLE_FACTOR shrinks the node count by roughly its
# square, so the message is derived from the constant instead of hard-coded.
print(f"Configuration:")
print(f"  Pressure Level: {PRESSURE_LEVEL} hPa")
print(f"  Downsampling Factor: {DOWNSAMPLE_FACTOR}x")
print(f"  Expected grid reduction: ~{DOWNSAMPLE_FACTOR ** 2}x fewer nodes")

Configuration:
  Pressure Level: 800 hPa
  Downsampling Factor: 10x
  Expected grid reduction: ~100x fewer nodes


In [2]:
# =============================================================================
# LOAD NETCDF WIND DATA
# =============================================================================

# Load the NetCDF dataset. The context manager closes the file handle as soon
# as the arrays needed below have been copied into memory.
with Dataset(DATA_FILE, mode="r") as dataset:
    print("Available variables:", list(dataset.variables.keys()))

    # Load coordinate arrays
    lat = dataset.variables["latitude"][:]      # shape (721,)
    lon = dataset.variables["longitude"][:]     # shape (1440,)

    # Wind components are stored as
    # (valid_time=12, pressure_level=3, latitude=721, longitude=1440).
    # Index the netCDF variables directly so only the first time step at
    # PRESSURE_INDEX is read, rather than loading both full 4-D arrays just to
    # keep one 2-D slice of each.
    u_800 = dataset.variables["u"][0, PRESSURE_INDEX, :, :]  # eastward wind, shape (721, 1440)
    v_800 = dataset.variables["v"][0, PRESSURE_INDEX, :, :]  # northward wind, shape (721, 1440)

print(f"\nData shapes:")
print(f"  Latitude: {lat.shape}")
print(f"  Longitude: {lon.shape}")
print(f"  U component: {u_800.shape}")
print(f"  V component: {v_800.shape}")
print(f"  Original grid size: {lat.shape[0]} × {lon.shape[0]} = {lat.shape[0] * lon.shape[0]:,} points")

Available variables: ['number', 'valid_time', 'pressure_level', 'latitude', 'longitude', 'expver', 'u', 'v']

Data shapes:
  Latitude: (721,)
  Longitude: (1440,)
  U component: (721, 1440)
  V component: (721, 1440)
  Original grid size: 721 × 1440 = 1,038,240 points

Data shapes:
  Latitude: (721,)
  Longitude: (1440,)
  U component: (721, 1440)
  V component: (721, 1440)
  Original grid size: 721 × 1440 = 1,038,240 points


In [3]:
# =============================================================================
# APPLY DOWNSAMPLING
# =============================================================================

def downsample_grid_data(lat, lon, u_data, v_data, factor=10):
    """Downsample the lat/lon grid and corresponding data by a given factor.

    Every ``factor``-th sample is kept along both axes (plain striding, no
    averaging), and a short summary is printed.

    Parameters
    ----------
    lat, lon : 1-D arrays
        Coordinate axes of the grid.
    u_data, v_data : 2-D arrays
        Fields indexed as ``[lat, lon]``.
    factor : int, default 10
        Stride applied to both axes; must be >= 1.

    Returns
    -------
    tuple
        ``(lat_down, lon_down, u_down, v_down)``.

    Raises
    ------
    ValueError
        If ``factor`` is smaller than 1 (a negative stride would silently
        reverse the grid).
    """
    if factor < 1:
        raise ValueError(f"factor must be a positive integer, got {factor!r}")

    lat_down = lat[::factor]
    lon_down = lon[::factor]
    u_down = u_data[::factor, ::factor]
    v_down = v_data[::factor, ::factor]

    n_orig = len(lat) * len(lon)
    n_down = len(lat_down) * len(lon_down)

    def _spacing(axis):
        # Report the spacing as a magnitude: a descending latitude axis would
        # otherwise print a negative "resolution". Axes with fewer than two
        # samples have no spacing to report.
        if len(axis) < 2:
            return "n/a"
        return f"{abs(axis[1] - axis[0]):.2f}°"

    print(f"Downsampling Results:")
    print(f"  Original grid: {lat.shape[0]} × {lon.shape[0]} = {n_orig:,} points")
    print(f"  Downsampled grid: {lat_down.shape[0]} × {lon_down.shape[0]} = {n_down:,} points")
    if n_down:
        print(f"  Reduction factor: {n_orig / n_down:.1f}x smaller")
    print(f"  Resolution: {_spacing(lat_down)} lat × {_spacing(lon_down)} lon")

    return lat_down, lon_down, u_down, v_down

# Thin the full-resolution grid with the configured stride
(
    lat_processed,
    lon_processed,
    u_800_processed,
    v_800_processed,
) = downsample_grid_data(lat, lon, u_800, v_800, factor=DOWNSAMPLE_FACTOR)

# Wind speed magnitude |(u, v)| on the downsampled grid
wind_speed_processed = np.sqrt(u_800_processed**2 + v_800_processed**2)

# Persist the downsampled coordinates and fields in one .npz archive
np.savez(
    f'wind_data_{PRESSURE_LEVEL}hPa_downsampled.npz',
    lon=lon_processed,
    lat=lat_processed,
    u=u_800_processed,
    v=v_800_processed,
    wind_speed=wind_speed_processed,
)

Downsampling Results:
  Original grid: 721 × 1440 = 1,038,240 points
  Downsampled grid: 73 × 144 = 10,512 points
  Reduction factor: 98.8x smaller
  Resolution: -2.50° lat × 2.50° lon


In [4]:
# =============================================================================
# GENERATE AEOLUS SATELLITE TRACK & BUILD GRAPH & PREPARE DATASETS
# =============================================================================

# Two-line element set (TLE) describing the Aeolus orbit.
# NOTE(review): the TLE epoch field "21153.73585495" is day 153 of 2021, while the
# track below is propagated for 2019-01-01 — confirm this offset is acceptable.
line1 = "1 43600U 18066A   21153.73585495  .00031128  00000-0  12124-3 0  9990"
line2 = "2 43600  96.7150 160.8035 0006915  90.4181 269.7884 15.87015039160910"

ts = load.timescale()
aeolus = EarthSatellite(line1, line2, "AEOLUS", ts)

# Sample the orbit once per minute for 24 hours (1440 samples), starting 09:00 UTC
start = datetime(2019, 1, 1, 9, tzinfo=utc)
times = [start + timedelta(minutes=minute) for minute in range(1440)]

# Sub-satellite point in degrees, with longitude wrapped into [0, 360)
geocentric = aeolus.at(ts.from_datetimes(times))
sat_lat, sat_lon = (angle.degrees for angle in wgs84.latlon_of(geocentric))
sat_lon = sat_lon % 360

# Snap to downsampled grid
def snap_to_grid(lat_val, lon_val, era5_lat, era5_lon):
    """Return the grid (lat, lon) closest to a single point.

    Latitude and longitude are matched independently. Longitude is compared
    with a circular distance so that points just below 360° snap across the
    0°/360° seam instead of to the last grid column.

    Parameters
    ----------
    lat_val, lon_val : float
        Point to snap, in degrees.
    era5_lat, era5_lon : 1-D arrays
        Grid axes, in degrees.

    Returns
    -------
    tuple
        ``(era5_lat[i], era5_lon[j])`` for the nearest grid row and column.
    """
    i = np.abs(era5_lat - lat_val).argmin()
    # Shortest angular separation on the circle: min(d, 360 - d)
    dlon = np.abs(era5_lon - lon_val) % 360.0
    j = np.minimum(dlon, 360.0 - dlon).argmin()
    return era5_lat[i], era5_lon[j]

# Snap every track sample to its nearest node on the downsampled grid
snapped = [
    snap_to_grid(track_lat, track_lon, lat_processed, lon_processed)
    for track_lat, track_lon in zip(sat_lat, sat_lon)
]
snap_lat, snap_lon = map(tuple, zip(*snapped))
snapped_track = pd.DataFrame(dict(time=times, lat=snap_lat, lon=snap_lon))

print(f"Generated Aeolus track: {len(times)} points, {len(set(zip(snap_lat, snap_lon)))} unique locations")

# Build sphere grid graph utilities
def deg2rad(x):
    """Convert degrees to radians (thin wrapper around ``np.deg2rad``)."""
    return np.deg2rad(x)


def sph2cart(lat_deg, lon_deg, r=1.0):
    """Convert latitude/longitude in degrees to Cartesian coordinates.

    Returns an array whose last axis holds ``(x, y, z)`` on a sphere of
    radius ``r``; leading axes follow the inputs.
    """
    phi = deg2rad(lat_deg)
    lam = deg2rad(lon_deg)
    components = [
        r * np.cos(phi) * np.cos(lam),   # x
        r * np.cos(phi) * np.sin(lam),   # y
        r * np.sin(phi),                 # z
    ]
    return np.stack(components, axis=-1)

def great_circle_distance(lat1_deg, lon1_deg, lat2_deg, lon2_deg, R=1.0):
    """Great-circle distance between two points via the haversine formula.

    Parameters
    ----------
    lat1_deg, lon1_deg, lat2_deg, lon2_deg : float or array-like
        Coordinates in degrees; array inputs are broadcast by NumPy.
    R : float, default 1.0
        Sphere radius; with the default the result is the central angle in
        radians.

    Returns
    -------
    float or ndarray
        Arc length ``R * c``, where ``c`` is the central angle.
    """
    lat1, lon1 = np.deg2rad(lat1_deg), np.deg2rad(lon1_deg)
    lat2, lon2 = np.deg2rad(lat2_deg), np.deg2rad(lon2_deg)
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return R * c

def grid_index(i, j, n_lat, n_lon):
    """Flatten grid position (row ``i``, column ``j``) into a row-major node id.

    ``n_lat`` is accepted for signature symmetry but does not enter the result.
    """
    row_start = i * n_lon
    return row_start + j

def build_sphere_grid_graph(lat, lon):
    """Build a 4-neighbour graph on a lat/lon grid with great-circle edge weights.

    Nodes are numbered row-major via ``grid_index``. Each node is linked to its
    north/south neighbours (no wrap across the first/last latitude row) and to
    its east/west neighbours (periodic in longitude). Edge weights are
    ``great_circle_distance`` values on the unit sphere.

    NOTE(review): if the latitude axis contains ±90°, the east-west edges in
    that row have a great-circle length of ~0 — confirm downstream code copes
    with near-zero weights.

    Parameters
    ----------
    lat, lon : 1-D sequences
        Grid axes in degrees.

    Returns
    -------
    (networkx.Graph, scipy.sparse.csr_matrix)
        The graph and its symmetric weighted adjacency matrix.
    """
    n_lat, n_lon = len(lat), len(lon)
    n_nodes = n_lat * n_lon

    # Visit only the east and south neighbour of every node, keyed by the
    # sorted node pair. Each undirected edge is therefore computed once, and
    # tiny grids (n_lon <= 2), where east and west are the same neighbour,
    # cannot accumulate duplicate entries that would inflate the weight.
    edge_weight = {}
    for i in range(n_lat):
        for j in range(n_lon):
            nid = grid_index(i, j, n_lat, n_lon)
            for di, dj in [(0, 1), (1, 0)]:
                ii, jj = i + di, (j + dj) % n_lon
                if ii >= n_lat:
                    continue
                nid2 = grid_index(ii, jj, n_lat, n_lon)
                if nid2 == nid:
                    continue  # single-column grid: the wrap points back at the node itself
                key = (nid, nid2) if nid < nid2 else (nid2, nid)
                if key not in edge_weight:
                    edge_weight[key] = great_circle_distance(lat[i], lon[j], lat[ii], lon[jj])

    rows = [pair[0] for pair in edge_weight]
    cols = [pair[1] for pair in edge_weight]
    data = list(edge_weight.values())

    # Mirror the upper-triangular edge list to obtain the symmetric adjacency
    upper = sparse.coo_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
    A = (upper + upper.T).tocsr()
    return nx.from_scipy_sparse_array(A), A

def nearest_node_indices_for_track(track_lat, track_lon, lat, lon):
    """Map track points to the row-major ids of their nearest grid nodes.

    Latitude and longitude are matched independently. Longitude uses a
    circular distance so that points near the 0°/360° seam are assigned to
    the nearest column on either side of it.

    Parameters
    ----------
    track_lat, track_lon : float or array-like
        Track coordinates in degrees (scalars are treated as length-1 tracks).
    lat, lon : array-like
        1-D grid axes in degrees.

    Returns
    -------
    ndarray of int
        ``i * len(lon) + j`` for each track point.
    """
    lat, lon = np.asarray(lat), np.asarray(lon)
    track_lat = np.atleast_1d(np.asarray(track_lat))
    track_lon = np.atleast_1d(np.asarray(track_lon)) % 360.0
    i_idx = np.abs(track_lat[:, None] - lat[None, :]).argmin(axis=1)
    # Shortest angular separation on the circle: min(d, 360 - d)
    dlon = np.abs(track_lon[:, None] - lon[None, :]) % 360.0
    j_idx = np.minimum(dlon, 360.0 - dlon).argmin(axis=1)
    return i_idx * len(lon) + j_idx

# Assemble the grid graph, then look up the node id under each snapped track sample
G, A = build_sphere_grid_graph(lat_processed, lon_processed)
node_ids = nearest_node_indices_for_track(
    track_lat=snapped_track["lat"].to_numpy(),
    track_lon=snapped_track["lon"].to_numpy(),
    lat=lat_processed,
    lon=lon_processed,
)

print(f"Graph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")
print(f"Training: {len(node_ids):,} points, {len(np.unique(node_ids)):,} unique nodes")

# Prepare final datasets
# Nodes are numbered row-major (node_id = i * n_lon + j), which is exactly the
# C-order flattening of an (n_lat, n_lon) array, so the per-node Python loop is
# replaced by array operations followed by ravel().
n_lat, n_lon = len(lat_processed), len(lon_processed)
X = np.arange(n_lat * n_lon)

# Work in float64; any masked cells become NaN.
u_grid = np.ma.filled(np.ma.asarray(u_800_processed).astype(np.float64), np.nan)
v_grid = np.ma.filled(np.ma.asarray(v_800_processed).astype(np.float64), np.nan)
if u_grid.shape != (n_lat, n_lon) or v_grid.shape != (n_lat, n_lon):
    raise ValueError(
        f"wind fields have shapes {u_grid.shape} / {v_grid.shape}, expected {(n_lat, n_lon)}"
    )

# Wind speed magnitude per node
y = np.sqrt(u_grid**2 + v_grid**2).ravel()

# Row k holds the (lat, lon) of node k: latitude repeats within a row of the
# grid, longitude cycles.
coord_mapping = np.column_stack([
    np.repeat(np.asarray(lat_processed, dtype=np.float64), n_lon),
    np.tile(np.asarray(lon_processed, dtype=np.float64), n_lat),
])

# Normalize wind speeds (statistics are taken over all nodes)
y_mean, y_std = np.mean(y), np.std(y)
if y_std == 0:
    raise ValueError("wind speed is constant over the grid; cannot normalize to unit variance")
y = (y - y_mean) / y_std

# Training data: the distinct nodes visited by the satellite track
X_train = np.unique(node_ids)
y_train = y[X_train]

print(f"Wind normalization: mean={y_mean:.3f} m/s, std={y_std:.3f} m/s")
print(f"Dataset: {len(X):,} total nodes, {len(X_train):,} training nodes ({len(X_train)/len(X)*100:.3f}%)")

# Save prepared datasets
# Row-major flattening reproduces node_id = i * n_lon + j without a Python loop.
# reshape() raises if a field does not hold exactly n_lat * n_lon values; any
# masked cells become NaN.
u_flat = np.ma.filled(
    np.ma.asarray(u_800_processed).astype(np.float64), np.nan
).reshape(n_lat * n_lon)
v_flat = np.ma.filled(
    np.ma.asarray(v_800_processed).astype(np.float64), np.nan
).reshape(n_lat * n_lon)

# The CSR adjacency is stored as its three component arrays plus its shape.
np.savez(f'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz',
         A_data=A.data, A_indices=A.indices, A_indptr=A.indptr, A_shape=A.shape,
         X=X, y=y, y_mean=y_mean, y_std=y_std, X_train=X_train, y_train=y_train,
         coord_mapping=coord_mapping, u_component=u_flat, v_component=v_flat,
         downsample_factor=DOWNSAMPLE_FACTOR, pressure_level=PRESSURE_LEVEL)

print(f"\n✅ All datasets saved to 'wind_data_processed_{PRESSURE_LEVEL}hPa_wide.npz'")
print(f"✅ Grid: {n_lat}×{n_lon} = {n_lat*n_lon:,} nodes (downsampled {DOWNSAMPLE_FACTOR}x)")
print(f"✅ Normalized data: mean=0, std=1")

Generated Aeolus track: 1440 points, 1423 unique locations
Graph: 10,512 nodes, 20,880 edges
Training: 1,440 points, 1,423 unique nodes
Wind normalization: mean=5.717 m/s, std=4.060 m/s
Dataset: 10,512 total nodes, 1,423 training nodes (13.537%)
Graph: 10,512 nodes, 20,880 edges
Training: 1,440 points, 1,423 unique nodes
Wind normalization: mean=5.717 m/s, std=4.060 m/s
Dataset: 10,512 total nodes, 1,423 training nodes (13.537%)

✅ All datasets saved to 'wind_data_processed_800hPa_wide.npz'
✅ Grid: 73×144 = 10,512 nodes (downsampled 10x)
✅ Normalized data: mean=0, std=1

✅ All datasets saved to 'wind_data_processed_800hPa_wide.npz'
✅ Grid: 73×144 = 10,512 nodes (downsampled 10x)
✅ Normalized data: mean=0, std=1
