In [14]:
import torch
import numpy as np
from pathlib import Path
import xarray as xr

era5_dir = Path("/data/jiyilun/typhoon/download/era5")
corrupt_files = ["2011091712", "2012092518", "2012100800", "2014071112", "2014101000"]
for era5_file in [era5_dir / f"{hour}.nc" for hour in corrupt_files]:
    right = True
    era5_hour = xr.open_dataset(era5_file)
    for var in era5_hour.data_vars:
        if np.isnan(era5_hour[var].values).any():
            print(f"NaN found in {era5_file.stem}.{var}")
            print(f"NaN proportion: {np.isnan(era5_hour[var].values).mean()}")
            right = False
    if right:
        print(f"File {era5_file.stem} is OK.")
    else:
        print(f"File {era5_file.stem} is corrupt.")
        corrupt_files.append(era5_file)
print(corrupt_files)


File 2011091712 is OK.
File 2012092518 is OK.
File 2012100800 is OK.
File 2014071112 is OK.
File 2014101000 is OK.
['2011091712', '2012092518', '2012100800', '2014071112', '2014101000']


In [2]:
import xarray as xr
import pandas as pd
from datetime import datetime
from pathlib import Path
era5_dir = Path("/data/jiyilun/typhoon/download/era5")
url = "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721_with_derived_variables.zarr"
era5_ds = xr.open_zarr(url)

In [12]:
time_str = "2014101000"
var_names = [
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "2m_temperature",
    "mean_sea_level_pressure",
    "temperature",
    "u_component_of_wind",
    "v_component_of_wind",
    "specific_humidity",
    "geopotential",
]

ds_hour = era5_ds[var_names].sel(
    time=pd.to_datetime(time_str, format="%Y%m%d%H")
).compute()

In [13]:
ds_hour.to_netcdf(era5_dir / f"{time_str}.nc")

In [53]:
var_names = [
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "2m_temperature",
    "mean_sea_level_pressure",
    "temperature",
    "u_component_of_wind",
    "v_component_of_wind",
    "specific_humidity",
    "geopotential",
]
print(era5_ds["geopotential"].sel(
        time=datetime(2018, 8, 5, 0)
    ).isel(
        longitude=slice(400, 720),
        latitude=slice(121, 361)
    ).values - era5_ds_hour["geopotential"].values)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
