# ERA5 Data Preprocessing

### Imports

In [1]:
import xarray as xr
import numpy as np
import cdsapi
import zipfile
import os

### Download Dataset with API
**Requirement:** Setup the cdsapi with your account as described here: https://cds.climate.copernicus.eu/how-to-api


In [2]:
dataset_name = "era5-land-monthly-means-1992-2020"
target_zip_file = f'data/era5_climate_data/{dataset_name}.zip'

1. Make the API call. If the internet connection is unstable the download tends to fail through the API, in that case it is recommended to download the dataset manually from the website. In that case place the downloaded zip under [data/era5_climate_data/](data/era5_climate_data/) and continue with step 2.

In [None]:
dataset = "reanalysis-era5-land-monthly-means"
request = {
    "product_type": ["monthly_averaged_reanalysis"],
    "variable": [
        "2m_dewpoint_temperature",
        "2m_temperature",
        "soil_temperature_level_1",
        "soil_temperature_level_4",
        "snow_cover",
        "snow_density",
        "volumetric_soil_water_layer_1",
        "volumetric_soil_water_layer_4",
        "total_precipitation",
        "soil_type"
    ],
    "year": [
        "1992", "1993", "1994", "1995", "1996", 
        "1997", "1998", "1999", "2000", "2001", 
        "2002", "2003", "2004", "2005", "2006", 
        "2007", "2008", "2009", "2010", "2011", 
        "2012", "2013", "2014", "2015", "2016", 
        "2017", "2018", "2019", "2020"
    ],
    "month": [
        "01", "02", "03", "04", "05", "06",
        "07", "08", "09", "10", "11", "12"
    ],
    "time": ["00:00"],
    "data_format": "netcdf",
    "download_format": "zip"
}

client = cdsapi.Client()
result = client.retrieve(dataset, request).download(target_zip_file)

2. Unzip the dataset and extract the relevant file

In [None]:
with zipfile.ZipFile(target_zip_file, "r") as zip_ref:
    file_list = zip_ref.namelist()
    data_file = next(f for f in file_list if "data" in f)
    zip_ref.extract(data_file, "data/era5_climate_data/")
    os.rename(f"data/era5_climate_data/{data_file}", f"data/era5_climate_data/{dataset_name}.nc")

### Load Dataset
Load the dataset as an xarray.Dataset

In [3]:
ds_era5_global = xr.open_dataset(f"data/era5_climate_data/{dataset_name}.nc")

### Slicing the Dataset
To make the analysis feasible I will focus on a "slice" of the earth that includes Europe and parts of North Africa. 

First we have to convert the longitude dimension of the dataset to run from -180 to 180 instead of 0 to 360.

In [4]:
def to_lon180(ds, lon_name='longitude'):
    lon = ds[lon_name]
    lon180 = ((lon + 180) % 360) - 180
    return ds.assign_coords({lon_name: lon180}).sortby(lon_name)

ds_era5_global = to_lon180(ds_era5_global)

Now we slice from 11°W to 40°E and from 30°N to 72°N. Which converts to longitude=[-11, 40] and latitude=[72, 30]

*Note: This step reduces the dataset from roughly 81GB to 3GB!*

In [5]:
ds_era5_sliced = ds_era5_global.sel(longitude=slice(-11, 40), latitude=slice(72, 30))

### Dataset Information
**Variables:**

1. "t2m" --> 2 metre temperature in Kelvin
2. "d2m" --> 2 metre dewpoint temperature in Kelvin
3. "stl1" --> Soil temperature level 1 in Kelvin
4. "stl4" --> Soil temperature level 4 in Kelvin
5. "snowc" --> Snow cover in %
6. "rsn" --> Snow density in kg/m³
7. "swvl1" --> Volumetric soil water layer 1 in m³/m³
8. "swvl4" --> Volumetric soil water layer 4 in m³/m³
9. "tp" --> Total precipation in m


In [6]:
print(ds_era5_sliced)

<xarray.Dataset> Size: 3GB
Dimensions:     (valid_time: 348, latitude: 420, longitude: 510)
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 3kB 1992-01-01 ... 2020-12-01
  * latitude    (latitude) float64 3kB 71.9 71.8 71.7 71.6 ... 30.2 30.1 30.0
    expver      (valid_time) <U4 6kB ...
  * longitude   (longitude) float64 4kB -11.0 -10.9 -10.8 ... 39.7 39.8 39.9
Data variables:
    d2m         (valid_time, latitude, longitude) float32 298MB ...
    t2m         (valid_time, latitude, longitude) float32 298MB ...
    stl1        (valid_time, latitude, longitude) float32 298MB ...
    stl4        (valid_time, latitude, longitude) float32 298MB ...
    snowc       (valid_time, latitude, longitude) float32 298MB ...
    rsn         (valid_time, latitude, longitude) float32 298MB ...
    swvl1       (valid_time, latitude, longitude) float32 298MB ...
    swvl4       (valid_time, latitude, longitude) float32 298MB ...
    tp          (valid_time, latitud

### Resampling to yearly data
Since the PFT dataset only has yearly data, the climate data has to be resampled from monthly means to a yearly timescale. To not loose too much information by only having the yearly mean temperature, seasonal means are computed as well. 

The variable dewpoint temperature is not that useful on its own, so the Vapor Pressure Deficit (VPD) is calculated from it and included in the dataset.

When aggregating the monthly data to yearly data the average has to be weighted by the amount of days in the month to get an exact yearly mean.

##### Helper Functions

In [6]:
def time_weights_days_in_month(ds):
    """
    Returns a DataArray of weights -> number of days in each month.
    """
    # dt.days_in_month exists for both numpy and cftime time indexes in recent xarray/cftime
    w = xr.DataArray(ds['time'].dt.days_in_month, coords={'time': ds['time']}, dims='time')
    w.name = 'days_in_month'
    return w

def resample_time_weighted_mean(ds, freq, weights):
    """
    Days-weighted mean over resample bins.

    ds: Dataset or DataArray with time dimension
    weights: 1D DataArray over 'time'
    """
    def _group_mean(x):
        w = weights.sel(time=x.time)
        return x.weighted(w).mean('time')

    return ds.resample(time=freq).map(_group_mean)

def calculate_vpd(temp_kelvin, temp_dewpoint_kelvin):
    """
    Vapor pressure deficit (VPD) from air temperature (K) and dewpoint (K).
    Uses Tetens formula.

    es(T)  = 6.112 * exp(17.67 * Tc / (Tc + 243.5))     [hPa]
    ea(Td) = 6.112 * exp(17.67 * Tdc / (Tdc + 243.5))   [hPa]
    VPD    = (es - ea) / 10                             [hPa]
    """
    # Convert K to °C
    temp_degrees = temp_kelvin - 273.15
    temp_dewpoint_degrees = temp_dewpoint_kelvin - 273.15

    # Tetens formula
    es_hPa = 6.1078 * np.exp(17.269 * temp_degrees / (temp_degrees + 237.3))
    ea_hPa = 6.1078 * np.exp(17.269 * temp_dewpoint_degrees / (temp_dewpoint_degrees + 237.3))

    # Vapor Pressure Deficit es - ea in hPa
    vpd_kPa = (es_hPa - ea_hPa)
    return vpd_kPa

Use the helper functions to first calculate the VPD and add it to the existing dataset. Afterwards define which variables should be averaged when aggregating to yearly data and which variables to sum. Total precipation (tp) should be summed instead of averaged to represent the total precipation in a year instead of the average monthly precipation in a year.

In the end the datasets are merged and the time dimension is now represented by an int for the year to match the PFT dataset.

This takes a considerable amount of time to run.

In [None]:
ds_era5_sliced = ds_era5_sliced.rename({'valid_time': 'time'})

ds_era5_sliced['vpd'] = calculate_vpd(temp_kelvin=ds_era5_sliced['t2m'], temp_dewpoint_kelvin=ds_era5_sliced['d2m'])
ds_era5_sliced['vpd'].attrs.update(units='hPa', long_name='Vapor pressure deficit')

ds_mean = ds_era5_sliced[['t2m','stl1','stl4','snowc','rsn','swvl1','swvl4','vpd']] 
ds_sum = ds_era5_sliced[['tp']]

time_weights = time_weights_days_in_month(ds_era5_sliced)

yearly_means = resample_time_weighted_mean(ds_mean, 'YS', time_weights)
yearly_sums = ds_sum.resample(time='YS').sum(skipna=True, min_count=1)

ds_era5_sliced_yearly = xr.merge([yearly_means, yearly_sums])
ds_era5_sliced_yearly = ds_era5_sliced_yearly\
                            .assign_coords(year=ds_era5_sliced_yearly['time'].dt.year)\
                            .swap_dims({'time':'year'}).drop_vars('time')

ds_era5_sliced_yearly.to_netcdf("data/era5_climate_data/era5-land-yearly-means-sliced.nc")

In [7]:
ds_era5_sliced = ds_era5_sliced.rename({'valid_time': 'time'})

ds_era5_sliced['vpd'] = calculate_vpd(temp_kelvin=ds_era5_sliced['t2m'], temp_dewpoint_kelvin=ds_era5_sliced['d2m'])
ds_era5_sliced['vpd'].attrs.update(units='hPa', long_name='Vapor pressure deficit')

ds_mean = ds_era5_sliced[['stl1','stl4','snowc','rsn','swvl1','swvl4','vpd']] 
ds_sum = ds_era5_sliced[['tp']]

time_weights = time_weights_days_in_month(ds_era5_sliced)

seas_means = resample_time_weighted_mean(ds_mean, 'QS-DEC', time_weights)
seas_sums  = ds_sum.resample(time='QS-DEC').sum(skipna=True, min_count=1)

In [15]:
print(ds_era5_sliced_seas)

<xarray.Dataset> Size: 2GB
Dimensions:    (time: 117, latitude: 420, longitude: 510)
Coordinates:
    number     int64 8B 0
  * latitude   (latitude) float64 3kB 71.9 71.8 71.7 71.6 ... 30.2 30.1 30.0
  * longitude  (longitude) float64 4kB -11.0 -10.9 -10.8 ... 39.7 39.8 39.9
  * time       (time) datetime64[ns] 936B 1991-12-01 1992-03-01 ... 2020-12-01
Data variables:
    stl1       (time, latitude, longitude) float64 200MB nan nan ... 284.9 284.9
    stl4       (time, latitude, longitude) float64 200MB nan nan ... 298.5 298.7
    snowc      (time, latitude, longitude) float64 200MB nan nan nan ... 0.0 0.0
    rsn        (time, latitude, longitude) float64 200MB nan nan ... 100.0 100.0
    swvl1      (time, latitude, longitude) float64 200MB nan nan ... 0.02919
    swvl4      (time, latitude, longitude) float64 200MB nan nan ... 0.00267
    vpd        (time, latitude, longitude) float64 200MB nan nan ... 5.589 5.597
    tp         (time, latitude, longitude) float32 100MB nan nan ... 

In [17]:
def add_season_and_year_coords(ds):
    """
    Tag each seasonal timestamp with:
      - season: DJF/MAM/JJA/SON
      - season_year: DJF assigned to the year of Jan/Feb (Dec gets +1)
    """
    month = ds['time'].dt.month
    season = xr.full_like(month, '', dtype=object)
    season = xr.where(month==12, 'DJF', season)
    season = xr.where(month== 3, 'MAM', season)
    season = xr.where(month== 6, 'JJA', season)
    season = xr.where(month== 9, 'SON', season)
    season_year = ds['time'].dt.year.where(month != 12, ds['time'].dt.year + 1)
    return ds.assign_coords(season=('time', season.data),
                            season_year=('time', season_year.data))

def seasons_to_wide_by_year(ds, limit_years=None, drop_aux=('number',)):
    """
    Convert a seasonal 'long' dataset (1 timestamp per season) into a 'wide' dataset with
    variables named <var>_season_<DJF/MAM/JJA/SON>, indexed by 'year'.
    """
    # optional: drop stray coords like 'number' if present
    for c in drop_aux:
        if c in ds.coords:
            ds = ds.drop_vars(c)

    ds = add_season_and_year_coords(ds)

    # pick the year index to use
    years = np.unique(ds['season_year'].values)
    if limit_years is not None:
        y0, y1 = limit_years
        years = years[(years >= y0) & (years <= y1)]

    seasons = ['DJF','MAM','JJA','SON']
    out = {}

    for v in ds.data_vars:
        for s in seasons:
            sel = ds[v].where(ds['season'] == s, drop=True)

            # move season-year into the index we want
            sel = sel.assign_coords(
                year=('time', ds['season_year'].where(ds['season']==s, drop=True).data)
            ).swap_dims({'time':'year'}).drop_vars('time')

            # drop the *conflicting* coords so variables can co-exist in one Dataset
            sel = sel.reset_coords(['season','season_year'], drop=True)

            # align to common year index
            sel = sel.reindex(year=years)

            out[f'{v}_season_{s}'] = sel

    wide = xr.Dataset(out)

    # annotate provenance
    for name in wide.data_vars:
        wide[name].attrs['cell_method'] = 'seasonal aggregate (DJF/MAM/JJA/SON); DJF labeled by Jan/Feb year'

    return wide

# --- Example usage ----------------------------------------------------------
# ds_seasonal is your dataset with time at 1991-12-01, 1992-03-01, ..., 2020-12-01
# If you only want 1992–2020 inclusive (since DJF 1992 needs Dec 1991), pass:
wide_seasons = seasons_to_wide_by_year(ds_era5_sliced_seas, limit_years=(1993, 2020))

print(wide_seasons)

<xarray.Dataset> Size: 2GB
Dimensions:           (latitude: 420, longitude: 510, year: 28)
Coordinates:
  * latitude          (latitude) float64 3kB 71.9 71.8 71.7 ... 30.2 30.1 30.0
  * longitude         (longitude) float64 4kB -11.0 -10.9 -10.8 ... 39.8 39.9
  * year              (year) int64 224B 1993 1994 1995 1996 ... 2018 2019 2020
Data variables: (12/36)
    stl1_season_DJF   (year, latitude, longitude) float64 48MB nan nan ... 283.6
    stl1_season_MAM   (year, latitude, longitude) float64 48MB nan nan ... 297.1
    stl1_season_JJA   (year, latitude, longitude) float64 48MB nan nan ... 308.3
    stl1_season_SON   (year, latitude, longitude) float64 48MB nan nan ... 298.8
    stl4_season_DJF   (year, latitude, longitude) float64 48MB nan nan ... 297.6
    stl4_season_MAM   (year, latitude, longitude) float64 48MB nan nan ... 295.5
    ...                ...
    tp_season_JJA     (year, latitude, longitude) float32 24MB nan ... 2.275e-06
    tp_season_SON     (year, latitude, lon

In [None]:
yearly = xr.open_dataset("data/era5_climate_data/era5-land-yearly-means-sliced.nc")
yearly2 = yearly.sel(year=slice(1993, 2020))
test = xr.merge([yearly2, wide_seasons])
print(test)

<xarray.Dataset> Size: 2GB
Dimensions:           (year: 28, latitude: 420, longitude: 510)
Coordinates:
    number            int64 8B ...
  * latitude          (latitude) float64 3kB 71.9 71.8 71.7 ... 30.2 30.1 30.0
  * longitude         (longitude) float64 4kB -11.0 -10.9 -10.8 ... 39.8 39.9
  * year              (year) int64 224B 1993 1994 1995 1996 ... 2018 2019 2020
Data variables: (12/45)
    vpd               (year, latitude, longitude) float64 48MB ...
    tp                (year, latitude, longitude) float32 24MB ...
    t2m               (year, latitude, longitude) float64 48MB ...
    stl1              (year, latitude, longitude) float64 48MB ...
    stl4              (year, latitude, longitude) float64 48MB ...
    snowc             (year, latitude, longitude) float64 48MB ...
    ...                ...
    tp_season_JJA     (year, latitude, longitude) float32 24MB nan ... 2.275e-06
    tp_season_SON     (year, latitude, longitude) float32 24MB nan ... 0.0001865
    t2m_se

In [20]:
test.to_netcdf("data/era5_climate_data/era5-land-yearly-means-sliced.nc")

In [11]:
ds_era5_sliced_seas.to_netcdf("data/era5_climate_data/seasonal-means.nc")

In [None]:
# def time_weights_days_in_month(ds):
#     """
#     Returns a DataArray of weights -> number of days in each month.
#     """
#     # dt.days_in_month exists for both numpy and cftime time indexes in recent xarray/cftime
#     w = xr.DataArray(ds['time'].dt.days_in_month, coords={'time': ds['time']}, dims='time')
#     w.name = 'days_in_month'
#     return w

# def resample_time_weighted_mean(ds, freq, weights):
#     """
#     Compute time-weighted means over resample bins
#     ds: Dataset or DataArray with time dimension
#     weights: 1D DataArray over 'time'
#     """
#     weights_nan_adjusted = xr.where(ds.notnull(), weights, 0).broadcast_like(ds)
#     num = (ds * weights_nan_adjusted).resample(time=freq).sum(skipna=True, min_count=1)
#     den = weights.resample(time=freq).sum(skipna=True)
#     return num / den

# def resample_time_weighted_mean(ds, freq, weights):
#     """
#     Days-weighted mean over resample bins.

#     ds: Dataset or DataArray with time dimension
#     weights: 1D DataArray over 'time'
#     """
#     def _group_mean(x):
#         w = weights.sel(time=x.time)
#         return x.weighted(w).mean('time')

#     return ds.resample(time=freq).map(_group_mean)

# def calculate_vpd(temp_kelvin, temp_dewpoint_kelvin):
#     """
#     Vapor pressure deficit (VPD) from air temperature (K) and dewpoint (K).
#     Uses Tetens formula.

#     es(T)  = 6.112 * exp(17.67 * Tc / (Tc + 243.5))     [hPa]
#     ea(Td) = 6.112 * exp(17.67 * Tdc / (Tdc + 243.5))   [hPa]
#     VPD    = (es - ea) / 10                             [hPa]
#     """
#     # Convert K to °C
#     temp_degrees = temp_kelvin - 273.15
#     temp_dewpoint_degrees = temp_dewpoint_kelvin - 273.15

#     # Tetens formula
#     es_hPa = 6.1078 * np.exp(17.269 * temp_degrees / (temp_degrees + 237.3))
#     ea_hPa = 6.1078 * np.exp(17.269 * temp_dewpoint_degrees / (temp_dewpoint_degrees + 237.3))

#     # Vapor Pressure Deficit es - ea in hPa
#     vpd_kPa = (es_hPa - ea_hPa)
#     return vpd_kPa


# def add_season_and_year_coords(ds):
#     """
#     After seasonal resampling with 'QS-DEC' (season starting on Dec 1),
#     add two convenience coords:
#     - 'season': DJF/MAM/JJA/SON string for each seasonal timestamp
#     - 'season_year': integer year attributed to the season (DJF -> Jan/Feb year)
#     """
#     month = ds['time'].dt.month
#     season = xr.full_like(month, '', dtype=object)
#     season = xr.where(month==12, 'DJF', season)
#     season = xr.where(month== 3, 'MAM', season)
#     season = xr.where(month== 6, 'JJA', season)
#     season = xr.where(month== 9, 'SON', season)
#     # The 'time' coordinate marks season start: 12->DJF, 3->MAM, 6->JJA, 9->SON
#     # Assign DJF to the year of Jan/Feb (i.e., if month==12, season_year = year+1)
#     season_year = ds['time'].dt.year.where(month != 12, ds['time'].dt.year + 1)

#     ds = ds.assign_coords(season=('time', season.data))
#     ds = ds.assign_coords(season_year=('time', season_year.data))
#     return ds

# def aggregate_era5_ds_OLD(ds, compute_vpd=True):
#     """
#     Returns:
#       clim_yearly  : annual aggregates (means for state vars, sums for tp)
#       clim_seasonal: seasonal aggregates (DJF/MAM/JJA/SON using QS-DEC)
#     """
#     # lat, lon = infer_lat_lon_names(ds)
#     time_weights = time_weights_days_in_month(ds)  # weights by # of days each month

#     # Optional: compute VPD from t2m & d2m (strong ecological signal)
#     if compute_vpd and 't2m' in ds and 'd2m' in ds:
#         ds = ds.copy()
#         ds['vpd'] = vpd_from_t_and_td(ds['t2m'], ds['d2m'])
#         ds['vpd'].attrs.update(units='kPa', long_name='Vapor pressure deficit')

#     # Variables to average (time-weighted means) vs to sum
#     mean_vars = [v for v in ['t2m','stl1','stl4','snowc','rsn','swvl1','swvl4','vpd'] if v in ds]
#     sum_vars  = [v for v in ['tp'] if v in ds]

#     ds_mean = ds[mean_vars] if mean_vars else xr.Dataset()
#     ds_sum  = ds[sum_vars]  if sum_vars  else xr.Dataset()

#     # Annual:
#     yearly_means = resample_time_weighted_mean(ds_mean, 'YS', time_weights) if mean_vars else xr.Dataset()
#     yearly_sums  = ds_sum.resample(time='YS').sum(skipna=True)   if sum_vars  else xr.Dataset()
#     ds_yearly  = xr.merge([yearly_means, yearly_sums])

#     # Tidy attrs
#     for v in ds_yearly.data_vars:
#         if v != 'tp':
#             ds_yearly[v].attrs['cell_method'] = 'time: mean within year (days-weighted)'
#         else:
#             ds_yearly[v].attrs['cell_method'] = 'time: sum within year'
 
#     return ds_yearly

# def aggregate_era5_ds(ds:xr.Dataset, compute_vpd=True, drop_d2m=False):
#     """
#     Build ONE dataset with both annual and seasonal aggregates, named:
#       <var>_yearly, <var>_season_DJF/MAM/JJA/SON
#     Means are days-weighted; precipitation (tp) is summed.
#     Output is indexed by 'year' (integer) plus your spatial dims.
#     """
#     ds = ds.rename({'valid_time': 'time'})

#     # Optional VPD
#     if compute_vpd and 't2m' in ds and 'd2m' in ds:
#         ds = ds.copy()
#         ds['vpd'] = vpd_from_t_and_td(ds['t2m'], ds['d2m'])
#         ds['vpd'].attrs.update(units='kPa', long_name='Vapor pressure deficit')

#     # What to average vs sum
#     mean_vars = [v for v in ['t2m','stl1','stl4','snowc','rsn','swvl1','swvl4','vpd'] if v in ds]
#     sum_vars  = [v for v in ['tp'] if v in ds]

#     ds_mean = ds[mean_vars] if mean_vars else xr.Dataset()
#     ds_sum  = ds[sum_vars]  if sum_vars  else xr.Dataset()

#     # Time weights
#     w = time_weights_days_in_month(ds)

#     # --- ANNUAL ---
#     yearly_means = resample_time_weighted_mean(ds_mean, 'YS', w) if mean_vars else xr.Dataset()
#     yearly_sums  = ds_sum.resample(time='YS').sum(skipna=True)   if sum_vars  else xr.Dataset()
#     ds_yearly    = xr.merge([yearly_means, yearly_sums])
#     ds_yearly    = ds_yearly.assign_coords(year=ds_yearly['time'].dt.year)\
#                              .swap_dims({'time':'year'}).drop_vars('time')

#     # --- SEASONAL (DJF/MAM/JJA/SON, DJF labeled with Jan/Feb year) ---
#     seas_means = resample_time_weighted_mean(ds_mean, 'QS-DEC', w) if mean_vars else xr.Dataset()
#     seas_sums  = ds_sum.resample(time='QS-DEC').sum(skipna=True)    if sum_vars  else xr.Dataset()
#     ds_season  = xr.merge([seas_means, seas_sums])
#     ds_season  = add_season_and_year_coords(ds_season)

#     # Build a single "wide" dataset: copy yearly vars + expand seasons as separate vars
#     out = {}

#     # Annual variables
#     for v in ds_yearly.data_vars:
#         out[f'{v}_yearly'] = ds_yearly[v]

#     # Seasonal variables
#     seasons = ['DJF','MAM','JJA','SON']
#     # use annual year index for alignment
#     years = ds_yearly['year']

#     for v in ds_season.data_vars:
#         for s in seasons:
#             sel = ds_season[v].where(ds_season['season'] == s, drop=True)
#             # carry season_year as the axis, then align to full year index
#             sel = sel.assign_coords(year=('time', ds_season['season_year']
#                                           .where(ds_season['season'] == s, drop=True).data))
#             sel = sel.swap_dims({'time':'year'}).drop_vars('time')
#             sel = sel.reindex(year=years)
#             out[f'{v}_season_{s}'] = sel

#     combined = xr.Dataset(out)

#     # Optional: drop dewpoint from the final dataset but keep derived VPD
#     if drop_d2m and 'd2m' in combined:
#         combined = combined.drop_vars('d2m')

#     # Attributes for transparency
#     for name in combined.data_vars:
#         if name.startswith('tp_'):
#             combined[name].attrs['cell_method'] = 'time: sum within year/season'
#         elif name.endswith('_yearly'):
#             combined[name].attrs['cell_method'] = 'time: mean within year (days-weighted)'
#         else:
#             combined[name].attrs['cell_method'] = 'time: mean within season (days-weighted)'

#     return combined


In [10]:
test = xr.open_dataset("data/era5_climate_data/annual_means_test.nc")

# Test if test["t2m"] and yearly_means["t2m"] have the same values
print("Are the values equal?", xr.DataArray.equals(test["t2m"], yearly_means))
print("Are the values close (allclose)?", np.allclose(test["t2m"], yearly_means, equal_nan=True))

Are the values equal? True
Are the values close (allclose)? True


In [24]:
print(test["swvl1"])

x = test["swvl1"]

<xarray.DataArray 'swvl1' (time: 29, latitude: 420, longitude: 510)> Size: 50MB
[6211800 values with dtype=float64]
Coordinates:
    number     int64 8B ...
  * latitude   (latitude) float64 3kB 71.9 71.8 71.7 71.6 ... 30.2 30.1 30.0
  * longitude  (longitude) float64 4kB -11.0 -10.9 -10.8 ... 39.7 39.8 39.9
  * time       (time) datetime64[ns] 232B 1992-01-01 1993-01-01 ... 2020-01-01
