# ClimData Tutorial
This notebook demonstrates usage of the `ClimData` class for climate data extraction, extreme index computation, and workflow management.
Includes examples for point-based and box-based extraction, variable exploration, and error handling.

# 1️⃣ Imports

In [1]:
from climdata import ClimData
import pandas as pd
import xarray as xr

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s | %(message)s",
    force=True,
)

## 2️⃣ Explore available datasets

In [2]:
extractor = ClimData()
datasets = extractor.get_datasets()
print(datasets)

['dwd', 'mswx', 'hyras', 'cmip', 'power', 'w5e5', 'cmip_w5e5', 'nexgddp']


## 3️⃣ Explore variables for a dataset

In [3]:
variables = extractor.get_variables('w5e5')
print(variables)

# for CMIP
import climdata
extractor_CMIP = climdata.CMIP(extractor.cfg)
print(extractor_CMIP.get_experiment_ids())
print(extractor_CMIP.get_source_ids('ssp245'))
print(extractor_CMIP.get_variables(experiment_id='ssp245',source_id='ACCESS-CM2'))


['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
   the typical Historical period (1850-2014).
   Data availability may be limited.
['historical', 'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp434', 'ssp460', 'ssp585']


/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  cat = self.__class__({'esmcat': self.esmcat.dict(), 'df': esmcat_results})
INFO | 46 models found for experiment 'ssp245'


['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'BCC-CSM2-MR', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'CESM2', 'CESM2-WACCM', 'CIESM', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5', 'CanESM5-CanOE', 'E3SM-1-1', 'EC-Earth3', 'EC-Earth3-CC', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3', 'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4', 'GISS-E2-1-G', 'GISS-E2-1-H', 'HadGEM3-GC31-LL', 'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'KACE-1-0-G', 'KIOST-ESM', 'MCM-UA-1-0', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL']
['hurs', 'pr', 'sfcWind', 'tas', 'tasmax', 'tasmin']


## 4️⃣ Explore metadata for a variable

In [7]:
variables = extractor.get_variables('w5e5')
print(variables)
print("*"*70)
varinfo = extractor.get_varinfo('rlds')
print(varinfo)

['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
**********************************************************************


ValueError: Variable 'rlds' not found in varinfo

## 5️⃣ Explore available workflow actions

In [5]:
actions = extractor.get_actions()
print(actions.keys())

dict_keys(['extract', 'calc_index', 'impute', 'to_nc', 'to_csv', 'upload_netcdf', 'upload_csv'])


In [6]:
indices = extractor.get_indices(['tasmin', 'tasmax'])
print(indices.keys())

impute_methods = extractor.get_impute_methods()
print(impute_methods.keys())

dict_keys(['heat_wave_index', 'heat_wave_frequency', 'heat_wave_max_length', 'heat_wave_total_length', 'hot_spell_frequency', 'hot_spell_max_length', 'hot_spell_total_length', 'hot_spell_max_magnitude', 'ice_days', 'isothermality', 'maximum_consecutive_frost_days', 'maximum_consecutive_frost_free_days', 'maximum_consecutive_tx_days'])
dict_keys(['BRITS', 'XGBOOST', 'CDRec', 'SoftImpute'])


## 6️⃣ Point extraction workflow

In [7]:
import json

# -----------------------------
# Step 1: Define the area of interest (AOI)
# -----------------------------
# The AOI is a single point. In GeoJSON format, the coordinates are [longitude, latitude].
geojson = {
  "type": "FeatureCollection",
  "features": [
    {
      "type": "Feature",
      "properties": {},
      "geometry": {
        "coordinates": [
          24.246667038198012,  # longitude
          12.891982026993958   # latitude
        ],
        "type": "Point"
      }
    }
  ]
}


# -----------------------------
# Step 2: Define configuration overrides
# -----------------------------
# Overrides are strings used by Hydra to modify default configurations at runtime.
overrides = [
    "dataset=cmip",  # Choose the MSWX dataset for extraction
    f"aoi='{json.dumps(geojson)}'",  # Set the AOI as the point defined above
    f"time_range.start_date=2004-01-01",  # Start date for data extraction
    f"time_range.end_date=2014-12-31",    # End date for data extraction
    "variables=[tasmin,tasmax,pr]",       # Variables to extract: min/max temp and precipitation
    "data_dir=/beegfs/muduchuru/data",    # Local directory to store raw/intermediate files
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",  # optional . required for MSWS data download
    "index=tn10p",  # Climate extreme index to calculate
    "impute=BRITS"
]

# -----------------------------
# Step 3: Define the workflow sequence
# -----------------------------
seq = ["extract", "impute", "calc_index", "to_nc"]

# -----------------------------
# Step 4: Initialize the ClimData extractor
# -----------------------------
extractor = ClimData(overrides=overrides)

# -----------------------------
# Step 5: Run the Multi-Step workflow
# -----------------------------
result = extractor.run_workflow(
    actions=seq,
)

INFO | Starting action: extract
/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  cat = self.__class__({'esmcat': self.esmcat.dict(), 'df': esmcat_results})
/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  cat = self.__class__({'esmcat': self.esmcat.dict(), 'df': esmcat_results})
/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.

In [8]:
import json

# -----------------------------
# Define the area of interest (AOI)
# -----------------------------
# This AOI is a single point with latitude 12.891982026993958 and longitude 24.246667038198012
geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "coordinates": [24.246667038198012, 12.891982026993958],
                "type": "Point"
            }
        }
    ]
}

# -----------------------------
# Define configuration overrides
# -----------------------------
# These strings override the default hydra config at runtime
overrides = [
    "dataset=mswx",  # Select the MSWX dataset for extraction
    f"aoi='{json.dumps(geojson)}'",  # Set AOI as the point defined above
    f"time_range.start_date=2014-12-01",  # Start date of extraction
    f"time_range.end_date=2014-12-31",    # End date of extraction
    "variables=[tasmin,tasmax,pr]",       # Variables to extract: min/max temperature & precipitation
    "data_dir=/beegfs/muduchuru/data",    # Local directory to store downloaded/intermediate files
    # Optional Google service account if needed for MSWX access
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",
    "index=tn10p",  # Extreme climate index to calculate
]

# -----------------------------
# Initialize the ClimData extractor
# -----------------------------
# This loads the configuration with overrides and prepares the object
extractor = ClimData(overrides=overrides)

# -----------------------------
# Extract climate data
# -----------------------------
# Returns an xarray.Dataset for the selected variables, AOI, and time range
ds = extractor.extract()

# -----------------------------
# Compute the climate index
# -----------------------------
# Takes the extracted dataset and calculates the extreme index "tn10p"
# Returns a new xarray.Dataset containing only the index
ds_index = extractor.calc_index(ds)

# -----------------------------
# Convert the index dataset to a long-form pandas DataFrame
# -----------------------------
# Each row corresponds to a time, lat, lon, and variable (here just "tn10p")
df_index = extractor.to_dataframe(ds_index)

# -----------------------------
# Save the DataFrame to CSV
# -----------------------------
# This will write the index values to "index.csv" in the current working directory
extractor.to_csv(df_index, filename="index.csv")


✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
✅ All 31 pr files already exist locally.


INFO | DataFrame saved to CSV file: index.csv


'index.csv'

### Output filenames

In [9]:
print(extractor.current_filename)
# print(extractor_point.filename_nc)

index.csv


## 7️⃣ Box extraction workflow

In [10]:
box_overrides = [
    "dataset=mswx",  # Select the MSWX dataset for extraction
    "region=europe", # Select the region
    "variables=[tasmin,tasmax]",
    f"time_range.start_date=2014-12-01",  # Start date of extraction
    f"time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=/beegfs/muduchuru/data",    # Local directory to store downloaded/intermediate files
]

extractor_box = ClimData(overrides=box_overrides)
result_box = extractor_box.run_workflow(actions=["extract", "to_csv"])


INFO | Starting action: extract


✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.


INFO | Completed action: extract
INFO | Starting action: to_csv
INFO | DataFrame saved to CSV file: mswx_tasmin_tasmax_LAT_34.0_71.0_LON_-25.0_45.0_2014-12-01_2014-12-31.csv
INFO | Completed action: to_csv


## 8️⃣ Compute extreme index only

In [None]:
lat_berlin, lon_berlin = [52.5,13.4]
idx_overrides = [
    "dataset=mswx",  # Select the MSWX dataset for extraction
    f"lat={lat_berlin}", # Select the region
    f"lon={lon_berlin}",
    "variables=[tasmin,tasmax]",
    f"time_range.start_date=2014-12-01",  # Start date of extraction
    f"time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=/beegfs/muduchuru/data",    # Local directory to store downloaded/intermediate files
    "index=heat_wave_max_length"
]


extractor_idx = ClimData(overrides=idx_overrides)
result_idx = extractor_idx.run_workflow(actions=["extract", "calc_index", "to_csv"])
result_idx.dataframe.head()

INFO | Starting action: extract


✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.


INFO | Completed action: extract
INFO | Starting action: calc_index
INFO | Completed action: calc_index
INFO | Starting action: to_dataframe
ERROR | Action 'to_dataframe' failed
Traceback (most recent call last):
  File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 862, in run_workflow
    raise ValueError(f"Unknown action '{action}'")
ValueError: Unknown action 'to_dataframe'


ValueError: Unknown action 'to_dataframe'

## 9️⃣ Error examples

In [None]:
try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["calc_index"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["to_csv"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["upload_netcdf"])
except Exception as e:
    print("Error:", e)

INFO | Starting action: calc_index
ERROR | Action 'calc_index' failed
Traceback (most recent call last):
  File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 760, in run_workflow
    raise ValueError(
ValueError: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.


Error: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.


INFO | Starting action: to_csv
ERROR | Action 'to_csv' failed
Traceback (most recent call last):
  File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 778, in run_workflow
    raise ValueError(
ValueError: Action 'to_csv' requires a DataFrame, but no DataFrame is available. Use 'to_dataframe' or upload a CSV before saving.


Error: Action 'to_csv' requires a DataFrame, but no DataFrame is available. Use 'to_dataframe' or upload a CSV before saving.


INFO | Starting action: upload_netcdf
ERROR | Action 'upload_netcdf' failed
Traceback (most recent call last):
  File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 717, in run_workflow
    raise ValueError(
ValueError: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.


Error: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.
