In [1]:
# %pip install xarray netCDF4

In [2]:
import xarray as xr
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime, timedelta
from pathlib import Path

# Prepare datasets for DDAS

## 0. User inputs

In [3]:
# Define input files 
# 1. Raw water chemistry data from rivers
wc_raw_data_dir = Path("../../data/river/water_chemistry/raw")
wc_raw_data_file_paths = list(wc_raw_data_dir.glob("water_chem_*_raw_12-06-2025.csv"))

# 2. Stations metadata
stn_metadata_file_paths = list(wc_raw_data_dir.glob("water_chem_*_metadata_12-06-2025.csv"))

# 3. Parameters metadata
pars_metadata_file_path = "../../data/river/water_chemistry/raw/water_chemistry_parameters_units.csv"

# Define a mapping from parameter names to long names
standard_name_map = {
    "DOC1539": "Dissolved Organic Carbon",
    "DOC570": "Dissolved Organic Carbon",
    "Farge": "Farge/ Water Color",
    "NH4-N": "Ammonium",
    "NO3+NO2-N": "Nitrate and Nitrite",
    "NO3-N": "Nitrate",
    "PO4-P": "Total Reactive Phosphorus",
    "POC": "Particulate Organic Carbon",
    "STS": "Suspendert Tørrstoff/ Suspended Particulate Matter", 
    "Si": "Silicon",
    "SiO2": "Silica", 
    "TOC": "Total Organic Carbon",
    "TOTN": "Total Organic Nitrogen",
    "TOTN_EF_usikker": "Total Organic Nitrogen",
    "TOTP": "Total Phosphorus",
    "TOTP_F": "Total Dissolved Phosphorus",
    "TOTP_P": "Total Particulate Phosphorus",
    "TSM": "Total Suspended Matter",
    "UV-Abs. 254nm": "ultraviolet_absorbance_254nm",
    "UV-Abs. 410nm": "ultraviolet_absorbance_410nm",
}

# Where to save 
output_dir = Path("../../data/river/water_chemistry/raw") 
output_dir.mkdir(parents=True, exist_ok=True)

## 1. Load data

In [4]:
# Read raw water chemistry data and metadata
wc_raw_dfs = [pd.read_csv(fp) for fp in wc_raw_data_file_paths]
stn_metadata_dfs = [pd.read_csv(fp) for fp in stn_metadata_file_paths]
pars_metadata_df = pd.read_csv(pars_metadata_file_path)

## 2. Create datasets and assign metadata

In [5]:
rivers_raw_datasets = []

for df, meta_df in zip(wc_raw_dfs, stn_metadata_dfs):
    # Make sure date is datetime
    df["sample_date"] = pd.to_datetime(df["sample_date"])

    # Remove constant fields from dataframe to avoid them being treated as varying
    constant_fields = ["station_id", "station_code", "station_name", "station_type"]
    df_clean = df.drop(columns=constant_fields, errors="ignore")

    # Convert dataframe to dataset
    ds = xr.Dataset.from_dataframe(df_clean.set_index("sample_date"))

    # Add coordinates 
    lat = meta_df["latitude"].iloc[0]
    lon = meta_df["longitude"].iloc[0]
    ds = ds.assign_coords(
        latitude=xr.DataArray(lat, dims=(), attrs={"standard_name": "latitude", "units": "degree_north"}),
        longitude=xr.DataArray(lon, dims=(), attrs={"standard_name": "longitude", "units": "degree_east"})
    )

    # Add station info
    meta_row = meta_df.iloc[0]
    ds["station_id"] = xr.DataArray(meta_row["station_id"], dims=())
    ds["station_code"] = xr.DataArray(meta_row["station_code"], dims=())
    ds["station_name"] = xr.DataArray(meta_row["station_name"], dims=(), attrs={"cf_role": "timeseries_id"})
    ds["station_type"] = xr.DataArray(meta_row["station_type"], dims=())

    rivers_raw_datasets.append(ds)


## 3. Add metadata for each data variable 

In [6]:
for ds in rivers_raw_datasets:
    for var in ds.data_vars:
        print()
        if var in ["station_id", "station_code", "station_name", "station_type", "depth1", "depth2"]:
            continue

        # Special case: DOCs have parameter_id in their name
        if var.startswith("DOC") and var[3:].isdigit():
            param_id = int(var[3:])
            match = pars_metadata_df[pars_metadata_df["parameter_id"] == param_id]
            ds[var].attrs["comment"] = (
                "DOC has been renamed using the parameter ID to distinguish between two DOC measurements."
            )
        else:
            match = pars_metadata_df[pars_metadata_df["parameter_name"] == var]

        if not match.empty:
            row = match.iloc[0]
            param_name = row["parameter_name"]
            ds[var].attrs["parameter_id"] = int(row["parameter_id"])
            ds[var].attrs["units"] = row["unit"]
            ds[var].attrs["parameter_name"] = param_name

            long_name = standard_name_map.get(var, param_name)
            ds[var].attrs["long_name"] = long_name

            # Comment for specific var
            if var == "TOTN_EF_usikker":
                ds[var].attrs["comment"] = (
                    "Total Nitrogen from a different monitoring program. This is often used to fill missing TOTN values."
                )
        else:
            print(f"Warning! No metadata found for variable '{var}'")


















































































In [7]:
rivers_raw_datasets[0]["DOC1539"].attrs

{'comment': 'DOC has been renamed using the parameter ID to distinguish between two DOC measurements.',
 'parameter_id': 1539,
 'units': 'mg/L C',
 'parameter_name': 'DOC',
 'long_name': 'Dissolved Organic Carbon'}

In [8]:
rivers_raw_datasets[1]["DOC570"].attrs

{'comment': 'DOC has been renamed using the parameter ID to distinguish between two DOC measurements.',
 'parameter_id': 570,
 'units': 'mg/l',
 'parameter_name': 'DOC',
 'long_name': 'Dissolved Organic Carbon'}

In [9]:
rivers_raw_datasets[2]["STS"].attrs

{'parameter_id': 396,
 'units': 'mg/l',
 'parameter_name': 'STS',
 'long_name': 'Suspendert Tørrstoff/ Suspended Particulate Matter'}

## 4. Assign global attributes

In [11]:
for ds in rivers_raw_datasets:
    station_name = ds["station_name"].values.item()
    station_code = ds["station_code"].values.item()

    ds.attrs = dict(
        # id= ,
        naming_authority="niva.no",
        title=f"Water chemistry measurements at station {station_name}",
        title_no=f"Kjemiske målinger ved stasjon {station_name}",
        summary=f"Long-term water chemistry monitoring at station {station_name} (code: {station_code})",
        summary_no=f"Langsiktige vannkjemiske målinger ved stasjon {station_name} (kode: {station_code})",
        keywords="GCMDSK:EARTH SCIENCE > WATER QUALITY > CHEMISTRY, GCMDLOC:CONTINENT > EUROPE > NORWAY",
        keywords_vocabulary="GCMDSK:GCMD Science Keywords, GCMDLOC:GCMD Locations",
        iso_topic_category="inlandWaters",
        featureType="timeseries",
        date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        project="AquaInfra",
        time_coverage_start=np.datetime_as_string(ds.sample_date.min().values, unit="s", timezone="UTC"),
        time_coverage_end=np.datetime_as_string(ds.sample_date.max().values, unit="s", timezone="UTC"),
        geospatial_lat_min=float(ds.latitude.values),
        geospatial_lat_max=float(ds.latitude.values),
        geospatial_lon_min=float(ds.longitude.values),
        geospatial_lon_max=float(ds.longitude.values),
        spatial_representation="point",
        creator_type="institution",
        creator_institution="NIVA",
        institution="NIVA",
        institution_short_name="NIVA",
        creator_email="areti.balkoni@niva.no",
        creator_url="https://www.niva.no/en/employees/areti-balkoni",
        data_owner="NIVA",
        processing_level="Raw",
        Conventions="CF-1.7, ACDD-1.3",
        publisher_name="NIVA",
        publisher_email="publisher@niva.no",
        publisher_url="https://www.niva.no",
        license="http://spdx.org/licenses/CC-BY-4.0(CC-BY-4.0)",
        history="Created on jupyterhub",
    )

  date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
  date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
  date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),


In [12]:
rivers_raw_datasets[2]

## 5. Store the datasets 

In [14]:
for ds in rivers_raw_datasets:
    station_id = ds["station_id"].values.item()
    filename = output_dir / f"riverchem_{station_id}.nc"

    ds.to_netcdf(
        path=filename,
        mode="w",
        format="NETCDF4",
        unlimited_dims=["sample_date"], 
        encoding=dict(
            sample_date={
                "dtype": "int32",
                "_FillValue": None,
                "units": "seconds since 1970-01-01 00:00:00",
            },
            longitude={"_FillValue": None},
            latitude={"_FillValue": None},
        ),
    )
    print(f"Saved {filename}")

Saved ..\..\data\river\water_chemistry\raw\riverchem_Drammenselva.nc
Saved ..\..\data\river\water_chemistry\raw\riverchem_Glomma, Sarpsfossen.nc


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\ARB\\PycharmProjects\\pythonProject\\niva-aquainfra\\data\\river\\water_chemistry\\raw\\riverchem_Numedalslågen v\\E18.nc'