In [1]:
# Run this at the start of each session to install 'teotil3' in development
# (editable) mode, then restart the kernel before running the rest of the notebook
# %pip install -e /home/jovyan/projects/teotil3/

In [2]:
import os
import random
import warnings

import contextily as cx
import geopandas as gpd
import matplotlib.pyplot as plt
import nivapy3 as nivapy
import pandas as pd
import teotil3 as teo

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/geopandas deprecation and chained-assignment
# warnings, so consider narrowing or removing the filter when debugging.
warnings.simplefilter("ignore")

In [3]:
# Connect to JupyterHub's PostGIS database. The resulting engine is passed to
# teotil3 in Section 5 to read the annual point-discharge data
eng = nivapy.da.connect_postgis()

Connection successful.


# Improving suspended sediment coefficients in TEOTIL3

## Notebook 01: Find and filter SS data

**Run this notebook on a high memory machine (e.g. 48 GB).**

Ivar Berthling has provided a CSV of SS concentrations from NVE's monitoring network (see e-mail received 26.05.2025). 

Of the diffuse sources of SS in TEOTIL, agricultural losses are modelled by NIBIO and urban losses are based on coefficients from Åstebøl et al. (2012). However, **background** losses (from woodland, upland and glacier areas) are based on coefficients from Bogen (1996), which should be improved if possible.

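This notebook reads the NVE data, adds extra SS data from Vannmiljø, derives catchment boundaries for all stations, calculates land cover metrics for each catchment, and removes catchments that are very large or that contain point discharges of SS. The filtered dataset is then saved to disk.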

## 1. User options

In [4]:
# Folders containing the TEOTIL3 core datasets and the raw data from NVE
teo_data_dir = "/home/jovyan/shared/common/teotil3/core_data"
nve_data_dir = "/home/jovyan/shared/common/teotil3/nve_ss_data"

# NVE files (inside 'nve_data_dir'): SS concentrations and station metadata
ss_csv = "hydraII_1200.csv"
stn_csv = "hydraII_1200_metadata.csv"

# Upper limit on catchment area (km2). Very large catchments are not useful for
# this analysis and they increase processing time
max_cat_size = 2000

In [5]:
# Options for filtering data from Vannmiljø. These are applied to the downloaded
# registrations in Section 2.2

# List of par names to request from Vannmiljø (note that these are all considered
# to be "SS": they are not distinguished after download and values are averaged
# together per station and date)
vm_pars = ["TSM", "STS"]

# List of "activities" to consider. Only registrations whose activity name is in
# this list are kept
act_list = [
    "Annet",
    "Tiltaksorientert overvåking",
    "Forsuringsovervåking",
    "Problemkartlegging",
    "Elveovervåkingsprogrammet",
    "Basisovervåking - referanseforhold",
    "Basisovervåking - påvirka områder",
    "Overvåking av badevann",
    "Myndighetspålagt forurensningsovervåking",
    "Kartlegging av arter",
    "Økosystemovervåking i store innsjøer (ØkoStor)",
    "Referanseelver",
    "Overvåking av drikkevann",
    "Effekter av planlagt arealbruk",
    "Tiltaksovervåking i kalkede laksevassdrag",
    "Elvetilførselsprogrammet",
    "Effekter av vassdragsinngrep",
    "Økosystemovervåkning i ferskvann (ØkoFersk)",
    "Lokal overvåking av kalka vassdrag",
    "Overvåking av biologisk mangfold",
    "Miljøgifter i ferskvann (MilFersk)",
]

# List of "media" to consider (matched against the medium name of each sample)
med_list = ["Ferskvann"]

# List of waterbody types to consider (R = rivers)
cat_list = ["R"]

# Only use samples collected shallower than this depth (in metres). Both the upper
# and lower sample depths must be at or above this threshold; missing depths are
# treated as 0 m
depth_thresh = 2

## 2. Read raw data

### 2.1. Data from NVE

In [6]:
# Wide table of SS concentrations (all values in mg/l): one row per date and one
# column per station
nve_df = (
    pd.read_csv(os.path.join(nve_data_dir, ss_csv))
    .rename(columns={"Unnamed: 0": "date"})
    .assign(date=lambda wide: pd.to_datetime(wide["date"]))
    .set_index("date")
)

# Station properties, with lat/lon derived from the UTM co-ordinates
nve_stn_df = (
    pd.read_csv(os.path.join(nve_data_dir, stn_csv))
    .rename(columns={"Unnamed: 0": "station_id"})
    .drop(columns="perc_eff_bog")
)
nve_stn_df = nivapy.spatial.utm_to_wgs84_dd(
    nve_stn_df, zone="utm_zone", east="utm_east", north="utm_north"
)

# Every station with data must have metadata, and vice versa
stn_ids_data = set(nve_df.columns)
stn_ids_metadata = set(nve_stn_df["station_id"])
assert stn_ids_data == stn_ids_metadata

# Reshape to long format (one row per station and date), dropping missing and
# non-positive concentrations
nve_df = (
    nve_df.reset_index()
    .melt(id_vars="date", var_name="station_id", value_name="SS_mgpl")
    .dropna()
    .loc[:, ["station_id", "date", "SS_mgpl"]]
    .sort_values(["station_id", "date"])
    .query("SS_mgpl > 0")
)

# Keep just the station columns shared with the Vannmiljø station table
nve_stn_df = nve_stn_df[["station_id", "station_name", "lon", "lat"]]

display(nve_df.head())
display(nve_stn_df.head())

Unnamed: 0,station_id,date,SS_mgpl
832503,104.1.0,2006-05-04,1.5935
832504,104.1.0,2006-05-05,2.3915
832505,104.1.0,2006-05-06,1.4375
832506,104.1.0,2006-05-07,1.727
832507,104.1.0,2006-05-08,2.014


Unnamed: 0,station_id,station_name,lon,lat
0,161.28.0,Øvre Beiarelv,14.482063,66.692505
1,2.479.0,Li bru,10.000331,62.009974
2,124.15.0,Bøstad,11.059873,63.467091
3,38.1.0,Holmen,5.913059,59.498826
4,124.72.0,Øvre Gråelva,11.085682,63.483747


### 2.2. Data from Vannmiljø

In [7]:
# Get all registrations for the parameters of interest from Vannmiljø. The request
# is limited only by parameter and start date; all other filtering is done below
data = {
    "FromRegDate": "1900-01-01",
    "ParameterIDFilter": vm_pars,
}
vm_df = nivapy.da.post_data_to_vannmiljo("GetRegistrations", data=data)

# Tidy to cols of interest (Vannmiljø field name -> name used in this notebook)
names_dict = {
    "WaterLocationCode": "station_id",
    "Name": "station_name",
    "WaterCategory": "category",
    "CoordX": "utm33_east",
    "CoordY": "utm33_north",
    "FylkeID": "fylke_id",
    "Fylke": "fylke_name",
    "KommuneID": "kommune_id",
    "Kommune": "kommune_name",
    "VassdragsomradeID": "vassom_id",
    "Vassdragsomrade": "vassom_name",
    "VannomradeID": "vannom_id",
    "Vannomrade": "vannom_name",
    "VannregionID": "vannreg_id",
    "Vannregion": "vannreg_name",
    "WaterBodyID": "waterbody_id",
    "WaterBody": "waterbody_name",
    "FeatureType": "feature_type",
    "ActivityID": "activity_id",
    "ActivityName": "activity_name",
    "Employer": "employer",
    "Contractor": "contractor",
    "MediumName": "medium_name",
    "SamplingTime": "date",
    "UpperDepth": "upper_depth",
    "LowerDepth": "lower_depth",
    "FilteredSample": "filtered",
    "ParameterID": "par_id",
    "ParameterName": "par_name",
    "ValueOperator": "flag",
    "RegValue": "value",
    "Unit": "unit",
    "DetectionLimit": "lod",
    "QuantificationLimit": "loq",
}

# Columns describing the station (as opposed to the individual sample)
stn_cols = [
    "station_id",
    "station_name",
    "feature_type",
    "category",
    "fylke_id",
    "fylke_name",
    "kommune_id",
    "kommune_name",
    "vassom_id",
    "vassom_name",
    "vannom_id",
    "vannom_name",
    "vannreg_id",
    "vannreg_name",
    "waterbody_id",
    "waterbody_name",
    "utm33_east",
    "utm33_north",
]
vm_df = vm_df[list(names_dict)].rename(columns=names_dict)

# Keep only rows matching the chosen media, waterbody categories and activities,
# and drop filtered samples, values flagged '>' and non-positive values. An explicit
# copy is taken so that columns can be modified without chained-assignment issues
# (warnings about this would be hidden by the global warnings filter)
vm_df = vm_df.query(
    "(medium_name in @med_list) and "
    "(category in @cat_list) and "
    "(activity_name in @act_list) and "
    "(filtered == False) and "
    "(flag != '>') and "
    "(value > 0)"
).copy()

# Treat missing depths as surface samples, then keep only shallow samples
vm_df["upper_depth"] = vm_df["upper_depth"].fillna(0)
vm_df["lower_depth"] = vm_df["lower_depth"].fillna(0)
vm_df = vm_df.query(
    "(lower_depth <= @depth_thresh) and (upper_depth <= @depth_thresh)"
).copy()
vm_df["date"] = pd.to_datetime(vm_df["date"])

# One row per station (the first occurrence of each station ID is kept)
vm_stn_df = vm_df[stn_cols].drop_duplicates(subset="station_id")

# Add lat and lon cols
vm_stn_df["utm_zone"] = 33
vm_stn_df = nivapy.spatial.utm_to_wgs84_dd(
    vm_stn_df, zone="utm_zone", east="utm33_east", north="utm33_north"
)
vm_stn_df = vm_stn_df[["station_id", "station_name", "lon", "lat"]]

# Aggregate chem data. All values must share a single unit (mg/l) before averaging;
# fail with a readable message if zero or several units are present
units = vm_df["unit"].unique().tolist()
assert units == ["mg/l"], f"Expected all values in mg/l, but found units: {units}"
vm_df = vm_df[["station_id", "date", "value"]].sort_values(["station_id", "date"])
vm_df.columns = ["station_id", "date", "SS_mgpl"]

# Samples sharing a station and an exact timestamp. Not used further in this cell.
# NOTE(review): identified before timestamps are truncated to dates, so samples
# taken at different times on the same day are not listed here, although they are
# averaged together below
vm_dup_df = vm_df[vm_df.duplicated(["station_id", "date"], keep=False)]

# Truncate timestamps to dates and average to one value per station per day
vm_df["date"] = vm_df["date"].dt.normalize()
vm_df = vm_df.groupby(["station_id", "date"]).mean().reset_index()

display(vm_df.head())
display(vm_stn_df.head())

Unnamed: 0,station_id,date,SS_mgpl
0,001-27954,1978-02-20,0.4
1,001-27954,1978-10-25,0.5
2,001-27955,1978-02-20,0.5
3,001-27958,1978-02-21,1.9
4,001-27958,1978-10-24,0.8


Unnamed: 0,station_id,station_name,lon,lat
1876,124-27874,Vollselva 4,10.858576,63.498918
1877,124-27875,Kvithamarbekken,10.883033,63.481171
1878,003-27942,Veidalselva,10.896369,59.485378
2223,003-27945,Svinna,10.946741,59.441902
2665,003-27946,Mørkelva,10.923217,59.490118


In [8]:
# Tag each station table with its origin before stacking
nve_stn_df["source"] = "NVE"
vm_stn_df["source"] = "Vannmiljø"

# Stack NVE and Vannmiljø (chemistry and stations), renumbering rows from zero
df = pd.concat([nve_df, vm_df], axis=0, ignore_index=True)
stn_df = pd.concat([nve_stn_df, vm_stn_df], axis=0, ignore_index=True)

# Build point geometries from lon/lat (EPSG 4326) and reproject to EPSG 25833
stn_points = gpd.points_from_xy(stn_df["lon"], stn_df["lat"], crs="epsg:4326")
stn_gdf = gpd.GeoDataFrame(stn_df, geometry=stn_points).to_crs("epsg:25833")

print(f"{len(stn_df)} stations in dataset.")

display(df.head())
display(stn_df.head())

3807 stations in dataset.


Unnamed: 0,station_id,date,SS_mgpl
0,104.1.0,2006-05-04,1.5935
1,104.1.0,2006-05-05,2.3915
2,104.1.0,2006-05-06,1.4375
3,104.1.0,2006-05-07,1.727
4,104.1.0,2006-05-08,2.014


Unnamed: 0,station_id,station_name,lon,lat,source
0,161.28.0,Øvre Beiarelv,14.482063,66.692505,NVE
1,2.479.0,Li bru,10.000331,62.009974,NVE
2,124.15.0,Bøstad,11.059873,63.467091,NVE
3,38.1.0,Holmen,5.913059,59.498826,NVE
4,124.72.0,Øvre Gråelva,11.085682,63.483747,NVE


## 3. Catchment boundaries

In [9]:
%%capture

# Derive a catchment polygon for every station, using its lon/lat (EPSG 4326) as
# the outlet location. Output from the call is hidden by the %%capture cell magic.
# NOTE(review): 'min_size_km2' and 'dem_res_m' presumably set the smallest catchment
# considered and the resolution of the elevation model used - confirm against the
# nivapy documentation.
cat_gdf = nivapy.spatial.derive_watershed_boundaries(
    stn_df,
    id_col="station_id",
    xcol="lon",
    ycol="lat",
    crs="epsg:4326",
    min_size_km2=2,
    dem_res_m=40,
    buffer_km=None,
    temp_fold=None,
    reproject=False,
)

In [10]:
# Report how many boundaries were derived. Stations without a catchment are removed
# from the station and chemistry tables when the data are filtered in Section 6
print(f"Boundaries derived for {len(cat_gdf)} catchments.")

Boundaries derived for 3802 catchments.


## 4. Metrics for filtering

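The dataset needs filtering based on catchment area, the proportion of each catchment occupied by lakes, and the proportion of (agricultural + urban) land. The code below calculates all three metrics, but only the area filter is applied in this notebook; the lake and (agricultural + urban) percentages are saved with the catchment summary so they can be used for filtering later.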

In [11]:
# Catchment area (km2), calculated in an equal-area projection. Catchments larger
# than 'max_cat_size' are discarded; the copy keeps later column assignments from
# acting on a slice of the unfiltered frame
cat_gdf["area_km2"] = cat_gdf.to_crs({"proj": "cea"}).geometry.area / 1e6
cat_gdf = cat_gdf.query("area_km2 <= @max_cat_size").copy()
print(f"{len(cat_gdf)} catchments with area <= {max_cat_size} km2.")

# Read AR50
ar50_gdb = os.path.join(teo_data_dir, "land_cover", "nibio_ar50.gdb")
ar50_gdf = gpd.read_file(ar50_gdb, driver="fileGDB", layer="org_ar_ar50_flate")

# Map AR50 'artype' codes to the land cover classes in the 'teotil' column.
# NOTE(review): polygons whose 'artype' is missing from the lookup get a NaN class
# and are silently excluded by the groupby below - confirm the lookup is complete
print("Reclassifying...")
land_class_csv = r"https://raw.githubusercontent.com/NIVANorge/teotil3/main/data/ar50_artype_classes.csv"
artype_df = pd.read_csv(land_class_csv)
ar50_gdf = pd.merge(ar50_gdf, artype_df, how="left", on="artype")
ar50_gdf = ar50_gdf[["teotil", "geometry"]]

print("Reprojecting to equal area...")
cat_gdf_cea = cat_gdf.to_crs({"proj": "cea"})
ar50_gdf_cea = ar50_gdf.to_crs({"proj": "cea"})

# Accessing .sindex builds the spatial indexes before the overlay
cat_gdf_cea.sindex
ar50_gdf_cea.sindex

print("Intersecting polygons...")
int_gdf = gpd.overlay(
    cat_gdf_cea, ar50_gdf_cea, how="intersection", keep_geom_type=True
)
# Replace the catchment area carried over from 'cat_gdf_cea' with the area of each
# (catchment x land cover) intersection polygon
int_gdf["area_km2"] = int_gdf["geometry"].area / 1e6

# Total area of each land cover class per catchment, as one column per class
print("Aggregating...")
lc_df = int_gdf.groupby(["station_id", "teotil"])["area_km2"].sum()
lc_df = lc_df.unstack("teotil")
lc_df.columns = [f"a_{i}_km2" for i in lc_df.columns]
lc_df = lc_df.reset_index()
lc_df.columns.name = ""

# Use the GeoDataFrame's own merge so the result stays a GeoDataFrame
cat_gdf = cat_gdf.merge(lc_df, on="station_id", how="left")
cols = [
    "a_agri_km2",
    "a_glacier_km2",
    "a_lake_km2",
    "a_sea_km2",
    "a_upland_km2",
    "a_urban_km2",
    "a_wood_km2",
    "a_other_km2",
]
for col in cols:
    if col in cat_gdf.columns:
        # Class absent from this catchment => zero area
        cat_gdf[col] = cat_gdf[col].fillna(0)
    else:
        # Class absent from every catchment: add it as zero so that the calculations
        # below (and later column selections) do not fail with a KeyError
        cat_gdf[col] = 0.0

# Get pct (urban + agri)
cat_gdf["a_anthrop_km2"] = cat_gdf["a_agri_km2"] + cat_gdf["a_urban_km2"]
cat_gdf["a_anthrop_pct"] = 100 * cat_gdf["a_anthrop_km2"] / cat_gdf["area_km2"]

# Get pct lake
cat_gdf["a_lake_pct"] = 100 * cat_gdf["a_lake_km2"] / cat_gdf["area_km2"]

3527 catchments with area <= 2000 km2.
Reclassifying...
Reprojecting to equal area...
Intersecting polygons...
Aggregating...


## 5. Remove catchments with point discharges

In [12]:
# Annual point discharges of SS from industry and large wastewater sites, one table
# per (year, source) combination for 2013 to 2023 inclusive
df_list = [
    teo.io.get_raw_annual_point_data(eng, year, source, par_list=["ss_kg"])
    for year in range(2013, 2024)
    for source in ["industry", "large wastewater"]
]
pt_df = pd.concat(df_list, axis=0)

# Outlet co-ordinates are UTM zone 33; convert to lat/lon
pt_df["utm_zone"] = 33
pt_df = nivapy.spatial.utm_to_wgs84_dd(
    pt_df, zone="utm_zone", east="outlet_x_utm33", north="outlet_y_utm33"
)

# Unique sites with a positive SS discharge
has_ss = pt_df["SS_kg"] > 0
pt_df = pt_df.loc[has_ss, ["site_id", "lon", "lat"]].drop_duplicates()

# Match each discharge site to the catchment(s) of interest containing it; sites
# that fall outside all catchments have no station ID and are discarded
cats_with_pt = nivapy.spatial.identify_point_in_polygon(
    pt_df,
    cat_gdf,
    pt_col="site_id",
    poly_col="station_id",
    lat_col="lat",
    lon_col="lon",
).dropna(subset="station_id")

# Drop catchments with point discharges of SS
cat_ids_to_drop = cats_with_pt["station_id"].unique().tolist()
n_drop = len(cat_ids_to_drop)
cat_gdf = cat_gdf[~cat_gdf["station_id"].isin(cat_ids_to_drop)]
print(
    f"\n{n_drop} catchments contain point inputs.\n"
    f"These have been removed from the dataset. {len(cat_gdf)} catchments remain."
)


398 catchments contain point inputs.
These have been removed from the dataset. 3129 catchments remain.


## 6. Summary

In [13]:
# Columns to keep in the final catchment table, in output order (the sea area and
# the intermediate 'a_anthrop_km2' column are not retained)
id_cols = ["source", "station_id", "station_name", "lon", "lat"]
area_cols = [
    "area_km2",
    "a_agri_km2",
    "a_glacier_km2",
    "a_lake_km2",
    "a_upland_km2",
    "a_urban_km2",
    "a_wood_km2",
    "a_other_km2",
]
pct_cols = ["a_lake_pct", "a_anthrop_pct"]
cols = id_cols + area_cols + pct_cols + ["geometry"]
cat_gdf = cat_gdf[cols].copy()

# Restrict the station tables and the chemistry data to stations that still have a
# catchment after filtering
valid_cat_ids = cat_gdf["station_id"].unique().tolist()
stn_gdf = stn_gdf[stn_gdf["station_id"].isin(valid_cat_ids)]
stn_df = stn_df[stn_df["station_id"].isin(valid_cat_ids)]
df = df[df["station_id"].isin(valid_cat_ids)]

# Exactly one catchment per remaining station
assert len(stn_df) == len(cat_gdf)

# Attribute-only copy of the catchments (no geometry), for tabular output
cat_df = cat_gdf.drop(columns="geometry")

display(stn_df.head())
display(df.head())
display(cat_df.head())

Unnamed: 0,station_id,station_name,lon,lat,source
0,161.28.0,Øvre Beiarelv,14.482063,66.692505,NVE
1,2.479.0,Li bru,10.000331,62.009974,NVE
3,38.1.0,Holmen,5.913059,59.498826,NVE
5,156.26.0,Høgtuvbreen,13.688714,66.459726,NVE
8,159.7.0,Dimdalen,13.862203,66.725134,NVE


Unnamed: 0,station_id,date,SS_mgpl
0,104.1.0,2006-05-04,1.5935
1,104.1.0,2006-05-05,2.3915
2,104.1.0,2006-05-06,1.4375
3,104.1.0,2006-05-07,1.727
4,104.1.0,2006-05-08,2.014


Unnamed: 0,source,station_id,station_name,lon,lat,area_km2,a_agri_km2,a_glacier_km2,a_lake_km2,a_upland_km2,a_urban_km2,a_wood_km2,a_other_km2,a_lake_pct,a_anthrop_pct
0,Vannmiljø,001-27954,Stømselva ved Strømsfoss,11.66133,59.301617,1156.631931,149.685332,0.0,98.484435,60.833131,15.077529,831.296826,1.254679,8.51476,14.245055
1,Vannmiljø,001-27955,"Ørjeelva, utløp Rødenessjøen",11.651673,59.480705,1008.740121,132.659508,0.0,79.609429,54.036729,13.606848,727.673067,1.15454,7.891966,14.499905
2,Vannmiljø,001-27958,Lierelva ved Lierfoss,11.532151,59.919631,133.905854,22.059571,0.0,4.68034,4.927153,3.015241,99.223549,0.0,3.495246,18.725703
3,Vannmiljø,001-27959,Leirelva ved Berger,11.494237,59.927223,121.082054,17.666327,0.0,4.629687,4.895623,2.436345,91.454072,0.0,3.823594,16.602519
4,Vannmiljø,001-27961,Haretonelva ved Solbergbråten,11.496777,59.96744,45.526412,0.752827,0.0,3.108178,3.12583,0.04961,38.489966,0.0,6.827197,1.762574


In [14]:
# Quick-look map of the stations remaining after filtering, located by lon/lat, with
# clustering enabled and the station ID used for the popups
nivapy.spatial.quickmap(
    stn_df, cluster=True, popup="station_id", lon_col="lon", lat_col="lat"
)

In [15]:
# Save results next to the raw NVE data: catchment polygons as a GeoPackage, and the
# chemistry, station and catchment tables as sheets in a single Excel workbook.
# NOTE(review): an Excel sheet holds at most 1,048,576 rows - confirm that 'df'
# stays below this limit
gpkg_path = os.path.join(nve_data_dir, "filtered_catchments.gpkg")
xlsx_path = os.path.join(nve_data_dir, "filtered_data.xlsx")

cat_gdf.to_file(gpkg_path)
with pd.ExcelWriter(xlsx_path) as writer:
    for sheet_name, sheet_df in [
        ("data", df),
        ("stations", stn_df),
        ("catchments", cat_df),
    ]:
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)