In [None]:
%matplotlib inline
import os
import sys

sys.path.append("../..")

import matplotlib.pyplot as plt
import nivapy3 as nivapy
import numpy as np
import pandas as pd
import seaborn as sn
import useful_rid_code as rid
from sqlalchemy import text

# Use seaborn's "notebook" plotting context for the figures drawn later on
sn.set_context("notebook")

In [None]:
# Connect to db. The resulting `engine` is passed to every database query and
# `rid` helper call in the cells below
engine = nivapy.da.connect()

In [None]:
# Year of interest: used for the database queries, the load calculations and
# the names of the input/output files below
year = 2023

# RID 2023-24: data processing notebook

## 1. Add 2023 datasets

### 1.1. Update flow datasets

The notebook `update_flow_nve_hydapi.ipynb` can be used to update flow datasets in RESA2. Note that not all datasets are necessarily available via HydAPI, so it is still sometimes necessary to request datasets directly from Trine at NVE.

### 1.2. Water chemistry quality control

Liv Bente has quality-checked the data in RESA and the necessary corrections have been made (see e-mail from Liv Bente received 23-08-2024).

### 1.3. Sample selections

Previous analysis for the RID report only used water samples collected as part of the "core" monitoring programme (i.e. not flood samples or those collected under Option 3). For 2021-5, the option 3 samples have been moved to a separate project called Bk-stations, so not sure whether this is still relevant?

In [None]:
# Workbook defining the site groupings introduced for 2017 to 2020
in_xlsx = r"../../../data/RID_Sites_List_2017-2020.xlsx"

# Load each grouping from its own worksheet, in a fixed order:
# "RID_20" -> rid_20_df, "RID_135" -> rid_135_df, "RID_All" -> rid_155_df
rid_20_df, rid_135_df, rid_155_df = [
    pd.read_excel(in_xlsx, sheet_name=sheet_name)
    for sheet_name in ("RID_20", "RID_135", "RID_All")
]

#### 1.3.1. Option 3/Bk-stations

In the programme for 2017-20, additional samples were collected under "Option 3". In the programme for 2021-25, this sampling takes place as an entirely separate project, called the "Bk-stasjoner" (RESA project ID 4591). Samples from these stations will be treated the same as the Option 3 samples previously (i.e. added to `sample_selection 65`).

**For 2023, there is no overlap between the RID_155 and the sampled BK_stations.**

In [None]:
# Stations registered under the Bk-project (RESA project ID 4591)
bk_df = nivapy.da.select_resa_project_stations([4591], engine)
print(len(bk_df), "stations in the Bk-project.")

# Station IDs that appear in both the Bk-project and the full RID station list
shared_stn_ids = set(bk_df["station_id"]) & set(rid_155_df["station_id"])

# Show the RID rows for the shared stations (an empty table means no overlap)
print("The following Bk-stations are also part of the RID 155.")
bk_in_155 = rid_155_df[rid_155_df["station_id"].isin(shared_stn_ids)]
bk_in_155

In [None]:
# # Get data from Bk-stations in RID 155 for year of interest
# bk_par_df = nivapy.da.select_resa_station_parameters(
#     bk_in_155, f"{year}-01-01", f"{year}-12-31", engine
# )
# bk_wc_df, bk_dup_df = nivapy.da.select_resa_water_chemistry(
#     bk_in_155, bk_par_df, f"{year}-01-01", f"{year}-12-31", engine
# )
# print(len(bk_wc_df), "samples to be linked to 'Option 3'.")
# bk_wc_df.head()

In [None]:
# # Add samples from Bk-stations also in RID 155 to "option 3"
# ws_ids = []
# for idx, row in bk_wc_df.iterrows():
#     sql = (
#         "SELECT water_sample_id FROM resa2.water_samples "
#         "WHERE station_id = %s "
#         "AND TRUNC(sample_date) = DATE '%s' "
#         "AND depth1 = %s "
#         "AND depth2 = %s"
#         % (
#             row["station_id"],
#             row["sample_date"].strftime("%Y-%m-%d"),
#             row["depth1"],
#             row["depth2"],
#         )
#     )
#     ws_id = engine.execute(sql).fetchall()[0]
#     assert len(ws_id) == 1
#     ws_id = ws_id[0]
#     ws_ids.append(ws_id)

# ws_df = pd.DataFrame(
#     {
#         "water_sample_id": ws_ids,
#     }
# )
# ws_df["sample_selection_id"] = 65

# assert len(bk_wc_df) == len(ws_df)

# # ws_df.to_sql(
# #     "sample_selections", con=engine, schema="resa2", if_exists="append", index=False
# # )

#### 1.3.2. Flood samples

Flood samples were taken during Storm Hans in August on Glomma and Drammenselva.

In [None]:
# Get WS IDs for flood samples. The spreadsheet must provide the columns
# "station_id", "sample_date", "depth1" and "depth2" for each flood sample
fl_xlsx = f"../../../data/flood_samples_{year}.xlsx"
fl_df = pd.read_excel(fl_xlsx, sheet_name=f"flood_samples_{year}")

# The statement is identical for every sample, so build it once outside the loop.
# Values are supplied as bind parameters rather than being formatted into the SQL.
# Sample times are compared as text to the nearest minute
sql = text(
    "SELECT water_sample_id FROM resa2.water_samples "
    "WHERE station_id = :station_id "
    "AND TO_CHAR(sample_date, 'YYYY-MM-DD HH24:MI') = :sample_date "
    "AND depth1 = :depth1 "
    "AND depth2 = :depth2"
)

ws_ids = []
with engine.connect() as connection:
    for idx, row in fl_df.iterrows():
        params = {
            "station_id": row["station_id"],
            "sample_date": row["sample_date"].strftime("%Y-%m-%d %H:%M"),
            "depth1": row["depth1"],
            "depth2": row["depth2"],
        }
        result = connection.execute(sql, params).fetchall()

        # Each listed flood sample must match exactly one water sample in the
        # database. Report the offending row so that it can be found and fixed
        assert len(result) == 1, (
            f"Expected exactly 1 water sample for row {idx} of '{fl_xlsx}' "
            f"({params}), but found {len(result)}."
        )
        ws_id = result[0][0]
        ws_ids.append(ws_id)

# Link the matched samples to sample selection 64 (flood samples)
ws_df = pd.DataFrame(
    {
        "water_sample_id": ws_ids,
    }
)
ws_df["sample_selection_id"] = 64

assert len(fl_df) == len(ws_df)

# Uncomment to write the new links to the database. Running this more than once
# will append the same rows again
# ws_df.to_sql(
#     "sample_selections", con=engine, schema="resa2", if_exists="append", index=False
# )

#### 1.3.3. Main programme

Everything else (i.e. not Option 3 or flood) is assumed to be part of the main programme.

In [None]:
# Get flood and Option 3 samples (sample selections 64 and 65)
sql = (
    "SELECT water_sample_id "
    "FROM resa2.sample_selections "
    "WHERE sample_selection_id IN (64, 65)"
)
oth_ws = pd.read_sql_query(sql, engine)
assert oth_ws["water_sample_id"].is_unique

# Build the station list for the IN clause explicitly. Formatting a Python tuple
# straight into the SQL gives "(123,)" for a single station and "()" for none,
# neither of which is valid SQL
stn_ids = rid_155_df["station_id"].astype(int)
assert len(stn_ids) > 0, "No stations found in 'rid_155_df'."
stn_id_list = ", ".join(str(stn_id) for stn_id in stn_ids)

# Get all WS associated with core sites during the year of interest
# (from 1st January of `year` up to, but not including, 1st January of `year + 1`)
sql = (
    "SELECT water_sample_id FROM resa2.water_samples "
    f"WHERE station_id IN ({stn_id_list}) "
    f"AND sample_date >= DATE '{year}-01-01' "
    f"AND sample_date < DATE '{year + 1}-01-01'"
)
all_ws = pd.read_sql_query(sql, engine)
assert all_ws["water_sample_id"].is_unique

# Remove flood and option 3 samples from core
core_ws = set(all_ws["water_sample_id"]) - set(oth_ws["water_sample_id"])

# Everything that remains is linked to sample selection 63 (main programme)
core_df = pd.DataFrame({"water_sample_id": list(core_ws)})
core_df["sample_selection_id"] = 63

print(len(core_df), "samples in the main programme.")

# Uncomment to write the new links to the database. Running this more than once
# will append the same rows again
# core_df.to_sql(
#     "sample_selections", con=engine, schema="resa2", if_exists="append", index=False
# )

## 2. Tabulate raw water chemistry and flow

### 2.1. Data for year of interest

From 2017 onwards, water chemistry samples have been collected at 20 sites (`RID_20`). In 2018, the station TROEMÅL2 was added to the RID_20 selection, making 21 stations in total. Both TROEMÅL and TROEMÅL2 were monitored in 2018, but from 2019 onwards TROEMÅL2 replaced TROEMÅL in the main programme (although TROEMÅL is sometimes still included as part of Option 3).

The data are exported to CSV format below.

**Added 19.05.2022:** For 2021-5, I have added an extra kwarg named `extract_flow` to the function below. This is because flow data are not ready for the new reporting deadlines in June, so we need to be able to process the concentration part without flows.

In [None]:
# Whether to get flow data. Set to False for spring processing (flow data are not
# ready for the reporting deadline in June) and True for autumn processing
extract_flow = True

In [None]:
# Output CSV
out_csv = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/concs_and_flows_rid_20_{year}.csv"

# Export water chemistry for the RID_20 stations for the year of interest, limited
# to sample selection 63 (the main programme). `extract_flow` controls whether
# flow data are extracted as well
df = rid.write_csv_water_chem(
    rid_20_df, year, out_csv, engine, samp_sel=63, extract_flow=extract_flow
)

### 2.2. Data for all years

**Added 03.09.2018**. From 2017, we include estimates of trends in the "main" rivers for both loads and concentrations for the period from 1990 to present.

**Added 19.05.2022**. Set `extract_flow` to `False` for the spring processing and `True` for the autumn processing.

In [None]:
%%capture

# Scratch path for the per-year CSV written by the export function. The file
# itself isn't needed here and is removed again below
out_csv = r"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/cons_and_flows_intermed.csv"

# Export every year from 1990 up to and including the year of interest, keeping
# the dataframe returned for each year
df_list = [
    rid.write_csv_water_chem(
        rid_20_df, data_yr, out_csv, engine, samp_sel=63, extract_flow=extract_flow
    )
    for data_yr in range(1990, year + 1)
]

# The intermediate file is no longer required
os.remove(out_csv)

# Stack the annual tables into one
df = pd.concat(df_list, axis=0)

# Station and sample columns come first, followed by flow (if extracted)
st_cols = [
    "station_id",
    "station_code",
    "station_name",
    "old_rid_group",
    "new_rid_group",
    "ospar_region",
    "sample_date",
]
if extract_flow:
    st_cols = st_cols + ["Qs_m3/s"]

# All remaining columns follow in sorted order
par_cols = sorted(col for col in df.columns if col not in st_cols)
df = df[st_cols + par_cols]

# Save the combined dataset
out_csv = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/concs_and_flows_rid_20_1990-{year}.csv"
df.to_csv(out_csv, encoding="utf-8", index=False)

## 3. Estimate observed loads

### 3.1. Annual flows

First get a dataframe of annual flow volumes to join to the summary output. **NB:** This dataframe isn't actually used in the loads calculations - they are handled separately - it's just for the output CSVs.

In [None]:
# Sites of interest: combine all site dfs into one (RID_20 followed by RID_135)
rid_all_df = pd.concat([rid_20_df, rid_135_df], axis=0)

# Get flow data, passing 1990 and the year of interest as the period.
# `q_df` is joined to the loads tables in later cells using "station_id" and "year"
q_df = rid.get_flow_volumes(rid_all_df, 1990, year, engine)

q_df.head()

### 3.2. Loads for all rivers

The code below is taken from Section 2 of [notebook 3](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/estimate_loads.ipynb). Loads are calculated directly from contemporary observations for the RID_20, and they are inferred from historic concentrations for the RID_135 sites.

As above, note the use of the `'samp_sel'` argument in the code below.

In [None]:
# All stations to process: the RID_20 table followed by the RID_135 table
rid_all_df = pd.concat([rid_20_df, rid_135_df], axis=0)

# Parameters to estimate loads for
par_list = [
    "SPM",
    "TOC",
    "PO4-P",
    "TOTP",
    "NO3-N",
    "NH4-N",
    "TOTN",
    "SiO2",
    "Ag",
    "As",
    "Pb",
    "Cd",
    "Cu",
    "Zn",
    "Ni",
    "Cr",
    "Hg",
]

# Estimate loads for the year of interest, one station at a time, using
# sample selection 63 (the main programme)
loads_list = [
    rid.estimate_loads(stn_id, par_list, year, engine, infer_missing=True, samp_sel=63)
    for stn_id in rid_all_df["station_id"].values
]

# Stack the per-station results and move the index into a "station_id" column
lds_all = pd.concat(loads_list, axis=0)
lds_all.index.name = "station_id"
lds_all = lds_all.reset_index()

# Flow data for just the year of interest
q_yr = q_df[q_df["year"] == year]

# Attach the station properties and then the annual flows
lds_all = lds_all.merge(rid_all_df, how="left", on="station_id").merge(
    q_yr, how="left", on="station_id"
)

# Columns to lead the output with, and columns to leave out altogether
st_cols = [
    "station_id",
    "station_code",
    "station_name",
    "old_rid_group",
    "new_rid_group",
    "ospar_region",
    "mean_q_1000m3/day",
]
unwant_cols = [
    "nve_vassdrag_nr",
    "lat",
    "lon",
    "utm_north",
    "utm_east",
    "utm_zone",
    "station_type",
    "year",
]

# Everything else is treated as a parameter column and keeps its current order
non_par_cols = set(st_cols + unwant_cols)
par_cols = [col for col in lds_all.columns if col not in non_par_cols]

# Remove the unwanted columns, then put the station columns first
lds_all = lds_all.drop(columns=unwant_cols)[st_cols + par_cols]

# Save to CSV
out_csv = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/loads_and_flows_all_sites_{year}.csv"
lds_all.to_csv(out_csv, encoding="utf-8", index=False)

### 3.3. Loads for the RID_20 rivers through time

The code below is taken from Section 3 of [notebook 3](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/estimate_loads.ipynb).

Note the use of the `'samp_sel'` argument in the code below.

In [None]:
# First and last years to process
st_yr, end_yr = 1990, year

# Collects one loads table for each station-year that returns a result
loads_list = []

for stn_id in rid_20_df["station_id"].values:
    for data_yr in range(st_yr, end_yr + 1):
        print("Processing Station ID %s for %s" % (stn_id, data_yr))

        # Estimate loads for this station and year using sample selection 63
        l_df = rid.estimate_loads(
            stn_id, par_list, data_yr, engine, infer_missing=True, samp_sel=63
        )

        # Nothing was returned for this station-year, so move on
        if l_df is None:
            continue

        # Move the index into a "station_id" column and tag the rows with the year
        l_df.index.name = "station_id"
        loads_list.append(l_df.reset_index().assign(year=data_yr))

# Stack all station-years into one table
lds_ts = pd.concat(loads_list, axis=0)

# Attach the station properties, then the flows for the matching station and year
lds_q_ts = lds_ts.merge(rid_20_df, how="left", on="station_id").merge(
    q_df, how="left", on=["station_id", "year"]
)

# Columns to lead the output with, and columns to leave out altogether
st_cols = [
    "station_id",
    "station_code",
    "station_name",
    "old_rid_group",
    "new_rid_group",
    "ospar_region",
    "mean_q_1000m3/day",
]
unwant_cols = [
    "nve_vassdrag_nr",
    "lat",
    "lon",
    "utm_north",
    "utm_east",
    "utm_zone",
    "station_type",
]

# Everything else (including "year") follows the station columns in its current order
non_par_cols = set(st_cols + unwant_cols)
par_cols = [col for col in lds_q_ts.columns if col not in non_par_cols]

# Remove the unwanted columns, then put the station columns first
lds_q_ts = lds_q_ts.drop(columns=unwant_cols)[st_cols + par_cols]

# Save to CSV
out_csv = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/loads_and_flows_rid_20_{st_yr}-{end_yr}.csv"
lds_q_ts.to_csv(out_csv, encoding="utf-8", index=False)

# Index the loads (without station properties or flows) by station and year,
# ready for further processing
lds_ts = lds_ts.set_index(["station_id", "year"])

In [None]:
%%capture
# This code cell produces lots of Deprecation Warnings from Seaborn/Pandas.
# %%capture suppresses all output from this cell to keep things tidy

# Output folder for plots. makedirs(exist_ok=True) also creates any missing parent
# folders and does not fail if the folder already exists
out_fold = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/ts_plots/rid_plots_to_{year}"
os.makedirs(out_fold, exist_ok=True)

# Create one figure per RID_20 station
for stn_id in rid_20_df["station_id"].values:
    # Get data for this station. `lds_ts` is indexed by (station_id, year), so
    # this leaves a dataframe indexed by year
    df = lds_ts.loc[stn_id]

    # Separate est and val cols. Column names are split on "_"; those whose second
    # part is "Est" are the est cols and all others are the val cols
    cols = df.columns
    est_cols = [i for i in cols if i.split("_")[1] == "Est"]
    val_cols = [i for i in cols if i.split("_")[1] != "Est"]

    # Move "year" from the index to a column. reset_index() returns a new
    # dataframe, which avoids modifying a slice of `df` in place
    val_df = df[val_cols].reset_index()
    est_df = df[est_cols].reset_index()

    # Convert to "long" format
    val_df = pd.melt(val_df, id_vars="year", var_name="par_unit")
    est_df = pd.melt(est_df, id_vars="year", var_name="par_est", value_name="est")

    # Get just par (the text before the first "_") for joining
    val_df["par"] = val_df["par_unit"].str.split("_", expand=True)[0]
    est_df["par"] = est_df["par_est"].str.split("_", expand=True)[0]

    # Join, so that each value has its "est" entry alongside
    df = pd.merge(val_df, est_df, how="left", on=["year", "par"])

    # Extract cols of interest
    df = df[["year", "par_unit", "value", "est"]]

    # Plot one panel per parameter, with bars coloured by the "est" column
    g = sn.catplot(
        x="year",
        y="value",
        hue="est",
        col="par_unit",
        col_wrap=3,
        data=df,
        kind="bar",
        dodge=False,
        sharex=False,
        sharey=False,
        alpha=0.5,
        aspect=2,
        legend=False,
    )

    # Rotate tick labels and tidy
    for ax in g.axes.flatten():
        for tick in ax.get_xticklabels():
            tick.set(rotation=45)
    plt.tight_layout()

    # Save, then close the figure so that open figures do not accumulate
    out_path = os.path.join(out_fold, f"{stn_id}.png")
    plt.savefig(out_path, dpi=200)
    plt.close()

The three files created above (`concs_and_flows_rid_20_{year}.csv`, `loads_and_flows_all_sites_{year}.csv` and `loads_and_flows_rid_20_1990-{year}.csv`) can now be imported into Excel and sent to NIBIO. The data layout is illustrated here:

C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Results\Loads_CSVs\rid_conc_and_loads_summaries_2016.xlsx

**NB:** For neatness, a couple of columns can be manually reordered so that the "flag" columns always come before the data columns.

## 4. Generate output tables for Word

### 4.1. Table 1: Raw water chemistry

The code below is based on Section 2 of [notebook 5](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/word_data_tables.ipynb).

**Updated 24/09/2018**

This function has been modified to reflect changes in the 2017-20 monitoring programme:

 1. The Word template now has pages for just the 20 "main" rivers, not the 11 + 36 rivers, as previously <br><br>
 
 2. Four new columns have been added for new parameters measured during 2017-20 (DOC, Part. C, Tot. Part. N and TDP) <br><br>
 
 3. Hours and minutes have been removed from the date-time column to create space for the new columns <br><br>
 
 4. I have corrected various typos in the database (and in the template):
 
     * `'Tot.part. N'` > `'Tot. Part. N'`
     * `'Vosso(Bolstadelvi)'` > `'Vosso (Bolstadelvi)'`
     * `'Nidelva(Tr.heim)'` > `'Nidelva (Tr.heim)'`
     * `'More than 70%LOD'` > `'More than 70% >LOD'` (template only) 
     
**Updated 25.08.2020**

For the 2019 data, "Målselv" has been replaced by a new station downstream, "Målselv v/gml E6-brua" (see e-mail from Øyvind received 18.08.2020 at 08:09 for details). I have updated the templates to reflect this.


**Updated 19.05.2022**

Flow data are not available for the new reporting deadline in June. Flow has therefore been removed from the template for 2021 onwards.

In [None]:
# Table 1: get the path of the Word document to fill in for the year of interest
# (presumably a fresh copy of template 1 - see `rid.copy_word_template`)
tab_path = rid.copy_word_template(1, year)

# Write water chemistry for the RID_20 stations using sample selection 63.
# `extract_flow` is hard-coded to False here, regardless of the `extract_flow`
# variable set earlier, because flow has been removed from the template for
# 2021 onwards
rid.write_word_water_chem_tables(
    rid_20_df, year, tab_path, engine, samp_sel=63, extract_flow=False
)

### 4.2. Table 2: Estimated loads at each site

The code below is based on Section 3 of [notebook 5](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/word_data_tables.ipynb).

**Updated 24/09/2018**

For the 2017-20 programme, we will only report loads for the 20 "main" rivers, not all 155. I have therefore simplified the Word template by deleting unnecessary rows. The function itself is unchanged.

In [None]:
# Table 2: get the path of the Word document to fill in for the year of interest
# (presumably a fresh copy of template 2 - see `rid.copy_word_template`)
tab_path = rid.copy_word_template(2, year)

# Loads CSV for all sites, as written by the "Loads for all rivers" cell above
loads_csv = f"/home/jovyan/shared/common/elveovervakingsprogrammet/results/measured_loads/loads_and_flows_all_sites_{year}.csv"

# Drop Målselv as no longer monitored in main programme
stn_df = rid_20_df.query("station_name != 'Målselv'")

# Fill in the loads table for the remaining stations using the values in the CSV
rid.write_word_loads_table(stn_df, loads_csv, tab_path, engine)