In [1]:
import glob
import os
import shutil
import sqlite3

import altair as alt
import numpy as np
import pandas as pd

# Use Altair's "json" data transformer for the charts in this notebook. The
# alternative considered was to lift the row limit instead:
#     alt.data_transformers.disable_max_rows()
# NOTE(review): the "json" transformer presumably writes the chart data to
# separate .json files in the working directory (the last code cell moves all
# *.json files to the 'pages' folder) - confirm against the Altair docs.
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

# Tiltaksovervakingen: opsjon for kvalitetskontroll av analysedata
## Notebook 2: Visualising overall parameter distributions

This notebook produces the plots [here](https://nivanorge.github.io/tiltaksovervakingen/pages/distribution_plots.html).

## 1. Read data

Read the tables from the tidied database.

In [2]:
# Connect to database
dbname = "kalk_data.db"

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which only shows up later as a confusing "no such table" error
# (and leaves a stray empty file behind). Fail early with a clear message.
if not os.path.isfile(dbname):
    raise FileNotFoundError(f"Database file not found: {os.path.abspath(dbname)}")

# PARSE_DECLTYPES asks sqlite3 to convert values based on the declared column types
eng = sqlite3.connect(dbname, detect_types=sqlite3.PARSE_DECLTYPES)

In [3]:
# Load the station and parameter/unit tables
stn_df = pd.read_sql("SELECT * FROM stations", eng)
par_df = pd.read_sql("SELECT * FROM parameters_units", eng)

# Load the water chemistry, parse the sample dates and add a combined
# 'parameter_unit' column (parameter and unit joined by an underscore)
wc_df = pd.read_sql("SELECT * FROM water_chemistry", eng).assign(
    sample_date=lambda df: pd.to_datetime(
        df["sample_date"], format="%Y-%m-%d %H:%M:%S"
    ),
    parameter_unit=lambda df: df["parameter"] + "_" + df["unit"],
)

In [4]:
# Set axis scale for plots.
# Used for the value axes of the charts built in this notebook; the string is
# lower-cased before being passed to alt.Scale(type=...).
ax_scale = "Linear"  # Or 'Log'

## 2. Build visualisation

In [5]:
# Drop-down menu listing every parameter/unit combination, preceded by a
# "None" entry
par_list = ["None", *sorted(wc_df["parameter_unit"].unique())]
input_dropdown = alt.binding_select(options=par_list)

# Single-value selection on 'parameter_unit', driven by the drop-down
selection = alt.selection_single(
    name="Select",
    fields=["parameter_unit"],
    bind=input_dropdown,
)

In [6]:
# Choose the 'new' lab of interest ('VestfoldLAB' or 'Eurofins')
lab = "Eurofins"

# Display order for the labs: the historic datasets first, then the 'new' lab.
# Shared by the strip plot and the density plot so that the two always agree
# when 'lab' is changed
lab_order = ["NIVA (historic)", "VestfoldLAB (historic)", lab]

# alt.Scale takes lower-case type names ('linear', 'log')
scale_type = ax_scale.lower()

# Ticks: one tick per measurement, one row per lab. The drop-down selection is
# attached here and re-used as a filter by the other charts
ticks = (
    alt.Chart(wc_df, height=150, width=450, title="Strip plot")
    .add_selection(selection)
    .transform_filter(selection)
    .mark_tick(thickness=2, size=30, opacity=0.3)
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("lab:N", title="", sort=lab_order),
        color="lab:N",
        tooltip=[
            "vannmiljo_code:N",
            "sample_date:T",
            "lab:N",
            "parameter:N",
            "unit:N",
            "value:Q",
        ],
    )
    .interactive()
)

# Q-Q plot
base = alt.Chart(wc_df, height=300, width=450, title="Q-Q plot")

# Quantiles of 'value' in 5% steps for each period, pivoted so that every row
# holds one percentile with one column per period (the 'historic' and 'new'
# columns are used below). Shared by the scatter and the 1:1 line so the two
# layers cannot drift apart
qq_base = (
    base.transform_filter(selection)
    .transform_quantile(
        "value",
        step=0.05,
        as_=["percentile", "value"],
        groupby=["period"],
    )
    .transform_pivot("period", groupby=["percentile"], value="value")
)

scatter = (
    qq_base.mark_point()
    .encode(
        x=alt.X("historic:Q", title="Historic data", scale=alt.Scale(type=scale_type)),
        y=alt.Y("new:Q", title="New data", scale=alt.Scale(type=scale_type)),
        color=alt.Color("percentile:Q", scale=alt.Scale(scheme="turbo")),
        tooltip=["percentile:Q", "historic:Q", "new:Q"],
    )
    .interactive()
)

# 1:1 line (historic quantiles plotted against themselves)
line = qq_base.mark_line().encode(
    x=alt.X("historic:Q", title="", scale=alt.Scale(type=scale_type)),
    y=alt.Y("historic:Q", title="", scale=alt.Scale(type=scale_type)),
)

qq_plot = scatter + line

# KDE plot: one density curve per lab, stacked in rows
kde = (
    alt.Chart(wc_df, height=160, width=450, title="Density plot")
    .transform_filter(selection)
    .transform_density(density="value", groupby=["lab"])
    .mark_area(opacity=0.3)
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("density:Q", title=""),
        color="lab:N",
        row=alt.Row("lab:N", title="", sort=lab_order),
    )
    .interactive()
)

# Combine the charts and set the font sizes. configure_*() returns a new chart
# instead of modifying the existing one, and Altair only accepts configuration
# on the top-level chart, so it must be applied once to the final compound
# chart (calling it on the individual sub-charts and discarding the result has
# no effect)
chart = (
    ((ticks & qq_plot) | kde)
    .configure_axis(labelFontSize=16, titleFontSize=20)
    .configure_legend(labelFontSize=16)
)
chart.save("distribution_plots.json")

In [7]:
# Move every JSON file in the working directory to the 'pages' folder.
# Copy-then-delete is used so that a file of the same name already present in
# the destination is overwritten
pages_dir = "../../pages/"
for json_path in glob.glob("*.json"):
    shutil.copy(json_path, pages_dir)
    os.remove(json_path)

## 3. Summary

The final plots can be explored [here](https://nivanorge.github.io/tiltaksovervakingen/pages/distribution_plots.html). Note the following:

 * The distribution for alkalinity is very different between Eurofins and the historic dataset. The Eurofins values are substantially lower than previously
 
 * There are two large ANC outliers in the Eurofins dataset (-170 and -1000 uekv/l). Apart from these, ANC is generally slightly higher in the Eurofins data than previously
 
 * Concentrations for Ca, Cl, Mg and Na are all slightly lower in the Eurofins data than in the reference dataset. The differences are not large and the distribution shapes are broadly similar, but the Eurofins values are consistently slightly lower
 
 * Values for RAl and ILAl from Eurofins are both slightly higher than in the reference dataset. The situation is similar to - but less extreme than - the results from VestfoldLAB in 2019, except in this case RAl and ILAl increase by approximately the same amount, so the distribution for LAl is broadly the same as in the reference data (see also the more detailed exploration in notebook 05)
 
 * Conductivity reported by Eurofins is consistently slightly lower than in the reference data
 
 * Total N reported by Eurofins is consistently slightly lower than in the reference data
 
 * With the exception of one very large outlier (58 mg/l), SO4 reported by Eurofins is consistently slightly lower than in the reference dataset
 
 * TOC reported by Eurofins is consistently slightly higher than in the reference data, although the difference is small
 
The main issue highlighted above is the small number of **obvious outliers**, which should be either reanalysed or removed. The distributions for **alkalinity** are also substantially different and should be investigated further; other distributional differences are probably small enough to be ignored.