In [1]:
import glob
import os
import shutil
import sqlite3

import altair as alt
import numpy as np
import pandas as pd

# Use Altair's "json" data transformer instead of embedding the data inline.
# With this transformer the chart data is written to a separate JSON file that
# the chart specification references by URL, which is presumably why the final
# cell moves every '*.json' file (not just the chart spec) to '../pages/'.
# Alternative (kept for reference): lift the inline row limit instead.
#alt.data_transformers.disable_max_rows()
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

# Tiltaksovervakingen: opsjon for kvalitetskontroll av analysedata
## Notebook 2: Visualising overall parameter distributions

This notebook produces the plots [here](https://nivanorge.github.io/tiltaksovervakingen/pages/distribution_plots.html).

## 1. Read data

Read the tables from the tidied database.

In [2]:
# Connect to database
dbname = "kalk_data.db"

# sqlite3.connect() silently creates a new, empty database if the file does not
# exist. That would leave a stray file behind and make the next cell fail with a
# confusing "no such table" error, so check explicitly and fail with a clear message.
if not os.path.isfile(dbname):
    raise FileNotFoundError(
        f"Database '{dbname}' not found in '{os.getcwd()}'. "
        "Check the working directory and that the tidied database has been created."
    )

# PARSE_DECLTYPES makes sqlite3 convert values according to the declared column types
eng = sqlite3.connect(dbname, detect_types=sqlite3.PARSE_DECLTYPES)

In [3]:
# Load the three tables from the database into dataframes
stn_df = pd.read_sql("SELECT * FROM stations", eng)
par_df = pd.read_sql("SELECT * FROM parameters_units", eng)

# For the water chemistry table, parse the sample dates and add a combined
# 'parameter_unit' label (used later to populate the drop-down list)
wc_df = pd.read_sql("SELECT * FROM water_chemistry", eng).assign(
    sample_date=lambda df: pd.to_datetime(
        df["sample_date"], format="%Y-%m-%d %H:%M:%S"
    ),
    parameter_unit=lambda df: df["parameter"] + "_" + df["unit"],
)

In [4]:
# Set axis scale for plots.
# The lower-cased value is passed as the scale `type` for the value axes of the
# strip plot, Q-Q plot and density plot built below.
ax_scale = "Linear" # Or 'Log'

## 2. Build visualisation

In [5]:
# Options for the drop-down: every distinct parameter/unit label in
# alphabetical order, preceded by a literal 'None' entry
par_list = wc_df["parameter_unit"].unique().tolist()
par_list.sort()
par_list.insert(0, "None")

# Bind a single-value selection on 'parameter_unit' to the drop-down widget
input_dropdown = alt.binding_select(options=par_list)
selection = alt.selection_single(
    name="Select",
    fields=["parameter_unit"],
    bind=input_dropdown,
)

In [6]:
# Settings shared by all three plots
scale_type = ax_scale.lower()  # Scale type is passed to Altair in lower case
lab_order = [
    "NIVA (historic)",
    "VestfoldLAB (historic)",
    "VestfoldLAB (2020)",
]

# Ticks (strip plot): one tick per measurement, one row per lab.
# The drop-down selection is added to this chart only; the other charts
# just filter on the same selection.
ticks = (
    alt.Chart(wc_df, height=150, width=450, title="Strip plot")
    .add_selection(selection)
    .transform_filter(selection)
    .mark_tick(thickness=2, size=30, opacity=0.3)
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("lab:N", title="", sort=lab_order),
        color="lab:N",
        tooltip=[
            "vannmiljo_code:N",
            "sample_date:T",
            "lab:N",
            "parameter:N",
            "unit:N",
            "value:Q",
        ],
    )
    .interactive()
)

# Q-Q plot
base = alt.Chart(wc_df, height=300, width=450, title="Q-Q plot")

# Transform chain shared by the scatter and the 1:1 line: compute quantiles of
# 'value' per period (in 5% steps), then pivot so each period becomes a column.
# NOTE: the encodings below reference columns 'historic' and 'new', so this
# assumes those are the values found in 'period' - confirm against the database.
qq_base = (
    base.transform_filter(selection)
    .transform_quantile(
        "value",
        step=0.05,
        as_=["percentile", "value"],
        groupby=["period"],
    )
    .transform_pivot("period", groupby=["percentile"], value="value")
)

scatter = (
    qq_base.mark_point()
    .encode(
        x=alt.X("historic:Q", title="Historic data", scale=alt.Scale(type=scale_type)),
        y=alt.Y("new:Q", title="New data", scale=alt.Scale(type=scale_type)),
        color=alt.Color("percentile:Q", scale=alt.Scale(scheme="turbo")),
        tooltip=["percentile:Q", "historic:Q", "new:Q"],
    )
    .interactive()
)

# 1:1 line: historic quantiles plotted against themselves
line = qq_base.mark_line().encode(
    x=alt.X("historic:Q", title="", scale=alt.Scale(type=scale_type)),
    y=alt.Y("historic:Q", title="", scale=alt.Scale(type=scale_type)),
)

qq_plot = scatter + line

# KDE plot: density estimate of 'value' per lab, one facet row per lab
kde = (
    alt.Chart(wc_df, height=160, width=450, title="Density plot")
    .transform_filter(selection)
    .transform_density(density="value", groupby=["lab"])
    .mark_area(opacity=0.3)
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("density:Q", title=""),
        color="lab:N",
        row=alt.Row("lab:N", title="", sort=lab_order),
    )
    .interactive()
)

# Combine the plots and apply the font configuration ONCE, on the final chart.
# The configure_*() methods return a new chart rather than modifying the chart
# in place, so calling them on the sub-charts and discarding the result (as was
# done previously) had no effect. Altair also only accepts configuration at the
# top level of a compound chart, so this is the correct place to apply it.
chart = (
    ((ticks & qq_plot) | kde)
    .configure_axis(labelFontSize=16, titleFontSize=20)
    .configure_legend(labelFontSize=16)
)
chart.save('distribution_plots.json')

In [7]:
# Relocate every JSON file in the working directory to the 'pages' folder.
# Each file is copied and then deleted (rather than moved), so an existing
# file of the same name in the destination is overwritten.
pages_dir = "../pages/"
flist = glob.glob("*.json")
for fpath in flist:
    shutil.copy(fpath, pages_dir)
    os.remove(fpath)

## 3. Summary

The final plots can be explored [here](https://nivanorge.github.io/tiltaksovervakingen/pages/distribution_plots.html). Note the following:

 * Many of the biggest "outliers" are actually in the historic data from Vannmiljø. Whether these are actually errors or just genuine "extreme" results, I do not know
 
 * The following patterns should be checked more closely:
 
   * Values of `ALK` > 1 mmol/l reported by NIVA (2012-15)
   * Values of `ANC` > 1000 µekv/l reported by NIVA (2012-15)
   * Values of `CA` > 100 mg/l reported by NIVA (2012-15)
   * Values of `CL` > 30 mg/l reported by both NIVA (2012-15) and VestfoldLAB (2016-19)
   * Values of `KOND` > 100 mS/m reported by NIVA (2012-15)
   * Values of `K` > 2 mg/l reported by VestfoldLAB (2016-19)
   * Values of `MG` > 10 mg/l reported by NIVA (2012-15)
   * Values of `N-TOT` > 2000 µg/l N reported by VestfoldLAB in both 2016-19 and 2020
   * Values of `NA` > 10 mg/l reported by both NIVA (2012-15) and VestfoldLAB (2016-19)
   * Values of `P-TOT` > 100 µg/l P reported by both NIVA (2012-15) and VestfoldLAB (2016-19 and 2020). In particular, a value of 1200 µg/l P was reported by VestfoldLAB in 2020
   * Values of `PH` < 4 reported by NIVA (2012-15)
   * Values of `PH` > 9.5 reported by VestfoldLAB (2016-19)
   * Values of `RAL` > 250 µg/l Al reported by both NIVA (2012-15) and VestfoldLAB (2016-19)
   * **All** `SIO2` data reported by VestfoldLAB to Vannmiljø during 2016-19 (see below)
   * Values of `SO4` > 5 mg/l reported by both NIVA (2012-15) and VestfoldLAB (2016-19)
   
 
 * `N-NO3` from VestfoldLAB (2016-19) is missing from the data export. I suspect - but have not had a chance to check - that VestfoldLAB have reported (nitrate + nitrite) rather than just nitrate. If this is correct, we should clarify whether the current values are actually `N-NO3` (as stated in the spreadsheet) or if they include nitrite too
 
 * The `SIO2` data reported by VestfoldLAB to Vannmiljø looks suspicious. According to Vannmiljø, the mean value of `SIO2` reported by VestfoldLAB between 2016 and 2019 was 1.5 ***µg/l Si***, whereas the mean reported by NIVA between 2012 and 2015 was 650 µg/l Si. In the 2020 data from VestfoldLAB, the mean value for `SIO2` is 1.3 ***mg/l SiO2***. I therefore strongly suspect the `SIO2` values in Vannmiljø from VestfoldLAB have been mistakenly reported in mg/l SiO2 rather than µg/l Si. Since 1 mg/l SiO2 corresponds to 1000 × 28.09 / 60.08 ≈ 467.54 µg/l Si (the Si/SiO2 mass ratio), this means that, **in other words, the `SIO2` data from VestfoldLAB in Vannmiljø is a factor of 467.54 too small**
 
 * The zero values reported by VestfoldLAB look strange for some parameters