In [1]:
import glob
import os
import shutil
import sqlite3

import altair as alt
import pandas as pd

# Alternative (disabled): lift Altair's row limit instead of changing transformer
# alt.data_transformers.disable_max_rows()
# Use Altair's "json" data transformer. Per the Altair docs this stores chart
# data in separate .json files rather than embedding it in the chart spec; the
# final cell of this notebook moves the .json files to the output folder
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

# Tiltaksovervakingen: opsjon for kvalitetskontroll av analysedata
# Eurofins 2024 Q3

## Notebook 2: Visualising overall parameter distributions

In [2]:
# Choose dataset to process
# Together these four values define the name of the output folder that holds
# the tidied database and receives the plots
lab = "Eurofins"  # Lower-cased for the folder name; also the last row in the strip plot's lab order
year = 2024
qtr = 3  # Quarter number; appears as "q<qtr>" in the folder name
version = 1

## 1. Read data

Read the tables from the tidied database.

In [3]:
# Connect to database
# Output folder for this lab/year/quarter/version; also the destination for
# the plot files written at the end of the notebook
fold_path = f"../../output/{lab.lower()}_{year}_q{qtr}_v{version}"
db_path = os.path.join(fold_path, "kalk_data.db")

# sqlite3.connect() silently creates a new, empty database if the file does
# not exist, which would only show up later as "no such table" errors (and
# leave a stray empty file behind). Fail early with a clear message instead
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"Database not found: '{db_path}'. "
        "Check the 'lab', 'year', 'qtr' and 'version' settings."
    )

# PARSE_DECLTYPES converts values based on the declared column types
eng = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)

In [4]:
# Read tables
stn_df = pd.read_sql("SELECT * FROM stations", eng)
par_df = pd.read_sql("SELECT * FROM parameters_units", eng)

# Water chemistry: parse the sample timestamps and add a combined
# 'parameter_unit' key (e.g. "K_mg/l"), which drives the drop-down selector
wc_df = pd.read_sql("SELECT * FROM water_chemistry", eng).assign(
    sample_date=lambda df: pd.to_datetime(
        df["sample_date"], format="%Y-%m-%d %H:%M:%S"
    ),
    parameter_unit=lambda df: df["parameter"] + "_" + df["unit"],
)

In [5]:
# Set axis scale for plots
# The value is lower-cased and passed to alt.Scale(type=...) for every value
# axis in the strip, Q-Q and density plots
ax_scale = "Linear"  # Or 'Log'

In [6]:
# # Subset data to just the quarter of interest
# qtr = "q1"

# months_dict = {
#     "q1": [1, 2, 3],
#     "q2": [4, 5, 6],
#     "q3": [7, 8, 9],
#     "q4": [10, 11, 12],
# }
# months = months_dict[qtr]
# wc_df = wc_df[wc_df["sample_date"].dt.month.isin(months)]

## 2. Build visualisation

In [7]:
# Build drop-down list
# Drop missing keys first: a null 'parameter' or 'unit' gives a NaN
# 'parameter_unit', and sorted() cannot compare NaN (a float) with strings
par_list = ["None"] + sorted(wc_df["parameter_unit"].dropna().unique())

# "None" is listed first; it matches no 'parameter_unit' value in the data,
# so choosing it leaves the plots empty
input_dropdown = alt.binding_select(options=par_list)

# Selection bound to the drop-down; the charts filter on 'parameter_unit'
selection = alt.selection_single(
    fields=["parameter_unit"], bind=input_dropdown, name="Select"
)

In [8]:
# Preview the first rows of the water chemistry table
wc_df.head()

Unnamed: 0,vannmiljo_code,sample_date,lab,period,depth1,depth2,parameter,flag,value,unit,parameter_unit
0,019-44498,2012-01-02,NIVA (historic),historic,0.0,0.0,K,=,0.19,mg/l,K_mg/l
1,019-44498,2012-01-02,NIVA (historic),historic,0.0,0.0,KOND,=,1.71,mS/m,KOND_mS/m
2,019-44498,2012-02-15,NIVA (historic),historic,0.0,0.0,CL,=,1.15,mg/l,CL_mg/l
3,019-44498,2012-02-15,NIVA (historic),historic,0.0,0.0,KOND,=,1.4,mS/m,KOND_mS/m
4,019-44498,2012-03-05,NIVA (historic),historic,0.0,0.0,CL,=,1.83,mg/l,CL_mg/l


In [9]:
# Preview the last rows of the water chemistry table
wc_df.tail()

Unnamed: 0,vannmiljo_code,sample_date,lab,period,depth1,depth2,parameter,flag,value,unit,parameter_unit
197441,067-79150,2024-08-06,Eurofins,new,0.0,0.0,ANC,,50.0,µekv/l,ANC_µekv/l
197442,067-79150,2024-09-02,Eurofins,new,0.0,0.0,ANC,,35.0,µekv/l,ANC_µekv/l
197443,067-40687,2024-07-02,Eurofins,new,0.0,0.0,ANC,,31.0,µekv/l,ANC_µekv/l
197444,067-40687,2024-08-06,Eurofins,new,0.0,0.0,ANC,,34.0,µekv/l,ANC_µekv/l
197445,067-40687,2024-09-02,Eurofins,new,0.0,0.0,ANC,,28.0,µekv/l,ANC_µekv/l


In [10]:
# Settings shared by all three panels
scale_type = ax_scale.lower()  # Scale type is passed to alt.Scale in lower case

# Row order for the strip and density plots: historic labs first, the lab
# being checked last. Using 'lab' in both places (the density plot previously
# hard-coded "Eurofins") keeps the two panels in the same order whichever lab
# is chosen in the configuration cell
lab_order = [
    "NIVA (historic)",
    "VestfoldLAB (historic)",
    "Eurofins (historic)",
    lab,
]

# Ticks
# One tick per measurement, one row per lab. The drop-down selection is
# registered on this chart and reused as a filter by the other panels
ticks = (
    alt.Chart(
        wc_df,
        height=150,
        width=450,
        title="Strip plot",
    )
    .add_selection(selection)
    .transform_filter(selection)
    .mark_tick(
        thickness=2,
        size=30,
        opacity=0.3,
    )
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("lab:N", title="", sort=lab_order),
        color="lab:N",
        tooltip=[
            "vannmiljo_code:N",
            "sample_date:T",
            "lab:N",
            "parameter:N",
            "unit:N",
            "value:Q",
        ],
    )
    .interactive()
)

# Q-Q plot
base = alt.Chart(wc_df, height=300, width=450, title="Q-Q plot")

# Quantiles of 'value' at 5% steps for each period, pivoted so that every
# percentile has one column per period ('historic' and 'new'). Built once and
# shared by the scatter and the 1:1 line so the two layers cannot drift apart
qq_base = (
    base.transform_filter(selection)
    .transform_quantile(
        "value",
        step=0.05,
        as_=["percentile", "value"],
        groupby=["period"],
    )
    .transform_pivot("period", groupby=["percentile"], value="value")
)

scatter = (
    qq_base.mark_point()
    .encode(
        x=alt.X("historic:Q", title="Historic data", scale=alt.Scale(type=scale_type)),
        y=alt.Y("new:Q", title="New data", scale=alt.Scale(type=scale_type)),
        color=alt.Color("percentile:Q", scale=alt.Scale(scheme="turbo")),
        tooltip=["percentile:Q", "historic:Q", "new:Q"],
    )
    .interactive()
)

# 1:1 line (historic quantiles plotted against themselves)
line = qq_base.mark_line().encode(
    x=alt.X("historic:Q", title="", scale=alt.Scale(type=scale_type)),
    y=alt.Y("historic:Q", title="", scale=alt.Scale(type=scale_type)),
)

qq_plot = scatter + line

# KDE plot
# Density estimate of 'value' per lab, one facet row per lab
kde = (
    alt.Chart(
        wc_df,
        height=160,
        width=450,
        title="Density plot",
    )
    .transform_filter(selection)
    .transform_density(
        density="value",
        groupby=["lab"],
    )
    .mark_area(
        opacity=0.3,
    )
    .encode(
        x=alt.X("value:Q", title="Value", scale=alt.Scale(type=scale_type)),
        y=alt.Y("density:Q", title=""),
        color="lab:N",
        row=alt.Row("lab:N", title="", sort=lab_order),
    )
    .interactive()
)

# Combine the panels and apply the font sizes.
# configure_*() returns a new chart instead of modifying the chart in place,
# so the earlier per-panel calls (whose results were discarded) had no effect
# on the saved output. Altair only accepts configuration on the top-level
# chart of a compound layout, so the settings are applied once here.
# NOTE(review): this makes the saved chart use the larger fonts that the
# per-panel calls asked for - confirm the layout still looks right in the
# HTML page
chart = (
    ((ticks & qq_plot) | kde)
    .configure_axis(
        labelFontSize=16,
        titleFontSize=20,
    )
    .configure_legend(labelFontSize=16)
)
chart.save("distribution_plots.json")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [11]:
# chart

In [12]:
# Move json files to 'output' folder
# Copy-then-delete (rather than a rename) so files left in the output folder
# by an earlier run are overwritten. The file list is built in full before
# anything is removed
for json_file in glob.glob("*.json"):
    shutil.copy(json_file, fold_path)
    os.remove(json_file)

# Copy HTML page too
shutil.copy(r"../../pages/distribution_plots_vegalite5.html", fold_path)

'../../output/eurofins_2024_q3_v1/distribution_plots_vegalite5.html'