In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

# Optional: uncomment to suppress warnings in the cell outputs
# warnings.filterwarnings("ignore")
# Apply matplotlib's "ggplot" style sheet to the figures created below
plt.style.use("ggplot")

# ICPW Thematic Report 2020 - Nitrogen (Part 3: Annual trends)

Implementing trend calculations using the site selections and time periods outlined by Kari in an e-mail received 15.04.2020 at 16.35.

## 1. Read raw data

In [2]:
# Station metadata for the ICPW project (one sheet of the Excel workbook)
stn_path = r"../data/all_icpw_sites_may_2019.xlsx"
stn_df = pd.read_excel(stn_path, sheet_name="all_icpw_stns")

# Split out the 'Trends' and 'Core' groups; 'Trends+Core' stations belong to both
trend_groups = ["Trends", "Trends+Core"]
core_groups = ["Core", "Trends+Core"]
trend_df = stn_df[stn_df["group"].isin(trend_groups)]
core_df = stn_df[stn_df["group"].isin(core_groups)]

# Report the overall station count, then preview the table
print(f"There are {len(stn_df)} unique stations within the ICPW project as a whole.")
stn_df.head()

There are 556 unique stations within the ICPW project as a whole.


Unnamed: 0,station_id,station_code,station_name,latitude,longitude,altitude,continent,country,region,group
0,38115,Tr18_CA_DO1,Blue Chalk Lake,45.1999,-78.9432,344.0,North America,Canada,Ont,Trends
1,38116,Tr18_CA_DO2,Chub Lake,45.2138,-78.9836,343.0,North America,Canada,Ont,Trends
2,38117,Tr18_CA_DO3,Crosson Lake,45.084,-79.036,371.0,North America,Canada,Ont,Trends
3,38118,Tr18_CA_DO4,Dickie Lake,45.151,-79.0876,379.0,North America,Canada,Ont,Trends
4,38119,Tr18_CA_DO5,Harp Lake,45.3798,-79.1335,327.0,North America,Canada,Ont,Trends


In [3]:
# Load the previously saved water chemistry and parse the sampling dates
# (ISO "YYYY-MM-DD" strings) into proper datetimes
csv_path = r"./results/thematic_n_report_2020_working_data.csv"
wc_df = pd.read_csv(csv_path, encoding="utf-8").assign(
    sample_date=lambda frame: pd.to_datetime(frame["sample_date"], format="%Y-%m-%d")
)
wc_df.head()

Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,CH03,Lago di Tomè,1990-10-08,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,CH03,Lago di Tomè,1993-08-18,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,CH03,Lago di Tomè,1995-09-05,0.0,0.0,0.0,510.0,,,,,,,,
3,23472,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,450.0,,,,,,,,
4,23472,CH03,Lago di Tomè,1997-08-05,0.0,0.0,10.0,330.0,,,,,,,,


## 2. Re-calculate TON and parameter ratios

TON is estimated as TOTN minus NO3-N (i.e. ignoring NH4), and the parameter ratios are then re-calculated.

In [4]:
# Shorthand for the measured concentrations used below
totn = wc_df["TOTN_µg/l N"]
no3 = wc_df["NO3-N_µg/l N"]
totp = wc_df["TOTP_µg/l P"]
# The TOC column is in mg/l (per its name); scale by 1000 to match the µg/l denominators
toc_ug = 1000 * wc_df["TOC_mg C/l"]

# Organic N approximated as total N minus nitrate
wc_df["TON_µg/l N"] = totn - no3

# Ratios (overwrite the values read from the working file)
# NOTE(review): a zero TOTP or TON gives ±inf here - confirm whether such samples should be masked
wc_df["TOTN/TOTP"] = totn / totp
wc_df["NO3/TOTP"] = no3 / totp
wc_df["TOC/TON"] = toc_ug / wc_df["TON_µg/l N"]
wc_df["TOC/TOTP"] = toc_ug / totp

wc_df.head()

Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,CH03,Lago di Tomè,1990-10-08,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,CH03,Lago di Tomè,1993-08-18,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,CH03,Lago di Tomè,1995-09-05,0.0,0.0,0.0,510.0,,,,,,,,
3,23472,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,450.0,,,,,,,,
4,23472,CH03,Lago di Tomè,1997-08-05,0.0,0.0,10.0,330.0,,,,,,,,


## 3. Aggregate to annual medians

In [5]:
# Annual medians by station.
# numeric_only=True restricts the aggregation to the numeric columns. Without it,
# pandas >= 2.0 raises a TypeError on the text columns (station_code, station_name),
# whereas older versions silently dropped them together with sample_date.
wc_df["year"] = wc_df["sample_date"].dt.year
ann_df = (
    wc_df.groupby(["station_id", "year"])
    .median(numeric_only=True)
    .reset_index()
)

ann_df.head()

Unnamed: 0,station_id,year,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,1990,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,1993,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,1995,0.0,0.0,0.0,480.0,,,,,,,,
3,23472,1997,0.0,0.0,10.0,350.0,,,,,,,,
4,23472,2000,0.0,0.0,23.0,506.262169,,764.731,,258.468831,,,,


## 4. Subset to stations and time periods

Note that the criterion for "Proportion of total years with data" has been relaxed to 0.5, and we're only interested in start years 1992 and 2000.

In [6]:
# Reshape the annual medians to long format: one row per station, year and
# variable, keeping only rows without missing entries (depths are not needed)
df = (
    ann_df.drop(columns=["depth1", "depth2"])
    .melt(id_vars=["station_id", "year"])
    .dropna(how="any")
)
df.head()

Unnamed: 0,station_id,year,variable,value
0,23472,1990,NH4-N_µg/l N,20.0
1,23472,1993,NH4-N_µg/l N,30.0
2,23472,1995,NH4-N_µg/l N,0.0
3,23472,1997,NH4-N_µg/l N,10.0
4,23472,2000,NH4-N_µg/l N,23.0


In [7]:
# Define selection criteria
n_start_thresh = 1  # Number of annual values in first 5 years
n_end_thresh = 1  # Number of annual values in last 5 years
prop_thresh = 0.5  # Proportion of total years with data
st_yrs = [1992, 2000]  # Period of interest
end_yr = 2016  # Final year of every period (inclusive)
win_len = 5  # Length (years) of the start and end windows

# Years with a non-missing annual value for each (station_id, variable) series.
# Built once up front, so the long frame is neither re-queried nor mutated
# inside the loops below.
yrs_with_data = {
    key: set(grp["year"])
    for key, grp in df.dropna(subset=["value"]).groupby(["station_id", "variable"])
}

# Evaluate every station x variable x start-year combination so the table
# is complete (required by the unstack to wide format further down)
stn_ids = df["station_id"].unique()
pars = df["variable"].unique()

records = []
for stn_id in stn_ids:
    for par in pars:
        data_yrs = yrs_with_data.get((stn_id, par), set())

        for st_yr in st_yrs:
            # Minimum number of years with data within st_yr..end_yr (rounded half up)
            n_yrs = end_yr - st_yr + 1
            yrs_thresh = int((n_yrs * prop_thresh) + 0.5)

            # Years with data that fall inside the period of interest
            period_yrs = [yr for yr in data_yrs if st_yr <= yr <= end_yr]
            n_start = sum(yr < (st_yr + win_len) for yr in period_yrs)
            n_end = sum(yr > (end_yr - win_len) for yr in period_yrs)
            non_missing = len(period_yrs)

            # A series with no data at all in the period is never included,
            # regardless of how the thresholds are set
            include = (
                (non_missing > 0)
                and (n_start >= n_start_thresh)
                and (n_end >= n_end_thresh)
                and (non_missing >= yrs_thresh)
            )
            records.append((stn_id, par, st_yr, int(include)))

# Build df
inc_df = pd.DataFrame(records, columns=["station_id", "variable", "st_yr", "include"])

# Calculate bool cols describing presence of *combinations* of params
# Unstack to 'wide' (one 0/1 column per variable)
inc_df = inc_df.set_index(["station_id", "variable", "st_yr"]).unstack("variable")
inc_df.columns = inc_df.columns.get_level_values(1)
inc_df = inc_df.reset_index()

# Are all params present? A combination is flagged 1 only when every one of
# its member variables is flagged 1
combos = {
    "TOTN_NO3": ["TOTN_µg/l N", "NO3-N_µg/l N"],
    "TOC_TOTN_NO3": ["TOC_mg C/l", "TOTN_µg/l N", "NO3-N_µg/l N"],
    "TOC_TOTN_NO3_NH4": [
        "TOC_mg C/l",
        "TOTN_µg/l N",
        "NO3-N_µg/l N",
        "NH4-N_µg/l N",
    ],
}
for combo_name, combo_pars in combos.items():
    inc_df[combo_name] = (
        inc_df[combo_pars].sum(axis="columns") == len(combo_pars)
    ).astype(int)

# Melt back to 'long'
inc_df = inc_df.melt(id_vars=["station_id", "st_yr"], value_name="include",)

# Sum number of stations per variable (or combination) and start year
nstns_df = (
    inc_df.groupby(["variable", "st_yr"])["include"]
    .sum()
    .reset_index()
    .rename(columns={"include": "n_stns"})
)

## 5. Trend analyses

**Note:** At present, there is a lot of repeated copy-paste code in the sub-sections below. If the trend analysis remains the same for each of the three time-period-site-selection combinations, this should be re-written into functions to simplify the notebook. For now, I'm keeping these code blocks separate, as I assume each analysis may develop in a slightly different direction.

### 5.1. Stations with TOC, TOTN and NO3 from 1992 (with relaxed criteria for Italy)

From Kari's e-mail:

> Keep the start year 1992 and TOC as compulsory, but make an exception from the TOC criterion for Italy to have at least N trends from there. For the other countries I prefer to keep the TOC criterion, even if we lose a few sites, as it is so much easier to interpret the C and N trends when using the same set of sites

In [8]:
# Stations meeting the criteria for TOC, TOTN and NO3 from 1992
sel_stns = set(
    inc_df.query("(st_yr == 1992) and (variable == 'TOC_TOTN_NO3') and (include == 1)")[
        "station_id"
    ].values
)

# Italian stations are exempt from the TOC criterion: they only need TOTN and NO3 from 1992
it_stns = stn_df.query("country == 'Italy'")["station_id"].values
totn_no3_stns = inc_df.query(
    "(st_yr == 1992) and (variable == 'TOTN_NO3') and (include == 1)"
)["station_id"].values

# Keep only Italian stations not already selected above, so the "In addition"
# count below can never double-count a station that also satisfies the TOC criterion
extra_it_stns = set(it_stns).intersection(set(totn_no3_stns)) - sel_stns

print(f'There are {len(sel_stns)} stations with TOC, TOTN and NO3 meeting the specified criteria.\n'
      f'In addition, there are {len(extra_it_stns)} Italian stations with adequate TOTN and NO3 datasets.')

# Final selection: both groups combined
sel_stns = list(extra_it_stns.union(sel_stns))
print(f'{len(sel_stns)} stations in total.')

There are 287 stations with TOC, TOTN and NO3 meeting the specified criteria.
In addition, there are 6 Italian stations with adequate TOTN and NO3 datasets.
293 stations in total.


In [9]:
# Write the metadata of the selected stations next to the trend results
is_selected = stn_df["station_id"].isin(sel_stns)
sel_stn_df = stn_df[is_selected]
csv_path = "./results/trends_1992-2016_toc_totn_no3_relax_italy/trends_1992-2016_toc_totn_no3_relax_italy_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Map the selection (popup set to the station code)
map_opts = dict(
    popup="station_code",
    cluster=True,
    kartverket=True,
    aerial_imagery=True,
)
nivapy.spatial.quickmap(sel_stn_df, **map_opts)

In [10]:
%%capture

st_yr = 1992

# Variables of interest
var_list = [
    "TOTN_µg/l N",
    "NO3-N_µg/l N",
    "NH4-N_µg/l N",
    "TON_µg/l N",
    "TOC_mg C/l",
    "TOTP_µg/l P",
    "TOC/TON",
    "TOTN/TOTP",
    "NO3/TOTP",
    "TOC/TOTP",
]

# Dicts for results
res_dict = {
    "station_id": [],
    "variable": [],
    "median": [],
    "mk_p_val": [],
    "mk_trend": [],
    "sen_slp": [],
    "sen_incpt": [],
    "sen_trend": [],
}

# Loop over stations
for stn_id in sel_stns:
    df = ann_df.query("station_id == @stn_id").copy()
    df = df.query("year >= @st_yr")
    del df["station_id"]
    df.set_index("year", inplace=True)
    df.sort_index(inplace=True)

    # Setup plot
    fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(12, 16))
    axes = axes.flatten()

    # Loop over pars
    for idx, par in enumerate(var_list):
        # Determine whether to plot series
        inc = inc_df.query("(station_id == @stn_id) and (variable == @par) and (st_yr == @st_yr)")[
            "include"
        ].values[0]

        if inc == 0:
            # Plot "omitted" text
            axes[idx].text(
                0.5,
                0.5,
                "Omitted due to lack of data",
                verticalalignment="center",
                horizontalalignment="center",
                transform=axes[idx].transAxes,
                fontsize=18,
            )
            axes[idx].set_title(par)

        else:
            # MK test
            mk_df = nivapy.stats.mk_test(df, par)

            # Sen's slope
            res_df, sen_df = nivapy.stats.sens_slope(
                df, value_col=par, index_col=df.index
            )

            # Add results to dict
            res_dict["station_id"].append(stn_id)
            res_dict["variable"].append(par)
            res_dict["median"].append(df[par].median())
            res_dict["mk_p_val"].append(mk_df.loc["p"].value)
            res_dict["mk_trend"].append(mk_df.loc["trend"].value)

            sslp = res_df.loc["sslp"].value
            sincpt = res_df.loc["icpt"].value
            res_dict["sen_slp"].append(sslp)
            res_dict["sen_incpt"].append(sincpt)
            res_dict["sen_trend"].append(res_df.loc["trend"].value)

            # Plot
            axes[idx].plot(sen_df.index, sen_df[par].values, "bo-")
            axes[idx].plot(sen_df.index, sen_df.index * sslp + sincpt, "k-")

            axes[idx].set_title(par)
            axes[idx].set_xlim((st_yr, 2016))

    # Save plot
    plt.tight_layout()
    png_path = f"./results/trends_1992-2016_toc_totn_no3_relax_italy/trends_1992-2016_toc_totn_no3_relax_italy_stn_{stn_id}.png"
    plt.savefig(png_path, dpi=200)
    plt.close()

# Combine results
res_df = pd.DataFrame(res_dict)

# Save
csv_path = f"./results/trends_1992-2016_toc_totn_no3_relax_italy/trends_1992-2016_toc_totn_no3_relax_italy_results.csv"
res_df.to_csv(csv_path, index=False)

In [11]:
# Preview the first rows of the combined trend results
res_df.head()

Unnamed: 0,station_id,variable,median,mk_p_val,mk_trend,sen_slp,sen_incpt,sen_trend
0,23562,TOTN_µg/l N,403.75,0.004562,decreasing,-8.575,17609.4875,decreasing
1,23562,NO3-N_µg/l N,337.0,0.026522,decreasing,-4.571809,9494.334245,decreasing
2,23562,NH4-N_µg/l N,6.0,0.614862,no trend,0.038462,-71.038462,no trend
3,23562,TON_µg/l N,70.780759,0.300427,no trend,0.829244,-1593.097205,no trend
4,23562,TOTP_µg/l P,2.0,0.245342,no trend,0.033421,-64.942891,no trend


### 5.2. Stations with TOTN and NO3 from 2000

From Kari's e-mail:

> Make an additional analysis from 2000, ignoring TOC in the site selection. Then we can say something about trends in Central Europe as well. But we will probably not do an in depth analysis of these data, with explanatory factors etc. But it gives some credit

In [12]:
# Stations passing the combined TOTN + NO3 criteria for the period starting in 2000
is_selected = (
    (inc_df["st_yr"] == 2000)
    & (inc_df["variable"] == "TOTN_NO3")
    & (inc_df["include"] == 1)
)
sel_stns = set(inc_df.loc[is_selected, "station_id"].values)

print(f'There are {len(sel_stns)} stations with TOTN and NO3 meeting the specified criteria.')

There are 310 stations with TOTN and NO3 meeting the specified criteria.


In [13]:
# Write the metadata of the selected stations next to the trend results
is_selected = stn_df["station_id"].isin(sel_stns)
sel_stn_df = stn_df[is_selected]
csv_path = "./results/trends_2000-2016_totn_no3/trends_2000-2016_totn_no3_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Map the selection (popup set to the station code)
map_opts = dict(
    popup="station_code",
    cluster=True,
    kartverket=True,
    aerial_imagery=True,
)
nivapy.spatial.quickmap(sel_stn_df, **map_opts)

In [14]:
%%capture

st_yr = 2000

# Variables of interest
var_list = [
    "TOTN_µg/l N",
    "NO3-N_µg/l N",
    "NH4-N_µg/l N",
    "TON_µg/l N",
    "TOC_mg C/l",
    "TOTP_µg/l P",
    "TOC/TON",
    "TOTN/TOTP",
    "NO3/TOTP",
    "TOC/TOTP",
]

# Dicts for results
res_dict = {
    "station_id": [],
    "variable": [],
    "median": [],
    "mk_p_val": [],
    "mk_trend": [],
    "sen_slp": [],
    "sen_incpt": [],
    "sen_trend": [],
}

# Loop over stations
for stn_id in sel_stns:
    df = ann_df.query("station_id == @stn_id").copy()
    df = df.query("year >= @st_yr")
    del df["station_id"]
    df.set_index("year", inplace=True)
    df.sort_index(inplace=True)

    # Setup plot
    fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(12, 16))
    axes = axes.flatten()

    # Loop over pars
    for idx, par in enumerate(var_list):
        # Determine whether to plot series
        inc = inc_df.query("(station_id == @stn_id) and (variable == @par) and (st_yr == @st_yr)")[
            "include"
        ].values[0]

        if inc == 0:
            # Plot "omitted" text
            axes[idx].text(
                0.5,
                0.5,
                "Omitted due to lack of data",
                verticalalignment="center",
                horizontalalignment="center",
                transform=axes[idx].transAxes,
                fontsize=18,
            )
            axes[idx].set_title(par)

        else:
            # MK test
            mk_df = nivapy.stats.mk_test(df, par)

            # Sen's slope
            res_df, sen_df = nivapy.stats.sens_slope(
                df, value_col=par, index_col=df.index
            )

            # Add results to dict
            res_dict["station_id"].append(stn_id)
            res_dict["variable"].append(par)
            res_dict["median"].append(df[par].median())
            res_dict["mk_p_val"].append(mk_df.loc["p"].value)
            res_dict["mk_trend"].append(mk_df.loc["trend"].value)

            sslp = res_df.loc["sslp"].value
            sincpt = res_df.loc["icpt"].value
            res_dict["sen_slp"].append(sslp)
            res_dict["sen_incpt"].append(sincpt)
            res_dict["sen_trend"].append(res_df.loc["trend"].value)

            # Plot
            axes[idx].plot(sen_df.index, sen_df[par].values, "bo-")
            axes[idx].plot(sen_df.index, sen_df.index * sslp + sincpt, "k-")

            axes[idx].set_title(par)
            axes[idx].set_xlim((st_yr, 2016))

    # Save plot
    plt.tight_layout()
    png_path = f"./results/trends_2000-2016_totn_no3/trends_2000-2016_totn_no3_stn_{stn_id}.png"
    plt.savefig(png_path, dpi=200)
    plt.close()

# Combine results
res_df = pd.DataFrame(res_dict)

# Save
csv_path = f"./results/trends_2000-2016_totn_no3/trends_2000-2016_totn_no3_results.csv"
res_df.to_csv(csv_path, index=False)

### 5.3. Stations with NO3 from 1992

From Kari's e-mail:

> In addition to the above we will still do the NO3 trends, selecting only for NO3. But maybe we should do that as well from 1992, and with 50%, for consistency

In [15]:
# Stations passing the NO3 criteria for the period starting in 1992
is_selected = (
    (inc_df["st_yr"] == 1992)
    & (inc_df["variable"] == "NO3-N_µg/l N")
    & (inc_df["include"] == 1)
)
sel_stns = set(inc_df.loc[is_selected, "station_id"].values)

print(f'There are {len(sel_stns)} stations with NO3 meeting the specified criteria.')

There are 498 stations with NO3 meeting the specified criteria.


In [16]:
# Write the metadata of the selected stations next to the trend results
is_selected = stn_df["station_id"].isin(sel_stns)
sel_stn_df = stn_df[is_selected]
csv_path = "./results/trends_1992-2016_no3/trends_1992-2016_no3_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Map the selection (popup set to the station code)
map_opts = dict(
    popup="station_code",
    cluster=True,
    kartverket=True,
    aerial_imagery=True,
)
nivapy.spatial.quickmap(sel_stn_df, **map_opts)

In [17]:
%%capture

st_yr = 1992

# Variables of interest
var_list = [
    "TOTN_µg/l N",
    "NO3-N_µg/l N",
    "NH4-N_µg/l N",
    "TON_µg/l N",
    "TOC_mg C/l",
    "TOTP_µg/l P",
    "TOC/TON",
    "TOTN/TOTP",
    "NO3/TOTP",
    "TOC/TOTP",
]

# Dicts for results
res_dict = {
    "station_id": [],
    "variable": [],
    "median": [],
    "mk_p_val": [],
    "mk_trend": [],
    "sen_slp": [],
    "sen_incpt": [],
    "sen_trend": [],
}

# Loop over stations
for stn_id in sel_stns:
    df = ann_df.query("station_id == @stn_id").copy()
    df = df.query("year >= @st_yr")
    del df["station_id"]
    df.set_index("year", inplace=True)
    df.sort_index(inplace=True)

    # Setup plot
    fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(12, 16))
    axes = axes.flatten()

    # Loop over pars
    for idx, par in enumerate(var_list):
        # Determine whether to plot series
        inc = inc_df.query("(station_id == @stn_id) and (variable == @par) and (st_yr == @st_yr)")[
            "include"
        ].values[0]

        if inc == 0:
            # Plot "omitted" text
            axes[idx].text(
                0.5,
                0.5,
                "Omitted due to lack of data",
                verticalalignment="center",
                horizontalalignment="center",
                transform=axes[idx].transAxes,
                fontsize=18,
            )
            axes[idx].set_title(par)

        else:
            # MK test
            mk_df = nivapy.stats.mk_test(df, par)

            # Sen's slope
            res_df, sen_df = nivapy.stats.sens_slope(
                df, value_col=par, index_col=df.index
            )

            # Add results to dict
            res_dict["station_id"].append(stn_id)
            res_dict["variable"].append(par)
            res_dict["median"].append(df[par].median())
            res_dict["mk_p_val"].append(mk_df.loc["p"].value)
            res_dict["mk_trend"].append(mk_df.loc["trend"].value)

            sslp = res_df.loc["sslp"].value
            sincpt = res_df.loc["icpt"].value
            res_dict["sen_slp"].append(sslp)
            res_dict["sen_incpt"].append(sincpt)
            res_dict["sen_trend"].append(res_df.loc["trend"].value)

            # Plot
            axes[idx].plot(sen_df.index, sen_df[par].values, "bo-")
            axes[idx].plot(sen_df.index, sen_df.index * sslp + sincpt, "k-")

            axes[idx].set_title(par)
            axes[idx].set_xlim((st_yr, 2016))

    # Save plot
    plt.tight_layout()
    png_path = f"./results/trends_1992-2016_no3/trends_1992-2016_no3_stn_{stn_id}.png"
    plt.savefig(png_path, dpi=200)
    plt.close()

# Combine results
res_df = pd.DataFrame(res_dict)

# Save
csv_path = f"./results/trends_1992-2016_no3/trends_1992-2016_no3_results.csv"
res_df.to_csv(csv_path, index=False)