In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import nivapy3 as nivapy
import numpy as np
import pandas as pd
import seaborn as sn
import utils

plt.style.use("ggplot")

ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/share/proj failed


# ICPW Thematic report 2023

## Part B: Statistical analysis

## 1. User options

In [2]:
# Parameters of interest for analysis. All are expressed in µeq/l, so only the
# base names are listed here and the unit suffix is appended below.
# (Comment a name in/out to change the set of parameters analysed.)
unit_suffix = "_µeq/l"
par_names = (
    "SO4",
    "NO3-N",
    # "NH4-N",
    "Cl",
    "Ca",
    "Mg",
    "CaMg",
    "H",
    "SAA",
    "ANC",
    "OrgAnions",
    "HCO3",
)
par_list = [f"{par_name}{unit_suffix}" for par_name in par_names]

# Periods to consider, as (start_year, end_year) pairs
periods = (
    (1990, 2020),  # Full period
    (1990, 2004),  # Overlapping 15-year windows
    (1998, 2012),
    (2006, 2020),
)

## 2. Read raw data

A basic dataset was compiled in the previous notebook.

**Note:** We have decided not to allow availability of NH4 data to limit the site selection, as it restricts the dataset unnecessarily. See the comment [here](https://github.com/JamesSample/icpw2/issues/4#issuecomment-1521958990) and reply [here](https://github.com/JamesSample/icpw2/issues/4#issuecomment-1522126141).

In [3]:
# Station properties
xl_path = r"../../../all_icpw_sites_mar_2023.xlsx"
stn_df = pd.read_excel(xl_path, sheet_name="all_icpw_stns")

# Median annual chemistry: one row per (station_id, year).
# Only the parameters of interest are aggregated. Taking the median of every
# column in the file is unnecessary (the other columns are discarded anyway)
# and raises in pandas >= 2.0 if any of them is non-numeric.
csv_path = r"./data/thematic_report_2023_working_data.csv"
wc_df = pd.read_csv(csv_path, encoding="utf-8")
wc_df["sample_date"] = pd.to_datetime(wc_df["sample_date"], format="%Y-%m-%d")
wc_df["year"] = wc_df["sample_date"].dt.year
wc_df = wc_df.groupby(["station_id", "year"])[par_list].median().reset_index()

# Selection criteria: keep only the (station_id, period) combinations where
# none of the parameters of interest is flagged 0 or missing.
# NH4 is not in 'par_list', so it does not limit the site selection - see above
csv_path = r"./data/selection_criteria_by_station-par-period.csv"
inc_df = (
    pd.read_csv(csv_path, encoding="utf-8")[["station_id", "period"] + par_list]
    .set_index(["station_id", "period"])
    .mask(lambda flags: flags == 0)  # 0 => NaN, so 'dropna' removes the row
    .dropna(how="any")
    .reset_index()
)

In [4]:
# Labels used in the 'period' column of inc_df, e.g. "1990-2020"
period_labels = [f"{st_yr}-{end_yr}" for st_yr, end_yr in periods]

# Print number of stations per period with complete data
for period_label in period_labels:
    inc_df_per = inc_df[inc_df["period"] == period_label]
    print(f"{period_label}:  ", len(inc_df_per), "sites.")

# Number of stations that have complete data for ALL periods of interest.
# The required count is derived from 'periods' (rather than hard-coded) and
# only periods of interest are counted, so the result stays correct if the
# user options are changed.
n_complete = (
    inc_df[inc_df["period"].isin(period_labels)]
    .groupby("station_id")["period"]
    .nunique()
)
print(
    "All periods:",
    len(n_complete[n_complete == len(period_labels)]),
    "sites.",
)

1990-2020:   421 sites.
1990-2004:   439 sites.
1998-2012:   457 sites.
2006-2020:   441 sites.
All periods: 403 sites.


After a conversation with Rolf, we have decided to use the same site selection for all periods, i.e. to focus on the 403 sites with complete data in every period, instead of changing the selection for each period.

In [5]:
# Export selected sites as shapefile for mapping

# Stations with complete data in every period of interest. The required number
# of periods is derived from 'periods' instead of being hard-coded.
period_labels = [f"{st_yr}-{end_yr}" for st_yr, end_yr in periods]
n_complete = (
    inc_df[inc_df["period"].isin(period_labels)]
    .groupby("station_id")["period"]
    .nunique()
)
stn_list = n_complete[n_complete == len(period_labels)].index.tolist()

# Build point geometries from the station co-ordinates (WGS84 lon/lat)
sel_stn_df = stn_df.query("station_id in @stn_list")
sel_stn_gdf = gpd.GeoDataFrame(
    sel_stn_df,
    geometry=gpd.points_from_xy(
        sel_stn_df["longitude"], sel_stn_df["latitude"], crs="epsg:4326"
    ),
)

# NOTE(review): the saved output shows a warning raised by 'to_file' (message
# truncated) - possibly column names being shortened by the shapefile format.
# Confirm the exported attribute names are as expected.
stn_shp = r"./results/gis/vector/selected_stations.shp"
sel_stn_gdf.to_file(stn_shp, index=False)

  sel_stn_gdf.to_file(stn_shp, index=False)


## 3. Site-specific trends

In [5]:
# Mann-Kendall test and Sen's slope for each station, parameter and period.
# Results are written to './results/trends_by_site.csv'. For the plotted period
# a PNG per station is also saved, with one panel per parameter.

# Columns of the results table, in output order
res_cols = [
    "period",
    "station_id",
    "variable",
    "n_vals",
    "first",
    "last",
    "mean",
    "median",
    "std_dev",
    "iqr",
    "mk_p_val",
    "mk_trend",
    "sen_slp",
    "sen_incpt",
    "sen_trend",
]

# Only this period is plotted; the others just contribute statistics
plot_period = (1990, 2020)

# Plot grid sized from the number of parameters (4 x 3 for 11 parameters)
n_cols = 3
n_rows = -(-len(par_list) // n_cols)  # Ceiling division

# Stations with complete data in every period of interest. The required number
# of periods is derived from 'periods' instead of being hard-coded.
period_labels = [f"{st_yr}-{end_yr}" for st_yr, end_yr in periods]
n_complete = (
    inc_df[inc_df["period"].isin(period_labels)]
    .groupby("station_id")["period"]
    .nunique()
)
stn_list = n_complete[n_complete == len(period_labels)].index.tolist()

records = []
for st_yr, end_yr in periods:
    make_plots = (st_yr, end_yr) == plot_period

    # Uncomment the lines below to allow station selection to vary by period
    # inc_df_per = inc_df.query(f"period == '{st_yr}-{end_yr}'").copy()
    # stn_list = list(inc_df_per["station_id"].unique())

    # Get data that meet selection criteria for all parameters of interest
    df_per = wc_df.query(
        "(station_id in @stn_list) and (@st_yr <= year <= @end_yr)"
    )

    for stn_id in stn_list:
        # Annual series for this station, indexed by year
        df = (
            df_per.query("station_id == @stn_id")
            .drop(columns="station_id")
            .set_index("year")
            .sort_index()
        )

        # Setup plot. Station code/name are only needed for the plot title
        if make_plots:
            stn_props = stn_df.query("station_id == @stn_id")
            stn_code = stn_props["station_code"].iloc[0]
            stn_name = stn_props["station_name"].iloc[0]
            fig, axes = plt.subplots(
                nrows=n_rows,
                ncols=n_cols,
                figsize=(12, 4 * n_rows),
                squeeze=False,
            )
            axes = axes.flatten()

        for idx, par in enumerate(par_list):
            # Rows with a value for this parameter (computed once and reused)
            par_df = df.dropna(subset=[par])

            # Mann-Kendall and Sen's slope
            mk_df = nivapy.stats.mk_test(par_df, par)
            sen_res_df, sen_df = nivapy.stats.sens_slope(
                par_df,
                value_col=par,
                index_col=par_df.index,
            )
            sslp = sen_res_df.loc["sslp"].value
            sincpt = sen_res_df.loc["icpt"].value
            sen_trend = sen_res_df.loc["trend"].value

            # Add results for this station-parameter-period
            records.append(
                {
                    "period": f"{st_yr}-{end_yr}",
                    "station_id": stn_id,
                    "variable": par,
                    "n_vals": len(par_df),
                    "first": par_df[par].iloc[0],
                    "last": par_df[par].iloc[-1],
                    "mean": df[par].mean(),
                    "median": df[par].median(),
                    "std_dev": df[par].std(),
                    "iqr": df[par].quantile(0.75) - df[par].quantile(0.25),
                    "mk_p_val": mk_df.loc["p"].value,
                    "mk_trend": mk_df.loc["trend"].value,
                    "sen_slp": sslp,
                    "sen_incpt": sincpt,
                    "sen_trend": sen_trend,
                }
            )

            if make_plots:
                # Sen's slope line is black for 'no trend', otherwise red
                line_col = "k" if sen_trend == "no trend" else "r"
                ax = axes[idx]
                ax.plot(sen_df.index, sen_df[par].values, "bo-")
                ax.plot(sen_df.index, sen_df.index * sslp + sincpt, line_col)
                ax.set_title(par)
                ax.set_xlim((st_yr, end_yr))

        if make_plots:
            # Remove any panels not used by a parameter
            for unused_ax in axes[len(par_list) :]:
                fig.delaxes(unused_ax)
            fig.suptitle(f"{stn_code} ({stn_name})\n", fontsize=20)
            fig.tight_layout()
            png_path = f"./results/trends_by_site/stn_{stn_id}_{st_yr}-{end_yr}.png"
            fig.savefig(png_path, dpi=200)
            # Close explicitly: one figure is created per station
            plt.close(fig)

res_df = pd.DataFrame(records, columns=res_cols)
csv_path = "./results/trends_by_site.csv"
res_df.to_csv(csv_path, index=False)

res_df.head()

Unnamed: 0,period,station_id,variable,n_vals,first,last,mean,median,std_dev,iqr,mk_p_val,mk_trend,sen_slp,sen_incpt,sen_trend
0,1990-2020,100,SO4_µeq/l,29,60.377924,15.406781,29.299447,26.024967,16.074471,23.94297,1.758091e-10,decreasing,-1.510192,3055.469467,decreasing
1,1990-2020,100,NO3-N_µeq/l,29,1.857143,1.857143,2.102217,1.928571,1.111074,1.5,0.3875675,no trend,0.027129,-52.492445,no trend
2,1990-2020,100,Cl_µeq/l,29,25.385722,14.667306,17.356636,16.923815,3.635294,5.077144,0.0007539426,decreasing,-0.244703,507.797079,decreasing
3,1990-2020,100,Ca_µeq/l,29,17.46507,16.966068,18.118935,18.213573,2.892347,2.994012,0.08014523,no trend,-0.103959,226.754824,no trend
4,1990-2020,100,Mg_µeq/l,29,13.984863,9.049029,10.209234,9.871668,1.926419,2.467917,0.03383443,decreasing,-0.080305,170.963976,no trend


## 4. Regional trends



In [6]:
# Regional Mann-Kendall test and Sen's slope for each region, parameter and
# period, using stations as blocks. Results are written to
# './results/trends_by_region.csv'.

# Columns of the results table, in output order
res_cols = [
    "period",
    "region",
    "variable",
    "n_vals",
    "mean",
    "median",
    "std_dev",
    "iqr",
    "mk_p_val",
    "mk_trend",
    "sen_slp",
]

# Stations with complete data in every period of interest. The required number
# of periods is derived from 'periods' instead of being hard-coded.
period_labels = [f"{st_yr}-{end_yr}" for st_yr, end_yr in periods]
n_complete = (
    inc_df[inc_df["period"].isin(period_labels)]
    .groupby("station_id")["period"]
    .nunique()
)
stn_list = n_complete[n_complete == len(period_labels)].index.tolist()

records = []
for st_yr, end_yr in periods:
    # Uncomment the lines below to allow station selection to vary by period
    # inc_df_per = inc_df.query(f"period == '{st_yr}-{end_yr}'").copy()
    # stn_list = list(inc_df_per["station_id"].unique())

    # Get data that meet selection criteria for all parameters of interest
    df_per = wc_df.query(
        "(station_id in @stn_list) and (@st_yr <= year <= @end_yr)"
    )

    # Attach the region of each station
    df_per = pd.merge(
        df_per, stn_df[["station_id", "region"]], how="left", on="station_id"
    )

    for reg in df_per["region"].unique():
        df = df_per.query("region == @reg")

        for par in par_list:
            # Rows with a value for this parameter (computed once and reused)
            par_df = df.dropna(subset=[par])

            stat_df = nivapy.stats.seasonal_regional_mk_sen(
                par_df,
                time_col="year",
                value_col=par,
                block_col="station_id",
                alpha=0.05,
            )

            # Add results for this region-parameter-period
            records.append(
                {
                    "period": f"{st_yr}-{end_yr}",
                    "region": reg,
                    "variable": par,
                    "n_vals": len(par_df),
                    "mean": df[par].mean(),
                    "median": df[par].median(),
                    "std_dev": df[par].std(),
                    "iqr": df[par].quantile(0.75) - df[par].quantile(0.25),
                    "mk_p_val": stat_df.loc["p"].value,
                    "mk_trend": stat_df.loc["trend"].value,
                    "sen_slp": stat_df.loc["sslp"].value,
                }
            )

res_df = pd.DataFrame(records, columns=res_cols)
csv_path = "./results/trends_by_region.csv"
res_df.to_csv(csv_path, index=False)

res_df.head()

Unnamed: 0,period,region,variable,n_vals,mean,median,std_dev,iqr,mk_p_val,mk_trend,sen_slp
0,1990-2020,SoNord,SO4_µeq/l,4353,75.350673,58.462487,61.779654,69.247233,0.0,decreasing,-1.769698
1,1990-2020,SoNord,NO3-N_µeq/l,4352,4.38444,2.928571,4.863691,4.25,0.0,decreasing,-0.056122
2,1990-2020,SoNord,Cl_µeq/l,4339,108.611082,78.413674,97.978307,123.242053,0.0,decreasing,-0.214286
3,1990-2020,SoNord,Ca_µeq/l,4349,94.977641,77.345309,72.564776,114.557535,0.0,decreasing,-0.499002
4,1990-2020,SoNord,Mg_µeq/l,4348,53.864769,46.890424,37.207453,53.01888,0.0,decreasing,-0.263038
