In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
from datetime import datetime

# warnings.filterwarnings("ignore")
# Apply matplotlib's "ggplot" style sheet to figures created in this notebook
plt.style.use("ggplot")

# ICPW Thematic Report 2020 - Nitrogen (Part 4: Median water chemistry 2012 to 2016)

See e-mail from Kari received 30.04.2020 at 16.36 (and the associated e-mail chain).

## 1. Read raw data

In [2]:
# Load station metadata from the project spreadsheet
stn_path = r"../data/all_icpw_sites_may_2019.xlsx"
stn_df = pd.read_excel(stn_path, sheet_name="all_icpw_stns")

# Split out the 'trends' and 'core' subsets. Stations in the 'Trends+Core'
# group belong to both
trend_df = stn_df[stn_df["group"].isin(["Trends", "Trends+Core"])]
core_df = stn_df[stn_df["group"].isin(["Core", "Trends+Core"])]

# Report the overall number of stations and preview the table
print(f"There are {len(stn_df)} unique stations within the ICPW project as a whole.")
stn_df.head()

There are 556 unique stations within the ICPW project as a whole.


Unnamed: 0,station_id,station_code,station_name,latitude,longitude,altitude,continent,country,region,group
0,38115,Tr18_CA_DO1,Blue Chalk Lake,45.1999,-78.9432,344.0,North America,Canada,Ont,Trends
1,38116,Tr18_CA_DO2,Chub Lake,45.2138,-78.9836,343.0,North America,Canada,Ont,Trends
2,38117,Tr18_CA_DO3,Crosson Lake,45.084,-79.036,371.0,North America,Canada,Ont,Trends
3,38118,Tr18_CA_DO4,Dickie Lake,45.151,-79.0876,379.0,North America,Canada,Ont,Trends
4,38119,Tr18_CA_DO5,Harp Lake,45.3798,-79.1335,327.0,North America,Canada,Ont,Trends


In [3]:
# Load the saved working chemistry dataset, converting 'sample_date' from
# ISO-formatted text (YYYY-MM-DD) to datetimes
csv_path = r"./results/thematic_n_report_2020_working_data.csv"
wc_df = pd.read_csv(csv_path, encoding="utf-8").assign(
    sample_date=lambda chem_df: pd.to_datetime(
        chem_df["sample_date"], format="%Y-%m-%d"
    )
)

wc_df.head()

Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,CH03,Lago di Tomè,1990-10-08,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,CH03,Lago di Tomè,1993-08-18,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,CH03,Lago di Tomè,1995-09-05,0.0,0.0,0.0,510.0,,,,,,,,
3,23472,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,450.0,,,,,,,,
4,23472,CH03,Lago di Tomè,1997-08-05,0.0,0.0,10.0,330.0,,,,,,,,


## 2. Aggregate to annual medians

In [4]:
# Annual medians by station.
# 'numeric_only=True' restricts the aggregation to numeric columns. The text
# columns (station_code, station_name) have no median and, from pandas 2.0,
# are no longer dropped silently - without this argument the call raises a
# TypeError
wc_df["year"] = wc_df["sample_date"].dt.year
ann_df = (
    wc_df.groupby(["station_id", "year"]).median(numeric_only=True).reset_index()
)

ann_df.head()

Unnamed: 0,station_id,year,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,1990,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,1993,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,1995,0.0,0.0,0.0,480.0,,,,,,,,
3,23472,1997,0.0,0.0,10.0,350.0,,,,,,,,
4,23472,2000,0.0,0.0,23.0,506.262169,,764.731,,235.468831,,,,


## 3. Subset to stations and time periods

In [5]:
# Reshape the annual medians to long format (one row per station, year and
# parameter). The depth columns are not needed and rows without a value are
# discarded
df = (
    ann_df.drop(columns=["depth1", "depth2"])
    .melt(id_vars=["station_id", "year"])
    .dropna(how="any")
)
df.head()

Unnamed: 0,station_id,year,variable,value
0,23472,1990,NH4-N_µg/l N,20.0
1,23472,1993,NH4-N_µg/l N,30.0
2,23472,1995,NH4-N_µg/l N,0.0
3,23472,1997,NH4-N_µg/l N,10.0
4,23472,2000,NH4-N_µg/l N,23.0


In [6]:
# Define selection criteria
yrs_thresh = 3  # Min number of annual medians required between st_yr and end_yr
st_yr = 2012  # First year of the period (inclusive)
end_yr = 2016  # Last year of the period (inclusive)

# Count the annual medians available for each station-parameter pair within
# the period. This replaces a nested loop that re-queried the whole dataframe
# once per station and parameter. 'count' only considers non-null values
period_df = df[df["year"].between(st_yr, end_yr)]
n_yrs = period_df.groupby(["station_id", "variable"])["value"].count()

# Make sure every station-parameter combination is represented, including
# those with no data at all within the period (count = 0)
all_pairs = pd.MultiIndex.from_product(
    [sorted(df["station_id"].unique()), sorted(df["variable"].unique())],
    names=["station_id", "variable"],
)
n_yrs = n_yrs.reindex(all_pairs, fill_value=0)

# Flag series meeting the threshold (1 = include; 0 = not suitable) and
# reshape to 'wide' format, with one column per parameter
inc_df = (n_yrs >= yrs_thresh).astype(int).unstack("variable").reset_index()
inc_df.insert(1, "st_yr", st_yr)

# Calculate bool cols describing presence of *combinations* of params.
# A combination is only flagged if every one of its parameters is flagged
par_combos = {
    "TOTN_NO3": ["TOTN_µg/l N", "NO3-N_µg/l N"],
    "TOC_TOTN_NO3": ["TOC_mg C/l", "TOTN_µg/l N", "NO3-N_µg/l N"],
    "TOC_TOTN_NO3_NH4": [
        "TOC_mg C/l",
        "TOTN_µg/l N",
        "NO3-N_µg/l N",
        "NH4-N_µg/l N",
    ],
}
for combo_name, combo_pars in par_combos.items():
    inc_df[combo_name] = inc_df[combo_pars].eq(1).all(axis="columns").astype(int)

# Melt back to 'long'
inc_df = inc_df.melt(
    id_vars=["station_id", "st_yr"], var_name="variable", value_name="include"
)

# Sum number of stations. Only the 'include' flags are summed (adding up
# station IDs is meaningless)
nstns_df = (
    inc_df.groupby(["variable", "st_yr"])["include"]
    .sum()
    .reset_index()
    .rename(columns={"include": "n_stns"})
)

nstns_df

Unnamed: 0,variable,st_yr,n_stns
0,NH4-N_µg/l N,2012,382
1,NO3-N_µg/l N,2012,494
2,NO3/TOTP,2012,361
3,TOC/TON,2012,225
4,TOC/TOTP,2012,355
5,TOC_TOTN_NO3,2012,310
6,TOC_TOTN_NO3_NH4,2012,225
7,TOC_mg C/l,2012,493
8,TON_µg/l N,2012,230
9,TOTN/TOTP,2012,234


## 4. Medians 2012 to 2016

**Note:** In all the output files created in the section below, I have calculated medians for ***all*** parameters. However, **only the specified parameters are guaranteed to meet the selection criteria** - other columns may be based on less data. 

### 4.1. Stations with TOC, TOTN, NO3 and NH4

And TON calculated as TOTN - NO3 - NH4.

In [7]:
# Identify stations where TOC, TOTN, NO3 and NH4 all meet the selection criteria
is_sel = (
    (inc_df["st_yr"] == 2012)
    & (inc_df["variable"] == "TOC_TOTN_NO3_NH4")
    & (inc_df["include"] == 1)
)
sel_stns = set(inc_df.loc[is_sel, "station_id"].values)

print(
    f"There are {len(sel_stns)} stations with TOC, TOTN, NO3 and NH4 meeting the specified criteria."
)

There are 225 stations with TOC, TOTN, NO3 and NH4 meeting the specified criteria.


In [8]:
# Write the metadata for the selected stations to CSV
sel_stn_df = stn_df[stn_df["station_id"].isin(sel_stns)]
csv_path = "./results/medians_2012-2016/medians_2012-2016_toc_totn_no3_nh4_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Show the selected stations on a map, with station codes as popups
map_kwargs = {
    "popup": "station_code",
    "cluster": True,
    "kartverket": True,
    "aerial_imagery": True,
}
nivapy.spatial.quickmap(sel_stn_df, **map_kwargs)

In [9]:
# Calculate medians of the annual medians for 2012 to 2016 (inclusive).
# The upper bound keeps any years after 2016 out of the result, so the period
# matches both the station selection criteria and the output file name
med_df = ann_df.query(
    "(station_id in @sel_stns) and (year >= 2012) and (year <= 2016)"
)
med_df = (
    med_df.groupby("station_id")
    .median()
    .reset_index()
    .drop(columns=["depth1", "depth2", "year"])
)

csv_path = "./results/medians_2012-2016/medians_2012-2016_toc_totn_no3_nh4.csv"
med_df.to_csv(csv_path, index=False)

med_df.head()

Unnamed: 0,station_id,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23516,51.0,753.0,0.935,840.0,8.0,76.0,91.875,71.8125,8.148144,72.604167
1,23562,7.624085,309.896016,0.427076,400.0,2.0,83.103984,161.5,126.981494,4.907754,213.538022
2,23563,7.180843,624.35781,1.060873,774.852659,4.0,119.637278,196.149772,163.756698,5.713171,258.22749
3,23564,19.453592,240.989552,0.744111,400.0,4.971572,122.225338,86.85,51.4,6.076205,155.514181
4,23565,5.0,607.5,0.87,675.0,6.0,118.5,138.214286,112.982143,7.48494,146.785714


### 4.2. Stations with NO3

And all other parameters calculated as above.

In [10]:
# Identify stations where NO3 meets the selection criteria
no3_inc_df = inc_df.query(
    "(st_yr == 2012) and (variable == 'NO3-N_µg/l N') and (include == 1)"
)
sel_stns = set(no3_inc_df["station_id"].values)

print(f"There are {len(sel_stns)} stations where NO3 meets the specified criteria.")

There are 494 stations where NO3 meets the specified criteria.


In [11]:
# Write the metadata for the selected stations to CSV
stn_mask = stn_df["station_id"].isin(sel_stns)
sel_stn_df = stn_df[stn_mask]
csv_path = "./results/medians_2012-2016/medians_2012-2016_no3_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Show the selected stations on a map, with station codes as popups
nivapy.spatial.quickmap(
    sel_stn_df, popup="station_code", cluster=True, kartverket=True, aerial_imagery=True
)

In [12]:
# Calculate medians of the annual medians for 2012 to 2016 (inclusive).
# The upper bound keeps any years after 2016 out of the result, so the period
# matches both the station selection criteria and the output file name
med_df = ann_df.query(
    "(station_id in @sel_stns) and (year >= 2012) and (year <= 2016)"
)
med_df = (
    med_df.groupby("station_id")
    .median()
    .reset_index()
    .drop(columns=["depth1", "depth2", "year"])
)

csv_path = "./results/medians_2012-2016/medians_2012-2016_no3.csv"
med_df.to_csv(csv_path, index=False)

med_df.head()

Unnamed: 0,station_id,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,15.011,371.0,0.58,,3.0,,,106.955128,,310.160428
1,23474,12.0,180.0,0.65,,2.366541,,,35.640394,,125.333333
2,23475,12.0,132.0,0.7,,3.12,,,32.421105,,142.666667
3,23478,14.523,128.0,0.43,,2.748,,,50.801282,,60.52526
4,23488,28.16,240.0,0.71,,3.0,,,82.934118,,200.133519


### 4.3. Stations with TOC, TOTN and NO3

And TON calculated as TOTN - NO3.

#### 4.3.1. Re-calculate TON and parameter ratios

As above, but this time ignoring NH4.

In [13]:
# Source columns used in the re-calculation
totn = wc_df["TOTN_µg/l N"]
no3 = wc_df["NO3-N_µg/l N"]
totp = wc_df["TOTP_µg/l P"]

# TOC multiplied by 1000 so it is on the same scale as the µg/l columns
# (mg -> µg, going by the units in the column names)
toc_x1000 = 1000 * wc_df["TOC_mg C/l"]

# TON = TOTN - NO3 (NH4 is not subtracted here)
wc_df["TON_µg/l N"] = totn - no3

# Ratios relative to TOTP
wc_df["TOTN/TOTP"] = totn / totp
wc_df["NO3/TOTP"] = no3 / totp

# TOC ratios, using the re-calculated TON
wc_df["TOC/TON"] = toc_x1000 / wc_df["TON_µg/l N"]
wc_df["TOC/TOTP"] = toc_x1000 / totp

#### 4.3.2. Aggregate to annual medians

In [14]:
# Annual medians by station, replacing any earlier 'ann_df' so that the
# current TON and ratio columns in 'wc_df' are used.
# 'numeric_only=True' restricts the aggregation to numeric columns. The text
# columns (station_code, station_name) have no median and, from pandas 2.0,
# are no longer dropped silently - without this argument the call raises a
# TypeError
wc_df["year"] = wc_df["sample_date"].dt.year
ann_df = (
    wc_df.groupby(["station_id", "year"]).median(numeric_only=True).reset_index()
)

ann_df.head()

Unnamed: 0,station_id,year,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23472,1990,0.0,0.0,20.0,330.0,,,,,,,,
1,23472,1993,0.0,0.0,30.0,490.0,,,,,,,,
2,23472,1995,0.0,0.0,0.0,480.0,,,,,,,,
3,23472,1997,0.0,0.0,10.0,350.0,,,,,,,,
4,23472,2000,0.0,0.0,23.0,506.262169,,764.731,,258.468831,,,,


#### 4.3.3. Aggregate to overall medians

In [15]:
# Identify stations where TOC, TOTN and NO3 all meet the selection criteria
sel_query = "(st_yr == 2012) and (variable == 'TOC_TOTN_NO3') and (include == 1)"
sel_stns = set(inc_df.query(sel_query)["station_id"].values)

n_sel = len(sel_stns)
print(
    f"There are {n_sel} stations with TOC, TOTN and NO3 meeting the specified criteria."
)

There are 310 stations with TOC, TOTN and NO3 meeting the specified criteria.


In [16]:
# Write the metadata for the selected stations to CSV
sel_stn_df = stn_df.loc[stn_df["station_id"].isin(sel_stns)]
csv_path = "./results/medians_2012-2016/medians_2012-2016_toc_totn_no3_stations.csv"
sel_stn_df.to_csv(csv_path, index=False)

# Show the selected stations on a map, with station codes as popups
map_kwargs = dict(
    popup="station_code", cluster=True, kartverket=True, aerial_imagery=True
)
nivapy.spatial.quickmap(sel_stn_df, **map_kwargs)

In [17]:
# Calculate medians of the annual medians for 2012 to 2016 (inclusive).
# The upper bound keeps any years after 2016 out of the result, so the period
# matches both the station selection criteria and the output file name
med_df = ann_df.query(
    "(station_id in @sel_stns) and (year >= 2012) and (year <= 2016)"
)
med_df = (
    med_df.groupby("station_id")
    .median()
    .reset_index()
    .drop(columns=["depth1", "depth2", "year"])
)

csv_path = "./results/medians_2012-2016/medians_2012-2016_toc_totn_no3.csv"
med_df.to_csv(csv_path, index=False)

med_df.head()

Unnamed: 0,station_id,NH4-N_µg/l N,NO3-N_µg/l N,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,TON_µg/l N,TOTN/TOTP,NO3/TOTP,TOC/TON,TOC/TOTP
0,23516,51.0,753.0,0.935,840.0,8.0,131.0,91.875,71.8125,5.699246,72.604167
1,23562,7.624085,309.896016,0.427076,400.0,2.0,90.103984,161.5,126.981494,4.517678,213.538022
2,23563,7.180843,624.35781,1.060873,774.852659,4.0,127.446694,196.149772,163.756698,5.560482,258.22749
3,23564,19.453592,240.989552,0.744111,400.0,4.971572,157.5,86.85,51.4,4.726661,155.514181
4,23565,5.0,607.5,0.87,675.0,6.0,140.5,138.214286,112.982143,5.952729,146.785714
