In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

# Warnings are currently shown; uncomment the next line to suppress them
#warnings.filterwarnings("ignore")
# Apply matplotlib's 'ggplot' style sheet to the figures created below
plt.style.use('ggplot')

In [2]:
# Connect to NIVABASE. The call prompts for a username and password (see the
# cell output). The returned object (eng) is passed to the nivapy.da query
# functions used in later cells.
eng = nivapy.da.connect()

Username:  ···
Password:  ········


Connection successful.


# ICPW Thematic Report 2020 - Nitrogen (Part 1: Data availability matrices)

Creating an overview of data availability for the ICPW stations.

In [3]:
# Load the ICPW station list from the project spreadsheet
stn_path = r'../data/all_icpw_sites_may_2019.xlsx'
stn_df = pd.read_excel(stn_path, sheet_name='all_icpw_stns')

# Check station numbers seem OK: split the stations by 'group'.
# Stations labelled 'Trends+Core' are included in both subsets.
in_trends = stn_df['group'].isin(['Trends', 'Trends+Core'])
in_core = stn_df['group'].isin(['Core', 'Trends+Core'])
trend_df = stn_df[in_trends]
core_df = stn_df[in_core]

print(f'There are {len(stn_df)} unique stations within the ICPW project as a whole.')
stn_df.head()

There are 556 unique stations within the ICPW project as a whole.


Unnamed: 0,station_id,station_code,station_name,latitude,longitude,altitude,continent,country,region,group
0,38115,Tr18_CA_DO1,Blue Chalk Lake,45.1999,-78.9432,344.0,North America,Canada,Ont,Trends
1,38116,Tr18_CA_DO2,Chub Lake,45.2138,-78.9836,343.0,North America,Canada,Ont,Trends
2,38117,Tr18_CA_DO3,Crosson Lake,45.084,-79.036,371.0,North America,Canada,Ont,Trends
3,38118,Tr18_CA_DO4,Dickie Lake,45.151,-79.0876,379.0,North America,Canada,Ont,Trends
4,38119,Tr18_CA_DO5,Harp Lake,45.3798,-79.1335,327.0,North America,Canada,Ont,Trends


In [4]:
# Specify time period of interest. Both dates are passed, together with the
# station list, to the database queries in this and later cells.
st_dt = '1990-01-01'
end_dt = '2016-12-31'

# Get the parameters available for the selected stations and dates
par_df = nivapy.da.select_resa_station_parameters(stn_df, st_dt, end_dt, eng)
par_df

35 parameters available for the selected stations and dates.


Unnamed: 0,parameter_id,parameter_name,unit
0,65,ALK-E,µEq/l
1,50,Al,µg/l
2,223,As,µg/l
3,959,COLOUR,
4,11,Ca,mg/l
5,15,Cd,µg/l
6,7,Cl,mg/l
7,225,Cr,µg/l
8,16,Cu,µg/l
9,879,DOC,mg/L C


In [5]:
# Parameter IDs to extract from the database
par_ids = [4, 5, 24, 49]  # [TotN, NO3, TotP, NH4]

# Get water chemistry for all stations over the period of interest.
# The call returns the samples plus a second dataframe holding the
# duplicated entries reported in the cell output.
wc_df, dup_df = nivapy.da.select_resa_water_chemistry(
    stn_df,
    par_ids,
    st_dt,
    end_dt,
    eng,
    lod_flags=False,
    drop_dups=True,
)
wc_df.head()

The database contains unexpected duplicate values for some station-date-parameter combinations.
Only the most recent values will be used, but you should check the repeated values are not errors.
The duplicated entries are returned in a separate dataframe.



Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOTN_µg/l N,TOTP_µg/l P
,,,,,,,,,,
0.0,23472.0,CH03,Lago di Tomè,1990-10-08,0.0,0.0,20.0,330.0,,
1.0,23472.0,CH03,Lago di Tomè,1993-08-18,0.0,0.0,30.0,490.0,,
2.0,23472.0,CH03,Lago di Tomè,1995-09-05,0.0,0.0,0.0,510.0,,
3.0,23472.0,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,450.0,,
4.0,23472.0,CH03,Lago di Tomè,1997-08-05,0.0,0.0,10.0,330.0,,


In [6]:
# Save for speed: write the query result to CSV (without the index column)
# so that the next cell can read it back from disk
csv_path = r'../../../Thematic_Trends_Report_2020/data_matrices/matrix_samples.csv'
wc_df.to_csv(csv_path, index=False, encoding='utf-8')

In [7]:
# Reload the saved samples and convert the date strings back to timestamps
wc_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    .assign(sample_date=lambda d: pd.to_datetime(d['sample_date'], format='%Y-%m-%d'))
)
wc_df.head()

Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,NH4-N_µg/l N,NO3-N_µg/l N,TOTN_µg/l N,TOTP_µg/l P
0,23472,CH03,Lago di Tomè,1990-10-08,0.0,0.0,20.0,330.0,,
1,23472,CH03,Lago di Tomè,1993-08-18,0.0,0.0,30.0,490.0,,
2,23472,CH03,Lago di Tomè,1995-09-05,0.0,0.0,0.0,510.0,,
3,23472,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,450.0,,
4,23472,CH03,Lago di Tomè,1997-08-05,0.0,0.0,10.0,330.0,,


In [8]:
# Restructure: keep the station ID, sample date and the four nutrient
# columns, then melt to long format (one row per station, date and parameter).
# The melted frame gets its own name instead of overwriting wc_df, so this
# cell can be re-run without first re-reading the saved data.
id_cols = ['station_id', 'sample_date']
value_cols = ['NH4-N_µg/l N', 'NO3-N_µg/l N', 'TOTN_µg/l N', 'TOTP_µg/l P']
wc_long_df = wc_df[id_cols + value_cols].melt(id_vars=id_cols, var_name='par')

# Attach the station properties. A left join keeps every sample row, even
# if its station_id has no match in stn_df.
df = pd.merge(wc_long_df, stn_df, how='left', on='station_id')
df['year'] = df['sample_date'].dt.year
df = df[['station_id', 'station_code', 'station_name',
         'continent', 'country', 'region', 'sample_date',
         'year', 'par', 'value']]

df.head()

Unnamed: 0,station_id,station_code,station_name,continent,country,region,sample_date,year,par,value
0,23472,CH03,Lago di Tomè,Europe,Switzerland,Alps,1990-10-08,1990,NH4-N_µg/l N,20.0
1,23472,CH03,Lago di Tomè,Europe,Switzerland,Alps,1993-08-18,1993,NH4-N_µg/l N,30.0
2,23472,CH03,Lago di Tomè,Europe,Switzerland,Alps,1995-09-05,1995,NH4-N_µg/l N,0.0
3,23472,CH03,Lago di Tomè,Europe,Switzerland,Alps,1995-09-15,1995,NH4-N_µg/l N,0.0
4,23472,CH03,Lago di Tomè,Europe,Switzerland,Alps,1997-08-05,1997,NH4-N_µg/l N,10.0


In [9]:
# Build one data-availability matrix (stations x years) per parameter.
# Each matrix is saved as a CSV of sample counts and as a heatmap PNG.

# Settings shared by every matrix are defined once, outside the loop.
# Counts are binned at the boundaries 1, 4, 12 and 52; values outside that
# range are drawn in the 'under' (black) and 'over' (red) colours.
bounds = [1, 4, 12, 52]
cmap = mpl.colors.ListedColormap(['blue', 'yellow', 'orange'])
cmap.set_over('red')
cmap.set_under('black')
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

# Very tall heatmap axis with a thin colourbar axis beneath it
grid_kws = {'height_ratios': (.9, .002), 'hspace': .02}

# Output folder. Paths here use their own variable names so that the
# csv_path set by the "Save for speed" cell is not overwritten.
out_fold = '../../../Thematic_Trends_Report_2020/data_matrices'

# Loop over parameters
for par in df['par'].unique():
    # Count non-null values per station and year, with years as columns
    counts_df = df.query('par == @par')
    counts_df = counts_df.groupby(['station_code', 'year'])[['value']].count()
    counts_df = counts_df.unstack('year')
    counts_df[counts_df == 0] = np.nan  # show "no data" as blank, not as a zero count
    counts_df.columns = counts_df.columns.get_level_values(1)

    # Parameter name without the unit suffix, used in both file names
    name = par.split('_')[0]

    # Save counts
    counts_csv_path = f'{out_fold}/sample_counts_{name}.csv'
    counts_df.to_csv(counts_csv_path)

    # Plot
    f, (ax, cbar_ax) = plt.subplots(2, figsize=(8,150), gridspec_kw=grid_kws)
    ax = sn.heatmap(counts_df,
                    ax=ax,
                    cbar_ax=cbar_ax,
                    linewidths=0.1,
                    linecolor='white',
                    cmap=cmap,
                    norm=norm,
                    cbar_kws={'orientation': 'horizontal',
                              'ticks': bounds,
                              'extend':'both',
                             })
    ax.set_title(par)

    # Save figure, then close it explicitly so figures do not accumulate
    png_path = f'{out_fold}/data_matrix_{name}.png'
    f.savefig(png_path,
              dpi=200,
              bbox_inches = 'tight')
    plt.close(f)

## Update 02.04.2020

Kari has requested a file summarising all ICPW data - see e-mail received 06.03.2020 at 18.03 for details.

In [10]:
# Get water chemistry again, this time passing par_df (the parameters found
# above) instead of a list of parameter IDs
wc_df, dup_df = nivapy.da.select_resa_water_chemistry(
    stn_df,
    par_df,
    st_dt,
    end_dt,
    eng,
    lod_flags=False,
    drop_dups=True,
)

# Discard the 'xxx_' column from the result
wc_df = wc_df.drop(columns='xxx_')

wc_df.head()

The database contains unexpected duplicate values for some station-date-parameter combinations.
Only the most recent values will be used, but you should check the repeated values are not errors.
The duplicated entries are returned in a separate dataframe.



Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,ALK-E_µEq/l,Al_µg/l,As_µg/l,COLOUR_,...,Pb_µg/l,Qs_m3/s,SO4_mg/l,SiO2_mg SiO2/l,TOC_mg C/l,TOTN_µg/l N,TOTP_µg/l P,Temp_oC,Zn_µg/l,pH_
,,,,,,,,,,,,,,,,,,,,,
0.0,23472.0,CH03,Lago di Tomè,1990-10-08,0.0,0.0,,,,,...,,,2.16,3.171429,,,,9.0,,5.44
1.0,23472.0,CH03,Lago di Tomè,1993-08-18,0.0,0.0,0.0,,,,...,,,2.29,,,,,,,5.44
2.0,23472.0,CH03,Lago di Tomè,1995-09-05,0.0,0.0,4.0,,,,...,3.0,,2.24,1.692857,,,,0.0,7.4,5.58
3.0,23472.0,CH03,Lago di Tomè,1995-09-15,0.0,0.0,0.0,,,,...,3.0,,2.02,1.65,,,,8.2,11.8,5.39
4.0,23472.0,CH03,Lago di Tomè,1997-08-05,0.0,0.0,1.0,28.0,,,...,,,1.85,1.607143,,,,15.9,,5.68


In [11]:
# Save wide format (one column per parameter), without the index column
csv_path = r'../../../Thematic_Trends_Report_2020/data_matrices/all_icpw_samples_wide.csv'
wc_df.to_csv(csv_path, index=False, encoding='utf-8')

In [12]:
# Melt to long format: one row per sample and parameter. Rows with a missing
# entry in any column (including the measured value) are dropped.
id_cols = ['station_id', 'station_code', 'station_name', 'sample_date', 'depth1', 'depth2']
wc_df_long = (
    wc_df.melt(id_vars=id_cols, var_name='parameter')
    .dropna(how='any')
)

# Column names have the form '<parameter>_<unit>'. Split on the first
# underscore only, giving two columns: 0 = parameter, 1 = unit.
# expand=True with keyword n replaces tuple-unpacking of the .str accessor
# and the positional n argument, neither of which works in pandas 2.x.
par_unit = wc_df_long['parameter'].str.split('_', n=1, expand=True)
wc_df_long = wc_df_long.assign(parameter=par_unit[0], unit=par_unit[1])

# Re-order cols so that 'value' comes last
cols = [col for col in wc_df_long.columns if col != 'value'] + ['value']
wc_df_long = wc_df_long[cols]

wc_df_long.head()

Unnamed: 0,station_id,station_code,station_name,sample_date,depth1,depth2,parameter,unit,value
1,23472,CH03,Lago di Tomè,1993-08-18,0.0,0.0,ALK-E,µEq/l,0.0
2,23472,CH03,Lago di Tomè,1995-09-05,0.0,0.0,ALK-E,µEq/l,4.0
3,23472,CH03,Lago di Tomè,1995-09-15,0.0,0.0,ALK-E,µEq/l,0.0
4,23472,CH03,Lago di Tomè,1997-08-05,0.0,0.0,ALK-E,µEq/l,1.0
5,23472,CH03,Lago di Tomè,1997-09-03,0.0,0.0,ALK-E,µEq/l,4.0


In [13]:
# Save long format (one row per sample and parameter), without the index column
csv_path = r'../../../Thematic_Trends_Report_2020/data_matrices/all_icpw_samples_long.csv'
wc_df_long.to_csv(csv_path, index=False, encoding='utf-8')