In [1]:
import pandas as pd
import imp

# Compare means

For the 2017-18 Elveovervåkingsprogrammet, Øyvind would like to compare average concentrations for the 20 "main" rivers over two time periods: (i) 2013 to 2017 inclusive, and (ii) 2017 only. This procedure is complicated slightly by the OSPAR methodology for handling detection limits (described [here](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/rid_data_exploration.ipynb#2.3.2.-Limit-of-detection-values)).

This notebook extracts all the water chemistry data for the stations of interest over the two time periods. It then calculates corrected and uncorrected mean values for each period, for each of the "standard" RID parameters.

 * **Uncorrected means**. The mean of the raw data for the time period, where values reported as below the limit of detection (LOD) are assumed to be equal to the LOD itself <br><br>
 
 * **Corrected means**. The mean of the raw data for the time period, where values reported as below the limit of detection (LOD) are adjusted according to the [OSPAR methodology](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/rid_data_exploration.ipynb#2.3.2.-Limit-of-detection-values).

In [2]:
# Load the RESA2 helper module from its source file, then connect to the db
resa2_basic_path = r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\Upload_Template\useful_resa2_code.py'
resa2_basic = imp.load_source('useful_resa2_code', resa2_basic_path)
engine, conn = resa2_basic.connect_to_resa2()

# Load the custom RID functions in the same way (available below as `rid`)
rid_func_path = r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Python\rid\notebooks\useful_rid_code.py'
rid = imp.load_source('useful_rid_code', rid_func_path)

## 1. List of stations of interest

In [3]:
# Read the stations of interest from the 'RID_20' sheet of the site list workbook
stn_xlsx = r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Data\RID_Sites_List_2017-2020.xlsx'
stn_df = pd.read_excel(stn_xlsx, sheet_name='RID_20')

# Preview the first few rows
stn_df.head()

Unnamed: 0,station_id,station_code,station_name,old_rid_group,new_rid_group,ospar_region,station_type,nve_vassdrag_nr,lat,lon,utm_north,utm_east,utm_zone
0,29612,BUSEDRA,Drammenselva,rid_11,rid_20,SKAGERAK,R,012.A3,59.753995,10.00899,6624446.0,556695.0,32
1,29779,FINEALT,Altaelva,rid_11,rid_20,LOFOTEN-BARENTS SEA,R,212.A0,69.900992,23.286977,7759686.0,586586.0,34
2,29820,FINETAN,Tanaelva,rid_36,rid_20,LOFOTEN-BARENTS SEA,R,234.B41,70.229993,28.173988,7791949.0,544316.0,35
3,29821,HOREVOS,Vosso(Bolstadelvi),rid_11,rid_20,NORTH SEA,R,062.C1,60.647,6.112,6726970.0,342124.0,32
4,29782,NOREVEF,Vefsna,rid_11,rid_20,NORWEGIAN SEA2,R,151.A4,65.749,13.239,7293064.0,419297.0,33


## 2. Extract water chemistry and adjust LOD values

In [4]:
# Pars of interest
par_list = ['SPM', 'TOC', 'PO4-P', 'TOTP', 'NO3-N', 'NH4-N', 
            'TOTN', 'SiO2', 'Ag', 'As', 'Pb', 'Cd', 'Cu', 
            'Zn', 'Ni', 'Cr', 'Hg']

# Time periods to summarise: (column-name suffix, start date, end date)
periods = [('2013-17', '2013-01-01', '2017-12-31'),
           ('2017', '2017-01-01', '2017-12-31')]

# Names of the output columns, in the order they are calculated below
mean_cols = [prefix + label
             for label, st_dt, end_dt in periods
             for prefix in ('uncor_mean_', 'cor_mean_')]


def mean_by_par(wc_df, col_name):
    """ Calculate the mean of each water chemistry column, excluding any 
        "flag" columns.

    Args:
        wc_df:    Dataframe. Water chemistry data, as returned by 
                  rid.extract_water_chem() or rid.adjust_lod_values().
                  Column names are split on the first underscore and
                  columns where the remainder is exactly 'flag' are 
                  treated as flag columns
        col_name: Str. Name for the column of means in the output

    Returns:
        Dataframe with a single column named 'col_name'. The index is 
        named 'par' and contains the original column names (e.g. 
        'Ag_µg/l')
    """
    means = wc_df.mean()

    # Keep everything except the flag columns. partition() returns an 
    # empty remainder for names with no underscore, so those are kept
    keep = [str(col).partition('_')[2] != 'flag' for col in means.index]

    means_df = means.loc[keep].to_frame(name=col_name)
    means_df.index.name = 'par'

    return means_df


# Container for results
df_list = []

# Loop over stations
for stn_id in stn_df['station_id'].values:
    # One single-column df per (period, corrected/uncorrected) combination
    period_dfs = []

    for label, st_dt, end_dt in periods:
        # Get raw data for this period
        wc_df, dup_df = rid.extract_water_chem(stn_id, par_list, 
                                               st_dt, end_dt,
                                               engine, plot=False)

        # Calculate means for uncorrected data. This must be done before
        # the LOD correction, in case adjust_lod_values() modifies wc_df
        # in place
        period_dfs.append(mean_by_par(wc_df, 'uncor_mean_' + label))

        # Apply LOD correction
        wc_df = rid.adjust_lod_values(wc_df)

        # Calculate means for corrected data
        period_dfs.append(mean_by_par(wc_df, 'cor_mean_' + label))

    # Concatenate to single df, aligned on 'par'
    stn_means = pd.concat(period_dfs, axis=1, sort=False).reset_index()

    # Tidy
    stn_means['station_id'] = stn_id
    df_list.append(stn_means[['station_id', 'par'] + mean_cols])

# Combine results
df = pd.concat(df_list, axis=0, sort=False)

# Join station details
stn_cols = ['station_id', 'station_code', 'station_name', 
            'old_rid_group', 'new_rid_group']
df = pd.merge(df,
              stn_df[stn_cols],
              how='left',
              on='station_id')
df = df[stn_cols + ['par'] + mean_cols]

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are

In [5]:
# Save the combined table of means to CSV (UTF-8, without the row index)
out_csv = r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Results\Loads_CSVs\mean_chemistry_2013-17.csv'
df.to_csv(out_csv, encoding='utf-8', index=False)

# Preview the first few rows
df.head()

Unnamed: 0,station_id,station_code,station_name,old_rid_group,new_rid_group,par,uncor_mean_2013-17,cor_mean_2013-17,uncor_mean_2017,cor_mean_2017
0,29612,BUSEDRA,Drammenselva,rid_11,rid_20,Ag_µg/l,0.026254,0.004269,0.002,0.0
1,29612,BUSEDRA,Drammenselva,rid_11,rid_20,As_µg/l,0.159686,0.159686,0.152,0.152
2,29612,BUSEDRA,Drammenselva,rid_11,rid_20,Cd_µg/l,0.009189,0.009172,0.008025,0.008025
3,29612,BUSEDRA,Drammenselva,rid_11,rid_20,Cr_µg/l,0.193544,0.191544,0.1425,0.1425
4,29612,BUSEDRA,Drammenselva,rid_11,rid_20,Cu_µg/l,0.827329,0.827329,0.6875,0.6875
