In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
import mpld3
import pandas as pd
import imp
import numpy as np
import os
from sqlalchemy import create_engine

# Select seaborn's 'notebook' plotting context for the figures produced below
sn.set_context('notebook')

# RID 2016-17: data processing notebook

Previous notebooks have developed the tools and methodology required to implement the RID workflow for 2016-17. This notebook performs the actual processing and keeps a record of what has been done.

## 1. Add 2016 datasets

### 1.1. Update flow datasets

[Notebook 2](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/update_flow_datasets.ipynb) documents the processing and upload of the NVE flow data (both modelled and observed) for 2016. The RESA2 database now contains a complete record of all the discharge data required for this year's RID processing.

### 1.2. Water chemistry

The water samples collected for this project are analysed by the NIVA laboratory and results are automatically transferred to the RESA2 database. Liv Bente has now quality-checked these data and any necessary corrections have been made.

## 2. Estimate observed loads

### 2.1. Loads for all rivers in 2016

The code below is taken from Section 2 of [notebook 3](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/estimate_loads.ipynb), but this time run using the 2016 data. Loads are calculated directly from contemporary observations for the RID_11 and RID_36 sites, and they are inferred from historic concentrations for the RID_108 sites.

**Remember to change the year below!**

In [2]:
# Load the general RESA2 helper module straight from its file on disk, then
# use it to open the database connection (engine + conn) used by later cells
resa2_basic_path = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters'
                    r'\Upload_Template\useful_resa2_code.py')
resa2_basic = imp.load_source('useful_resa2_code', resa2_basic_path)
engine, conn = resa2_basic.connect_to_resa2()

# Load the RID-specific helper functions in the same way; they are accessed
# through the name `rid` in the rest of the notebook
rid_func_path = (r'C:\Data\James_Work\Staff\Oeyvind_K'
                 r'\Elveovervakingsprogrammet\Python\rid\notebooks'
                 r'\useful_rid_code.py')
rid = imp.load_source('useful_rid_code', rid_func_path)

In [3]:
# Excel workbook listing the monitoring sites; each site group is read from
# its own named worksheet
in_xlsx = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet'
           r'\Data\RID_Sites_List.xlsx')

# RID_11 and RID_108 site lists are used exactly as read
rid_11_df = pd.read_excel(in_xlsx, sheetname='RID_11')
rid_108_df = pd.read_excel(in_xlsx, sheetname='RID_108')

# RID_36: discard every row containing a missing value. This removes the
# 37th site, which has no NVE code
rid_36_df = pd.read_excel(in_xlsx, sheetname='RID_36').dropna(how='any')

In [5]:
# Stack the three site tables so that every station is processed in one pass
rid_all_df = pd.concat([rid_11_df, rid_36_df, rid_108_df], axis=0)

# Year to process
year = 2016

# Parameters passed to the load estimation
par_list = ['SPM', 'TOC', 'PO4-P', 'TOTP', 'NO3-N', 'NH4-N', 'TOTN', 'SiO2',
            'Ag', 'As', 'Pb', 'Cd', 'Cu', 'Zn', 'Ni', 'Cr', 'Hg']

# Estimate loads station by station, keeping one result per station
loads_list = [rid.estimate_loads(stn_id, par_list, year, engine,
                                 infer_missing=True)
              for stn_id in rid_all_df['station_id'].values]

# Combine the per-station results into a single dataframe
lds_all = pd.concat(loads_list, axis=0)

# Write the combined loads to a CSV named after the processing year
out_csv = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet'
           r'\Results\Loads_CSVs\loads_all_sites_%s.csv') % year
lds_all.to_csv(out_csv, index_label='station_id')

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are

### 2.2. Loads for the RID_11 rivers through time

The code below is taken from Section 3 of [notebook 3](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/estimate_loads.ipynb), but this time the bar charts include data from 2016.

**Remember to change the end year (`end_yr`) below!**

In [6]:
# Period of interest (both years inclusive)
st_yr, end_yr = 1990, 2016

# Container for results: one dataframe per station-year
loads_list = []

# Loop over sites
for stn_id in rid_11_df['station_id'].values:
    # Loop over years.
    # NOTE: the loop variable is the global name `year`, so once this cell
    # has run `year` is equal to `end_yr`
    for year in range(st_yr, end_yr+1):
        # Single parenthesised argument: prints the same text under
        # Python 2 and is also valid syntax under Python 3
        print('Processing Station ID %s for %s' % (stn_id, year))
        
        # Get loads
        l_df = rid.estimate_loads(stn_id, par_list, 
                                  year, engine,
                                  infer_missing=True)
        
        # Station-years for which nothing is returned are skipped
        if l_df is not None:
            # Name and reset index, so the station ID becomes a column
            l_df.index.name = 'station_id'
            l_df.reset_index(inplace=True)

            # Add year
            l_df['year'] = year

            # Add to output
            loads_list.append(l_df)

# Concatenate to new df
lds_ts = pd.concat(loads_list, axis=0)

# Build multi-index (station_id, year); the plotting cell selects on it
lds_ts.set_index(['station_id', 'year'], inplace=True)

# Save output, with the period covered included in the file name
out_csv = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet'
           r'\Results\Loads_CSVs\loads_ts_11_%s-%s.csv' % (st_yr, end_yr))
lds_ts.to_csv(out_csv)

Processing Station ID 29615 for 1990
    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

Processing Station ID 29615 for 1991
    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

Processing Station ID 29615 for 1992
    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

Processing Station ID 29615 for 1993
Processing Station ID 29615 for 1994
    The database contains duplicated v

**Remember to change the year in the output folder path below!**

In [7]:
%%capture
# This code cell produces lots of Deprecation Warnings from Seaborn/Pandas.
# %%capture suppresses all output from this cell to keep things tidy

# Output folder for plots
out_fold = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet'
            r'\Results\TS_Plots\RID_Plots_To_2016')

# The folder name contains the year, so it may not exist yet for a new
# processing run: create it rather than failing at the first savefig call
if not os.path.isdir(out_fold):
    os.makedirs(out_fold)

# Loop over stations, producing one multi-panel figure per station
for stn_id in rid_11_df['station_id'].values:
    # Get data for this station from the (station_id, year) multi-index.
    # .loc is strictly label-based; the deprecated .ix indexer mixes label
    # and positional lookup
    df = lds_ts.loc[stn_id]
    
    # Split the columns into those whose second '_'-separated token is 'Est'
    # and all the others
    cols = df.columns
    est_cols = [i for i in cols if i.split('_')[1]=='Est']
    val_cols = [i for i in cols if i.split('_')[1]!='Est']
    
    # Convert to "long" format. reset_index() returns a new dataframe with
    # 'year' as a column, instead of modifying a column slice of df in place
    val_df = pd.melt(df[val_cols].reset_index(),
                     id_vars='year', var_name='par_unit')
    est_df = pd.melt(df[est_cols].reset_index(),
                     id_vars='year', var_name='par_est', value_name='est')
    
    # Get just par (the text before the first '_') for joining
    val_df['par'] = val_df['par_unit'].str.split('_', expand=True)[0]
    est_df['par'] = est_df['par_est'].str.split('_', expand=True)[0]
    
    # Join, so each value row carries the matching 'est' entry
    df = pd.merge(val_df, est_df, how='left',
                  on=['year', 'par'])
    
    # Extract cols of interest
    df = df[['year', 'par_unit', 'value', 'est']]

    # Plot: one bar-chart panel per par_unit column, bars coloured by 'est'
    g = sn.factorplot(x='year', y='value', hue='est',
                      col='par_unit', col_wrap=3,
                      data=df, 
                      kind='bar',
                      dodge=False,
                      sharex=False,
                      sharey=False,
                      alpha=0.5,
                      aspect=2,
                      legend=False)
    
    # Rotate tick labels and tidy
    for ax in g.axes.flatten(): 
        for tick in ax.get_xticklabels(): 
            tick.set(rotation=45)
    plt.tight_layout()
    
    # Save as <station_id>.png, then close the figure to free memory
    out_path = os.path.join(out_fold, '%s.png' % stn_id)
    plt.savefig(out_path, dpi=200)
    plt.close()

## 3. Generate output tables for Word

### 3.1. Table 1: Raw water chemistry

The code below is based on Section 2 of [notebook 5](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/word_data_tables.ipynb).

**Remember to change the year below (in both the file path and the function call)!**

In [10]:
# Stations for this table: the RID_11 and RID_36 sites stacked together
stn_df = pd.concat([rid_11_df, rid_36_df], axis=0)

# Word document to edit. This should be a *COPY* of the template, not the
# template itself
in_docx = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Results'
           r'\Word_Tables\2017Analysis_2016Data\Table_1_2016.docx')

# Write the 2016 water chemistry tables into the document
rid.write_word_water_chem_tables(stn_df, 2016, in_docx, engine)

Processing: Glomma ved Sarpsfoss
    Extracting water chemistry data...
    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    Extracting flow data...
    Writing sample dates...
    Deleting empty rows...
    Writing data values...
    Writing summary statistics...
    Done.
Processing: Alna
    Extracting water chemistry data...
    The database contains duplicated values for some station-date-parameter combinations.
    Only the most recent values will be used, but you should check the repeated values are not errors.
    The duplicated entries are returned in a separate dataframe.

    Extracting flow data...
    Writing sample dates...
    Deleting empty rows...
    Writing data values...
    Writing summary statistics...
    Done.
Processing: Drammenselva
    Extracting water 

### 3.2. Table 3: Estimated loads at each site

The code below is based on Section 3 of [notebook 5](http://nbviewer.jupyter.org/github/JamesSample/rid/blob/master/notebooks/word_data_tables.ipynb).

**Remember to change the year in the file path below!**

In [11]:
# Stations for this table: the RID_11, RID_36 and RID_108 sites stacked together
stn_df = pd.concat([rid_11_df, rid_36_df, rid_108_df], axis=0)

# CSV of estimated loads for all sites (output of the loads calculation)
loads_csv = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Results'
             r'\Loads_CSVs\loads_all_sites_2016.csv')

# Word document to edit. This should be a *COPY* of the template, not the
# template itself
in_docx = (r'C:\Data\James_Work\Staff\Oeyvind_K\Elveovervakingsprogrammet\Results'
           r'\Word_Tables\2017Analysis_2016Data\Table_2_2016.docx')

# Write the loads table into the document
rid.write_word_loads_table(stn_df, loads_csv, in_docx, engine)

Processing:
    Numedalslågen (VESENUM)...
    Vosso(Bolstadelvi) (HOREVOS)...
    Orreelva (ROGEORR)...
    Skienselva (TELESKI)...
    Otra (VAGEOTR)...
    Vefsna (NOREVEF)...
    Alna (OSLEALN)...
    Glomma ved Sarpsfoss (ØSTEGLO)...
    Altaelva (FINEALT)...
    Drammenselva (BUSEDRA)...
    Orkla (STREORK)...
    Årdalselva (ROGEÅRD)...
    Tista utløp Femsjøen (ØSTETIS)...
    Gaula (STREGAU)...
    Driva (MROEDRI)...
    Snåsavassdraget (NTRESNÅ)...
    Ranaelva (NORERAN)...
    Lyseelva (ROGELYS)...
    Beiarelva (NOREBEI)...
    Barduelva (TROEBAR)...
    Målselv (TROEMÅL)...
    Suldalslågen (ROGESUL)...
    Figgjoelva (ROGEFIG)...
    Nausta (SFJENAU)...
    Sira (VAGESIR)...
    Bjerkreimselva (ROGEBJE)...
    Kvina (VAGEKVI)...
    Tokkeelva (TELETOK)...
    Tanaelva (FINETAN)...
    Jostedøla (SFJEJOS)...
    Saudaelva (ROGESAU)...
    Stjørdalselva (STRESTJ)...
    Ulladalsåna (Ulla) (ROGEULL)...
    Lyngdalselva (VAGELYN)...
    Surna (MROESUR)...
    Namsen (NTRENAM)