In [1]:
%matplotlib inline
import pandas as pd
import nivapy3 as nivapy
import matplotlib.pyplot as plt
import numpy as np

plt.style.use("ggplot")

In [2]:
# Open a database connection via nivapy. `eng` is the handle passed to the
# nivapy helpers and to pd.read_sql in the cells below. The call prompts for
# a username and password (see the cell output).
eng = nivapy.da.connect()

Username:  ···
Password:  ········


Connection successful.


# 1000 Lakes 2019 (Part 1b: Further cleaning of RESA2)

This notebook attempts to fix further data issues in RESA.

## 1. Duplicated/incorrect dates

The 1000 Lakes samples were split and analysed separately for metals. However, there seems to have been a mix-up, and some samples have come back with different dates. An e-mail from Liv Bente (received 05.06.2020 at 17.14) identifies which samples are duplicates and which are from other projects. The code below merges and corrects the duplicates based on Liv Bente's spreadsheet.

Roar has done something similar in AM.

In [3]:
# Read data from Liv Bente: one row per water sample, with the date/depths
# currently stored ('sample_date', 'depth1', 'depth2') and the corrected
# values ('cor_sample_date', 'cor_depth1', 'cor_depth2')
xl_path = r"../../../quality_control/resa_date_errors_2020-06-11.xlsx"
df = pd.read_excel(xl_path)

# Parse both date columns using the "dd.mm.yyyy HH.MM.SS" layout
df['sample_date'] = pd.to_datetime(df['sample_date'], format='%d.%m.%Y %H.%M.%S')
df['cor_sample_date'] = pd.to_datetime(df['cor_sample_date'], format='%d.%m.%Y %H.%M.%S')

# Missing corrected depths are treated as 0. Assign the result back rather
# than calling fillna(inplace=True) on a column selection: the in-place form
# is chained assignment, which warns in pandas 2.x and has no effect on `df`
# once Copy-on-Write is active
df['cor_depth1'] = df['cor_depth1'].fillna(0)
df['cor_depth2'] = df['cor_depth2'].fillna(0)

df.head()

Unnamed: 0,lbs_comment,station_code,dup,station_name,sample_date,cor_sample_date,excel_date,depth1,depth2,cor_depth1,cor_depth2
0,"Dyp er feil, skal være 0 m",434-1-11,1,Abbortjørna,2019-10-15,2019-10-15,2019-10-15,2,2,0.0,0.0
1,"Tusen, dyp & prøvedato ok",434-1-11,1,Abbortjørna,2019-10-15,2019-10-15,2019-10-15,0,0,0.0,0.0
2,"Tusen, prøvedato ok",1866-1-11,1,Austpollvatnan,2019-09-24,2019-09-24,2019-09-24,0,0,0.0,0.0
3,Prøvedato 1 dag feil,1866-1-11,1,Austpollvatnan,2019-09-23,2019-09-24,2019-09-23,0,0,0.0,0.0
4,"Tusen, prøvedato ok",616-2-20,1,Belgevatnet,2019-10-02,2019-10-02,2019-10-02,0,0,0.0,0.0


In [4]:
# Stations belonging to RESA project 4530 (preview of the first five rows)
resa_proj_ids = [4530]
stn_df = nivapy.da.select_resa_project_stations(resa_proj_ids, eng)
stn_df.head(5)

Unnamed: 0,station_id,station_code,station_name,latitude,longitude,altitude
0,3167,620-4-6,Ørteren,60.47,7.795,1147.0
1,3168,621-1-27,Flåvatna,60.2,9.183,855.0
2,3169,621-3-5,Soneren,60.061,9.545,104.0
3,3170,622-2-43,Trytetjern,60.213,9.764,275.0
4,3171,622-4-4,Krøderen,60.327,9.645,133.0


The code below is messy. It iterates through the stations in Liv Bente's spreadsheet and corrects/merges the issues highlighted. In most cases, this means taking two water samples and combining them. Note that I haven't favoured any particular sample ID as the "original" - I've just picked one of the duplicates, merged it with the other, then updated the sample properties (date and depth) so they are correct.

In [8]:
## NOTE(review): this whole cell is commented out - presumably because it has
## already been run once against RESA2 and must not be re-applied. Confirm
## before re-enabling.
##
## Summary of what the code does, per station code in `df`:
##   * one row:  assert the stored date/depths already equal the corrected ones
##   * two rows: find the water sample matching each row, move all chemistry
##               values onto the first sample, delete the second sample (from
##               labware_wsid, then water_samples) and set the corrected date
##               and depths of 0 on the sample that is kept
##   * any other row count: nothing happens (there is no else branch)
##
## It needs `stn_df` to have 'station_code' and 'station_id' columns, i.e. the
## RESA station table from select_resa_project_stations - not the AM/RESA link
## table that a later cell also assigns to `stn_df`.
##
## NOTE(review): the UPDATE/DELETE statements are issued one at a time with
## eng.execute; whether they share a transaction is not visible here - verify,
## since a failure part-way through would leave a sample half-merged.
##
## Loop over "problem" stations
#for stn_cde in df['station_code'].unique():
#    stn_id = stn_df.query("station_code == @stn_cde")['station_id'].iloc[0]
#    df2 = df.query("station_code == @stn_cde")
#    
#    if len(df2) == 1:
#        # Just one sample. Check OK
#        assert (df2['sample_date'] == df2['cor_sample_date']).all(), df2
#        assert (df2['depth1'] == df2['cor_depth1']).all(), df2
#        assert (df2['depth2'] == df2['cor_depth2']).all(), df2
#        
#    elif len(df2) == 2:
#        # Get correct date
#        cor_date = df2['cor_sample_date'].unique()
#        assert len(cor_date) == 1
#        cor_date = np.datetime_as_string(cor_date)[0][:10]
#        
#        # Get duplicated water sample IDs
#        ws_ids = []
#        for idx, row in df2.iterrows():
#            date = row['sample_date'].strftime('%Y-%m-%d')
#            dep1 = row['depth1']
#            dep2 = row['depth2']  
#            
#            sql = (f"SELECT * FROM resa2.water_samples "
#                   f"WHERE station_id = {stn_id} "
#                   f"AND TRUNC(sample_date) = DATE '{date}' "
#                   f"AND depth1 = {dep1} "
#                   f"AND depth2 = {dep2} "
#                  )
#            ws_df = pd.read_sql(sql, eng)
#            assert len(ws_df) == 1, (stn_cde, ws_df)
#            ws_id = ws_df['water_sample_id'].iloc[0]
#            ws_ids.append(ws_id)
#            
#        assert len(ws_ids) == 2
#        assert ws_ids[0] != ws_ids[1]
#    
#        # Move all chem values to first water sample. The choice is arbitrary
#        cor_ws = ws_ids[0]
#        bad_ws = ws_ids[1]        
#        sql = (f"UPDATE resa2.water_chemistry_values2 "
#               f"SET sample_id = {cor_ws} "
#               f"WHERE sample_id = {bad_ws}")
#        eng.execute(sql)
#        
#        # Delete "bad" ws
#        sql = (f"DELETE FROM resa2.labware_wsid "
#               f"WHERE water_sample_id = {bad_ws}")
#        eng.execute(sql)
#        
#        sql = (f"DELETE FROM resa2.water_samples "
#               f"WHERE water_sample_id = {bad_ws}")
#        eng.execute(sql)
#        
#        # Assign correct date and depths to "good" ws
#        sql = (f"UPDATE resa2.water_samples "
#               f"SET "
#               f"  sample_date = TO_DATE('{cor_date}', 'yyyy-mm-dd'), "
#               f"  depth1 = 0, "
#               f"  depth2 = 0 "
#               f"WHERE water_sample_id = {cor_ws}"
#              )
#        eng.execute(sql)

## 2. Synchronise with AM

Roar has fixed the duplicated dates in AM (see e-mail received 10.06.2020 at 15.46). I have then exported the latest data from AM and removed samples collected as part of other projects (again, based on the spreadsheet from Liv Bente). This leaves a spreadsheet with a fairly complete-looking record of 1001 water samples. This seems reasonable: there are 1003 stations in the project in total, but one lake has "disappeared" and Store Ljøsvatnet was sampled by mistake in 2019 in place of Ljøsvannet (which was part of the 1995 survey). Both are part of the 1000 Lakes project, but the latter was not sampled in 2019. 1001 samples therefore seems correct.

The code below attempts to match this spreadsheet to the data in RESA, with the following aims:

 1. Check that the samples in AM can be correctly identified in RESA and update sample metadata if necessary
 
 2. Update the values in RESA to match those in AM
 
 3. Add the relevant RESA water sample IDs to the `SAMPLE_SELECTIONS` table, so that just the 2019 1000 Lakes dataset is available from RESA.

### 2.1. Create sample selection

In [3]:
# Single-row definition of the new sample selection (ID 67) for project 4530
ss_record = {
    'sample_selection_id': 67,
    'project_id': 4530,
    'name': 'Nasjonal Innsjøundersøkelse 2019',
}
ss_df = pd.DataFrame([ss_record])
ss_df

Unnamed: 0,sample_selection_id,project_id,name
0,67,4530,Nasjonal Innsjøundersøkelse 2019


In [19]:
## NOTE(review): commented out - presumably already run once; re-running would
## append the same definition row again (if_exists='append'). Confirm before
## re-enabling. Appends `ss_df` to resa2.sample_selection_definitions.
#ss_df.to_sql('sample_selection_definitions',
#             schema='resa2',
#             if_exists='append',
#             index=False,
#             con=eng,
#            )

### 2.2. Match water samples and populate "sample selection"

In [5]:
# Load the 'WaterChemistry' sheet of the AM export and report the sample count
am_xls = r"../../../1000_Lakes_AM_Export_2020-06-11_Project_Only.xlsx"
am_df = pd.read_excel(io=am_xls, sheet_name='WaterChemistry')
print(am_df.shape[0])
am_df.head()

1001


Unnamed: 0,project_id,project_name,station_id,station_code,station_name,sample_date,excel_date,depth1,depth2,Al,...,PO4-P,SiO2,SO4,Temperatur,TOC,TOTN,TOTP,V,Zn,Zn.1
0,12433,Nasjonal Innsjøundersøkelse 2019,26472,1001-1-55,Skeivatnet,30.10.2019 00.00.00,2019-10-30,0,0,300,...,< 1,2.51,1.32,8.2,9.2,390,5,,8.3,
1,12433,Nasjonal Innsjøundersøkelse 2019,71862,1001-2-204,Vassvann,29.10.2019 00.00.00,2019-10-29,0,0,130,...,< 1,2.8,4.13,8.5,6.8,380,6,,8.4,
2,12433,Nasjonal Innsjøundersøkelse 2019,26474,1002-1-62,Vråvatn,30.10.2019 00.00.00,2019-10-30,0,0,220,...,10,2.24,2.19,7.6,7.6,520,15,,5.4,
3,12433,Nasjonal Innsjøundersøkelse 2019,26475,1003-1-16,HOH 130,30.10.2019 00.00.00,2019-10-30,0,0,160,...,< 1,1.87,1.69,8.1,3.6,280,3,,3.2,
4,12433,Nasjonal Innsjøundersøkelse 2019,26476,1003-1-18,HOH 14,30.10.2019 00.00.00,2019-10-30,0,0,110,...,< 1,1.1,2.45,8.6,3.3,270,3,,4.4,


In [3]:
# Stations in AM project 12433; the AM station ID becomes 'am_id' so it can be
# joined to the RESA link table below
am_stns = nivapy.da.select_ndb_project_stations([12433], eng, drop_dups=True)
am_stns.rename(columns={"station_id": "am_id"}, inplace=True)

# Find the RESA ID ('local_pk') for each AM station. IDs are written into the
# IN clause as (1, id) tuples - presumably to get around Oracle's 1000-item
# limit on plain IN lists; confirm
id_tuples = ["(1, %d)" % stn_id for stn_id in am_stns["am_id"].unique()]
bind_stns = ",".join(id_tuples)
sql = (
    "SELECT station_id as am_id, "
    "  local_pk as resa_id "
    " FROM nivadatabase.datasource_station "
    "WHERE datasource_id = 11 "
    "AND (1, station_id) in (%s)"
) % bind_stns
stn_link = pd.read_sql(sql, eng)

# Left join, so every AM station is kept even when no RESA link is found
stn_df = am_stns.merge(stn_link, how="left", on="am_id")

print(len(stn_df), "stations in the project")

stn_df.head()

1003 stations in the project


Unnamed: 0,am_id,station_code,station_name,station_type,longitude,latitude,resa_id
0,26070,221-1-2,Langtjern,Innsjø,11.850274,59.808643,10
1,26071,101-2-7,Hokksjøen,Innsjø,11.563586,59.004423,9
2,26072,402-2-13,Sætertjern,Innsjø,12.446711,60.060222,11
3,26073,419-1-25,Mjøgsjøen,Innsjø,11.842186,60.328578,12
4,26074,425-2-2,Kottern,Innsjø,12.517008,60.590729,13


In [14]:
# For each row of the AM export, find the matching water sample in RESA
ws_list = []
for _, am_row in am_df.iterrows():
    am_date = am_row['excel_date'].strftime("%Y-%m-%d")

    # RESA station linked to this AM station (first match in the link table)
    is_stn = stn_df['am_id'] == am_row['station_id']
    resa_id = stn_df.loc[is_stn, 'resa_id'].iloc[0]

    # Match on station, exact sample date and both depths
    sql = (
        "SELECT * FROM resa2.water_samples "
        "WHERE station_id = {stn} "
        "AND sample_date = DATE '{date}' "
        "AND depth1 = {dep1} "
        "AND depth2 = {dep2}"
    ).format(
        stn=resa_id,
        date=am_date,
        dep1=am_row['depth1'],
        dep2=am_row['depth2'],
    )
    ws_df = pd.read_sql(sql, eng)

    # Exactly one RESA sample must correspond to each AM sample
    assert len(ws_df) == 1, (resa_id, ws_df)

    # Keep the RESA water sample ID
    ws_list.append(ws_df['water_sample_id'].iloc[0])

# Rows to add to the sample selection: every matched sample gets selection 67
ss_df = pd.DataFrame({'water_sample_id': ws_list}).assign(sample_selection_id=67)
print(len(ss_df))

ss_df.head()

1001


Unnamed: 0,water_sample_id,sample_selection_id
0,871397,67
1,874269,67
2,871430,67
3,871394,67
4,871393,67


In [23]:
## NOTE(review): commented out - presumably already run once; re-running would
## append the same rows again (if_exists='append'). Confirm before re-enabling.
## Appends `ss_df` (water_sample_id, sample_selection_id) to
## resa2.sample_selections.
#ss_df.to_sql('sample_selections',
#             schema='resa2',
#             if_exists='append',
#             index=False,
#             con=eng,
#            )

### 2.3. Update RESA chemistry to match AM

Based on previous checking, TOTN and pH need updating in RESA.

In [11]:
## NOTE(review): this whole cell is commented out - presumably because the
## updates have already been applied to RESA2. Confirm before re-enabling.
##
## Summary of what the code does:
##   1. Collect the RESA method IDs linked to parameter 1 (pH) and 4 (TOTN)
##   2. For each AM sample, locate the RESA water sample (same lookup as the
##      matching cell: station, exact date, depth1, depth2)
##   3. pH: require exactly one RESA value, then overwrite it with the AM value
##      and clear flag1
##   4. TOTN: take the RESA value with the latest 'entered_date', then
##      overwrite it with the AM value and clear flag1
##   The previous RESA values are kept in `ph_list` / `totn_list`.
##
## NOTE(review): AM values are interpolated straight into the UPDATE text.
## Other AM columns contain strings such as "< 1"; a flagged or missing pH or
## TOTN value would produce invalid SQL - verify those columns are numeric.
##
## RESA par IDs
#ph_par_id = 1
#totn_par_id = 4
#
## Get RESA methods for pH
#sql = (f"SELECT wc_method_id FROM resa2.wc_parameters_methods "
#       f"WHERE wc_parameter_id = {ph_par_id}"
#      )
#ph_methods = pd.read_sql(sql, eng)['wc_method_id'].astype(str)
#ph_methods = ','.join(ph_methods)
#
## Get RESA methods for TOTN
#sql = (f"SELECT wc_method_id FROM resa2.wc_parameters_methods "
#       f"WHERE wc_parameter_id = {totn_par_id}"
#      )
#totn_methods = pd.read_sql(sql, eng)['wc_method_id'].astype(str)
#totn_methods = ','.join(totn_methods)
#
## Loop over data
#ph_list = []
#totn_list = []
#for idx, row in am_df.iterrows():
#    # Get RESA WS ID
#    am_id = row['station_id']
#    am_date = row['excel_date'].strftime("%Y-%m-%d")
#    am_dep1 = row['depth1']
#    am_dep2 = row['depth2']
#    
#    resa_id = stn_df.query("am_id == @am_id")['resa_id'].iloc[0]
#    
#    sql = (f"SELECT * FROM resa2.water_samples "
#           f"WHERE station_id = {resa_id} "
#           f"AND sample_date = DATE '{am_date}' "
#           f"AND depth1 = {am_dep1} "
#           f"AND depth2 = {am_dep2}"
#          )
#    ws_df = pd.read_sql(sql, eng)
#    
#    assert len(ws_df) == 1, (resa_id, ws_df)
#    
#    ws_id = ws_df['water_sample_id'].iloc[0]
#    
#    # Get pH
#    sql = (f"SELECT * FROM resa2.water_chemistry_values2 "
#           f"WHERE sample_id = {ws_id} "
#           f"AND method_id IN ({ph_methods})"
#          )
#    ph_df = pd.read_sql(sql, eng)
#    
#    assert len(ph_df) == 1
#    
#    ph_list.append(ph_df['value'].iloc[0])
#    
#    # Update RESA pH
#    am_ph = row['pH']
#    resa_val_id = ph_df['value_id'].iloc[0]
#    sql = (f"UPDATE resa2.water_chemistry_values2 "
#           f"SET value = {am_ph}, flag1 = NULL "
#           f"WHERE value_id = {resa_val_id}"
#          )
#    eng.execute(sql)    
#    
#    # Get TOTN
#    sql = (f"SELECT * FROM resa2.water_chemistry_values2 "
#           f"WHERE sample_id = {ws_id} "
#           f"AND method_id IN ({totn_methods})"
#          )
#    totn_df = pd.read_sql(sql, eng)
#    
#    # Get the most recent TOTN value
#    totn_df = totn_df.loc[[totn_df['entered_date'].idxmax()]]
#    
#    assert len(totn_df) == 1, totn_df
#    
#    totn_list.append(totn_df['value'].iloc[0])
#    
#    # Update RESA TOTN
#    am_totn = row['TOTN']
#    resa_val_id = totn_df['value_id'].iloc[0]
#    sql = (f"UPDATE resa2.water_chemistry_values2 "
#           f"SET value = {am_totn}, flag1 = NULL "
#           f"WHERE value_id = {resa_val_id}"
#          )
#    eng.execute(sql) 