# BioPuff QC and Dataset Creator

## 2021 Spring
- drop bad parameters
- drop ship deck data (deploy time)
- split data between bottom, profile, sst
    - three files and three datasets
    - combine all floats for a single deployment season dataset (erddap)
    - add deploy location and other meta information
    
### Unique commentary
- POPS3 Failed on Deck
- POPS2 Only provided a partial download before losing connection
- POPS6 Had a Failed Pressure Sensor
- All units had a weight/timing error which didn't provide a profile
- GPS is only available Daily for SST - so interpolate linearly for other timesteps

### Final Dataset ERDDAP Presentation?
- one dataset for bottom, one for sfc (with gps data) **<-- for now this has no QC**  
or  
- a dataset for each unit with a sfc dataset for each unit (20+ datasets) ***<- easier for a plotlydash driven dashboard and has QARTOD QC approach***
- EDD/SDIG keeps raw dataset alive on their erddap (how long?) 
- archive in netcdf? or csv + expressive erddap-xml? **wait to make netcdf until gear is out of water**

In [1]:
from erddapy import ERDDAP
import pandas as pd
import datetime
import numpy as np

In [2]:
import warnings
# Ignore every FutureWarning raised from here on.
# NOTE(review): this is a blanket filter, so deprecation notices from pandas
# and the other libraries used below are hidden as well - confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Base URL of the ERDDAP server used for the dataset searches and downloads.
server_url = 'http://heron.pmel.noaa.gov:8080/erddap'

In [10]:
def _ddmm_to_decimal_degrees(ddmm):
    """Convert ``ddmm.mmmm`` (degrees*100 + minutes) values to decimal degrees.

    Works element-wise on a pandas Series or NumPy array of non-negative values.
    """
    whole_degrees = np.floor(ddmm / 100)
    minutes = ddmm - whole_degrees * 100
    # Only the whole degrees are added back; adding ddmm/100 would count the
    # minutes twice (once as ".mm" and once as minutes/60).
    return whole_degrees + minutes / 60


# Find every GPS dataset on the server and load each one into a DataFrame,
# keyed by dataset id.
e = ERDDAP(server=server_url)
dfname = pd.read_csv(e.get_search_url(response='csv', search_for='POPS AND GPS_'))

gpdf_all = {}
for dataset in sorted(dfname['Dataset ID'].values):
    print(dataset)

    d = ERDDAP(server=server_url,
               protocol='tabledap',
               response='csv',
              )
    d.dataset_id = dataset

    gpdf = d.to_pandas(
        index_col='time (UTC)',
        parse_dates=True,
        skiprows=(1,)  # units information can be dropped.
    )

    gpdf.sort_index(inplace=True)
    # Keep only the first word of each column label (drops any trailing units text).
    gpdf.columns = [column.split()[0] for column in gpdf.columns]

    if 'GPS' in dataset:
        gpdf['LatDD'] = _ddmm_to_decimal_degrees(gpdf['Latitude'])
        # Longitude is negated unconditionally.
        # NOTE(review): assumes every fix is in the western hemisphere - confirm.
        gpdf['LonDD'] = -1 * _ddmm_to_decimal_degrees(gpdf['Longitude'])

    # The unit name is the part of the dataset id before the first underscore.
    gpdf['timeseries_id'] = dataset.split('_')[0]

    gpdf_all[dataset] = gpdf

POPS0001_GPS_0001
POPS0002_GPS_0002
POPS0004_GPS_0004
POPS0005_GPS_0005
POPS0006_GPS_0006
POPS0007_GPS_0007
POPS0008_GPS_0008
POPS0009_GPS_0009
POPS0010_GPS_0010


In [11]:
# Deployment metadata keyed by the 4-digit unit id (the dataset-id suffix):
#   [deploy latitude, deploy longitude, deploy date, <index 3>, surface-start date]
# Index 3 is not read anywhere below - presumably the deployment depth in
# metres; TODO confirm. The table is loop-invariant, so it is built once here.
pops_deploy = {
    '0001': [58.9931667, -175.00267, datetime.date(2021, 5, 13), 131, datetime.date(2021, 9, 15)],
    '0002': [55.442, -163.74017, datetime.date(2021, 5, 7), 83, datetime.date(2021, 9, 15)],
    '0004': [59.69, -176.705, datetime.date(2021, 5, 13), 139, datetime.date(2021, 9, 15)],
    '0005': [56.4471667, -168.49617, datetime.date(2021, 5, 13), 121, datetime.date(2021, 9, 15)],  # startdate looks later than records? by 9 days
    '0006': [56.7043333, -171.2195, datetime.date(2021, 5, 14), np.nan, datetime.date(2021, 9, 15)],
    '0007': [61.0698333, -177.773, datetime.date(2021, 5, 12), 141.5, datetime.date(2021, 9, 15)],
    '0008': [59.243995, -169.41832, datetime.date(2021, 5, 10), 54.9, datetime.date(2021, 9, 15)],
    '0009': [60.7788333, -174.49733, datetime.date(2021, 5, 12), 91, datetime.date(2021, 9, 15)],
    '0010': [55.2358333, -166.1298, datetime.date(2021, 5, 15), 130, datetime.date(2021, 9, 15)],
}

# Find every POPS dataset on the server, decode it, attach deployment metadata
# and store the result keyed by dataset id.
e = ERDDAP(server=server_url)
dfname = pd.read_csv(e.get_search_url(response='csv', search_for='POPS_'))

df_all = {}
for dataset in sorted(dfname['Dataset ID'].values):
    print(dataset)

    d = ERDDAP(server=server_url,
               protocol='tabledap',
               response='csv',
              )
    d.dataset_id = dataset

    pdf = d.to_pandas(
        index_col='time (UTC)',
        parse_dates=True,
        skiprows=(1,)  # units information can be dropped.
    )

    pdf.sort_index(inplace=True)
    # Keep only the first word of each column label (drops any trailing units text).
    pdf.columns = [column.split()[0] for column in pdf.columns]

    if 'GPS' not in dataset:
        # Sensor values arrive as hexadecimal strings of scaled integers.
        # NOTE(review): the words are decoded as unsigned, so e.g. 'fe80'
        # becomes 65.152; if the probes report two's-complement values these
        # are really sub-zero temperatures - confirm the sensor encoding.
        # NOTE(review): decoded pressures (~130) track the deployment depths in
        # metres, which suggests dbar rather than bar despite the column name - confirm.
        pdf['Pressure_Bar'] = pdf['Depth'].apply(lambda x: int(x, 16) / 100)
        pdf['Temp_DegC_0'] = pdf['TempProbe0'].apply(lambda x: int(x, 16) / 1000)
        pdf['Temp_DegC_1'] = pdf['TempProbe1'].apply(lambda x: int(x, 16) / 1000)
    else:
        # ddmm.mmmm -> decimal degrees: whole degrees plus minutes/60.
        lat_degrees = np.floor(pdf['Latitude'] / 100)
        lon_degrees = np.floor(pdf['Longitude'] / 100)
        pdf['LatDD'] = lat_degrees + (pdf['Latitude'] - lat_degrees * 100) / 60
        # NOTE(review): longitude is negated unconditionally (western hemisphere assumed).
        pdf['LonDD'] = -1 * (lon_degrees + (pdf['Longitude'] - lon_degrees * 100) / 60)

    unit_id = dataset.split('_')[-1]
    deploy_lat, deploy_lon, deploy_date, _, surface_date = pops_deploy[unit_id]

    if unit_id == '0006':
        # POPS6 had a failed pressure sensor, so its pressure record is blanked.
        pdf['Pressure_Bar'] = np.nan

    pdf['latitude'] = deploy_lat
    pdf['longitude'] = deploy_lon
    pdf['timeseries_id'] = dataset.split('_')[0]

    # Drop everything recorded before the deployment date (ship-deck data).
    pdf = pdf[deploy_date:].copy()

    # Label rows 'bottom' until the surface-start date, 'surface' from then on.
    pdf['measurement_type'] = 'bottom'
    pdf.loc[surface_date:, 'measurement_type'] = 'surface'

    df_all[dataset] = pdf

POPS0001_POPS_0001
POPS0002_POPS_0002
POPS0004_POPS_0004
POPS0005_POPS_0005
POPS0006_POPS_0006
POPS0007_POPS_0007
POPS0008_POPS_0008
POPS0009_POPS_0009
POPS0010_POPS_0010


In [12]:
# Stack every unit's frame into one table with a single concat.
# (DataFrame.append is deprecated and removed in pandas 2.0, and appending in a
# loop re-copies the accumulated rows on every pass.)
df_all_merged = pd.concat(list(df_all.values())) if df_all else pd.DataFrame()

df_all_merged

Unnamed: 0_level_0,Epoch_Time,Depth,TempProbe0,TempProbe1,Pressure_Bar,Temp_DegC_0,Temp_DegC_1,latitude,longitude,timeseries_id,measurement_type
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-05-13 00:10:00+00:00,609C6E58,32d6,06b2,06ba,130.14,1.714,1.722,58.993167,-175.00267,POPS0001,bottom
2021-05-13 01:10:00+00:00,609C7C68,32e5,071f,0728,130.29,1.823,1.832,58.993167,-175.00267,POPS0001,bottom
2021-05-13 02:10:00+00:00,609C8A78,32f7,0715,071e,130.47,1.813,1.822,58.993167,-175.00267,POPS0001,bottom
2021-05-13 03:10:00+00:00,609C9888,330b,0707,070f,130.67,1.799,1.807,58.993167,-175.00267,POPS0001,bottom
2021-05-13 04:10:00+00:00,609CA698,331e,0717,0721,130.86,1.815,1.825,58.993167,-175.00267,POPS0001,bottom
...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 19:10:00+00:00,61B10308,0014,0134,0146,0.20,0.308,0.326,55.235833,-166.12980,POPS0010,surface
2021-12-08 20:10:00+00:00,61B11118,0013,fe80,fe5c,0.19,65.152,65.116,55.235833,-166.12980,POPS0010,surface
2021-12-08 21:10:00+00:00,61B11F28,0015,0031,0062,0.21,0.049,0.098,55.235833,-166.12980,POPS0010,surface
2021-12-08 22:10:00+00:00,61B12D38,0011,0214,0235,0.17,0.532,0.565,55.235833,-166.12980,POPS0010,surface


## QC Data - Follow QARTOD (1,2,3,4,9)

**First use instrument specific information to filter good/bad data**

- drop hex columns
- remove known failed sensor

In [13]:
# Instrument-specific screening, applied before the generic QC tests.

# POPS0006: blank the pressure channel for the whole record (failed pressure sensor).
is_pops6 = df_all_merged['timeseries_id'] == 'POPS0006'
df_all_merged.loc[is_pops6, 'Pressure_Bar'] = np.nan

# The raw hexadecimal columns are no longer needed once decoded.
df_all_merged.drop(columns=['Epoch_Time', 'Depth', 'TempProbe0', 'TempProbe1'], inplace=True)

# Remove timezone info for ioos_qc and for datetime comparisons to eliminate bad data :(
# The index must already be in UTC before this step.
df_all_merged.index = df_all_merged.index.tz_convert(None)

# POPS0010: blank all three measured channels after the cutoff timestamp.
pops10_cutoff = datetime.datetime(2021, 10, 23, 0, 0, 0)
pops10_late = (df_all_merged['timeseries_id'] == 'POPS0010') & (df_all_merged.index > pops10_cutoff)
for channel in ('Pressure_Bar', 'Temp_DegC_0', 'Temp_DegC_1'):
    df_all_merged.loc[pops10_late, channel] = np.nan

# Turn the time index back into an ordinary 'time (UTC)' column.
df_all_merged = df_all_merged.reset_index()

### follow ioos_qc -> QARTOD for basic range checks and add an aggregate flag for now

In [14]:
from ioos_qc import qartod
from ioos_qc.streams import PandasStream
from ioos_qc.stores import PandasStore
from ioos_qc.config import Config
from ioos_qc.results import collect_results

In [15]:
def _qartod_tests(fail_span, suspect_span, rate_threshold):
    """Return the QARTOD test settings for one variable.

    fail_span / suspect_span feed the gross range test; rate_threshold feeds
    the rate-of-change test. The spike-test limits are the same for every
    variable.
    """
    # flat_line_test is deliberately left out: not valid for bottom data.
    # (Commented-out settings were tolerance 0.001, suspect_threshold 10800,
    # fail_threshold 21600.)
    return {
        'qartod': {
            "gross_range_test": {
                "fail_span": fail_span,
                "suspect_span": suspect_span,
            },
            "rate_of_change_test": {
                "threshold": rate_threshold,
            },
            "spike_test": {
                "suspect_threshold": 0.8,
                "fail_threshold": 3,
            },
            "aggregate": {},
        }
    }


# Per-variable QC configuration; each variable gets its own independent dict.
qc_config = {
    'Pressure_Bar': _qartod_tests([-0.5, 400], [-0.1, 300], 0.2),
    'Temp_DegC_0': _qartod_tests([-2, 25], [-2, 14], 0.1),
    'Temp_DegC_1': _qartod_tests([-2, 25], [-2, 14], 0.1),
}
# Wrap the plain-dict settings in an ioos_qc Config object for the stream runner.
c = Config(qc_config)

In [16]:
# Tell the stream which DataFrame columns hold the time / position / depth axes.
axis_columns = {
    'time': 'time (UTC)',
    'lat': 'latitude',
    'lon': 'longitude',
    'z': 'Pressure_Bar',
}
ps = PandasStream(df_all_merged, **axis_columns)

# Run every configured test against the stream.
results = ps.run(c)

# Keep the raw test results in a store for aggregation and export.
store = PandasStore(results)

Could not run "qartod.aggregate: aggregate() missing 1 required positional argument: 'results'
Could not run "qartod.aggregate: aggregate() missing 1 required positional argument: 'results'
Could not run "qartod.aggregate: aggregate() missing 1 required positional argument: 'results'


In [17]:
# Add the roll-up flag to the store's internal results.
store.compute_aggregate(name='rollup_qc')

# Export the flag columns only (no data, no axes) ...
results_store = store.save(write_data=False, write_axes=False)

# ... and put them side by side with the measurements - stick to the simple
# case of aggregate only for now.
results_store = pd.concat([df_all_merged, results_store], axis=1)

## keep 1,2 for each parameter and address parameters/test independently, eventually send flag to ERDDAP?
flag_columns = [name for name in results_store.keys() if 'qartod' in name]
for flag_column in flag_columns:
    # The measured variable is the part of the flag name before '_qartod'.
    variable = flag_column.split('_qartod')[0]
    flags = results_store[flag_column]
    acceptable = (flags == 1) | (flags == 2)
    results_store.loc[~acceptable, variable] = np.nan

# The flag columns have done their job; keep everything else.
is_flag_column = results_store.columns.str.contains('qartod')
results_store = results_store.loc[:, ~is_flag_column]

In [18]:
# Put 'time (UTC)' back as the index and mark it as UTC again
# (the timestamps must really be UTC for this to be valid).
df_all_merged = results_store.set_index('time (UTC)').tz_localize('utc')

**Filter and Interpolate GPS data to hourly**

**split bottom and top (and eventually profile) and merge gps with sfc timeseries data**

In [19]:
# Stack every unit's GPS frame with a single concat.
# (DataFrame.append is deprecated and removed in pandas 2.0, and appending in a
# loop re-copies the accumulated rows on every pass.)
gpdf_all_merged = pd.concat(list(gpdf_all.values())) if gpdf_all else pd.DataFrame()

# Only the decimal-degree position and the unit name are needed downstream.
gpdf_all_merged = gpdf_all_merged[['LatDD','LonDD','timeseries_id']]


In [20]:
# Split the merged record by measurement type.
# drop() returns a new frame here; dropping in place on the filtered slice is
# what triggered the "set on a copy of a slice" warning.
df_bottom = df_all_merged[df_all_merged.measurement_type == 'bottom'].drop(columns='measurement_type')
df_sfc = df_all_merged[df_all_merged.measurement_type == 'surface']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
# Build the hourly surface product: for each unit, average the GPS fixes to
# hourly bins, fill the gaps by linear interpolation, and join them to the
# hourly-averaged surface measurements on matching timestamps.
hourly_frames = []
for unit_id, unit_gps in gpdf_all_merged.groupby('timeseries_id'):
    unit_sfc = df_sfc[df_sfc.timeseries_id == unit_id]
    if unit_sfc.empty:
        # GPS fixes exist but there are no surface samples to join them to.
        continue

    # Average numeric columns only: text columns (timeseries_id,
    # measurement_type) cannot be averaged and make mean() raise on
    # current pandas instead of being silently dropped.
    gps_hourly = unit_gps.select_dtypes(include='number').resample('1H').mean().interpolate()
    gps_hourly['timeseries_id'] = unit_id
    sfc_hourly = unit_sfc.select_dtypes(include='number').resample('1H').mean()

    hourly_frames.append(pd.merge(gps_hourly, sfc_hourly, left_index=True, right_index=True))

# One concat at the end instead of growing the frame inside the loop.
gps_sfc = pd.concat(hourly_frames) if hourly_frames else pd.DataFrame()

In [22]:
# The fixed deployment position moves to deploy_*; the GPS-derived position
# takes over the plain latitude/longitude names.
position_column_names = {
    'latitude': 'deploy_latitude',
    'longitude': 'deploy_longitude',
    'LatDD': 'latitude',
    'LonDD': 'longitude',
}
gps_sfc.rename(columns=position_column_names, inplace=True)

In [23]:
df_bottom

Unnamed: 0_level_0,Pressure_Bar,Temp_DegC_0,Temp_DegC_1,latitude,longitude,timeseries_id
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-05-13 00:10:00+00:00,130.14,1.714,1.722,58.993167,-175.00267,POPS0001
2021-05-13 01:10:00+00:00,130.29,1.823,1.832,58.993167,-175.00267,POPS0001
2021-05-13 02:10:00+00:00,130.47,1.813,1.822,58.993167,-175.00267,POPS0001
2021-05-13 03:10:00+00:00,130.67,1.799,1.807,58.993167,-175.00267,POPS0001
2021-05-13 04:10:00+00:00,130.86,1.815,1.825,58.993167,-175.00267,POPS0001
...,...,...,...,...,...,...
2021-09-14 19:10:00+00:00,131.22,4.779,4.783,55.235833,-166.12980,POPS0010
2021-09-14 20:10:00+00:00,131.53,4.779,4.782,55.235833,-166.12980,POPS0010
2021-09-14 21:10:00+00:00,131.74,4.779,4.782,55.235833,-166.12980,POPS0010
2021-09-14 22:10:00+00:00,131.92,4.779,4.783,55.235833,-166.12980,POPS0010


In [24]:
# Write the combined (all-unit) surface and bottom products.
for frame, filename in (
    (gps_sfc, '2021_SpringDeployed_BS_BioPUFFS_sfc.csv'),
    (df_bottom, '2021_SpringDeployed_BS_BioPUFFS_btm.csv'),
):
    frame.to_csv(filename)

In [25]:
# Output as separate files: one surface file and one bottom file per unit.
for groups, gdata in gps_sfc.groupby('timeseries_id'):
    gdata.to_csv('2021_SpringDeployed_BS_BioPUFFS'+groups+'_sfc.csv')
for groups, gdata in df_bottom.groupby('timeseries_id'):
    # Write this unit's rows only (gdata), not the whole df_bottom table.
    gdata.to_csv('2021_SpringDeployed_BS_BioPUFFS'+groups+'_btm.csv')
