## TODO:
 * Add (optional) parameters for
   * Properties (Characteristics)
   * Organization
   * Activity ID
 * Get Units and other measurement details
 * Get Organization, Monitoring Location, and Activity information
 * Make into a full Connector with Input, Filter, and Output plugins
   * A filter to look up (validate) Characteristics in the Domain Values table at WQX
   * A date filter for the date range
   * Create YAML template

In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
%%writefile wqpinput/GeoEDF/connector/input/WQPInput.py
# %load wqpinput/GeoEDF/connector/input/WQPInput.py
#!/usr/bin/env python

from geoedfframework.utils.GeoEDFError import GeoEDFError
from geoedfframework.GeoEDFPlugin import GeoEDFPlugin

import requests
import os

""" Module for implementing the WQP input connector plugin. WQP (Water Quality Portal)
    is a public web service (REST API) operated by the US EPA and USGS.
    This module will implement the get() method required for all input plugins.
"""

class WQPInput(GeoEDFPlugin):
    """Input connector plugin that downloads results from the Water Quality Portal.

    One call to get() queries the WQP ``Result/search`` endpoint for a single
    monitoring site and streams the CSV response to
    ``<target_path>/<site_id>.csv``.

    Keyword parameters:
        site_id:    (required) monitoring location identifier, e.g.
                    "USGS-03206000".
        start_date: (optional) lower bound on the activity start date, sent
                    as ``StartDateLo``; formatted MM-DD-YYYY.
        end_date:   (optional) upper bound on the activity start date, sent
                    as ``StartDateHi``; formatted MM-DD-YYYY.

    A date bound that is not provided is left out of the query, so the
    request is unbounded on that side.
    """

    base_url = "https://www.waterqualitydata.us/data"
    # default output directory; (re)set by the connector input instantiation
    target_path = "data"

    __optional_params = ['start_date','end_date']
    __required_params = ['site_id']

    # we use just kwargs since we need to be able to process the list of attributes
    # and their values to create the dependency graph in the GeoEDFInput super class
    def __init__(self, **kwargs):
        """Store the plugin parameters as instance attributes.

        Raises:
            GeoEDFError: if a required parameter is missing from kwargs.
        """

        # list to hold all the parameter names; will be accessed in super to
        # construct dependency graph
        self.provided_params = self.__required_params + self.__optional_params

        # check that all required params have been provided
        for param in self.__required_params:
            if param not in kwargs:
                raise GeoEDFError('Required parameter %s for WQPInput not provided' % param)

        # set all required parameters
        for key in self.__required_params:
            setattr(self, key, kwargs.get(key))

        # set optional parameters; anything not provided defaults to None
        for key in self.__optional_params:
            setattr(self, key, kwargs.get(key, None))

        # call super class init
        super().__init__()

    # each Input plugin needs to implement this method
    # if error, raise exception; if not, return True
    def get(self):
        """Download the site's results as CSV into the target directory.

        Returns:
            True once the file has been written completely.

        Raises:
            GeoEDFError: if site_id has no value at the time of the call.
            requests.exceptions.RequestException: on connection problems or
                when the service answers with an HTTP error status.
            OSError: if the output file cannot be written.
        """
        # an empty site id would turn into an unfiltered (huge) query, so
        # refuse to run rather than silently dropping the filter
        if not self.site_id:
            raise GeoEDFError('Required parameter site_id for WQPInput has no value')

        # query parameters for the REST API call; requests URL-encodes them.
        # Date bounds are only sent when given: a hard-coded fallback date
        # would silently exclude every newer result.
        query = {'siteid': self.site_id}
        if self.start_date:
            query['StartDateLo'] = self.start_date
        if self.end_date:
            query['StartDateHi'] = self.end_date
        query['mimeType'] = 'csv'

        # target_path is (re)set by the connector input instantiation
        out_path = os.path.join(self.target_path, '%s.csv' % self.site_id)

        # stream the body so large result sets are never held in memory;
        # the context manager releases the connection even on failure
        with requests.get(url=self.base_url + "/Result/search",
                          params=query, stream=True) as response:
            # fail before creating the file, otherwise an HTTP error page
            # would be saved as if it were CSV data
            response.raise_for_status()
            with open(out_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024*1024):
                    out_file.write(chunk)

        return True

Overwriting wqpinput/GeoEDF/connector/input/WQPInput.py


In [13]:
from datetimefilter.GeoEDF.connector.filter.DateTimeFilter import DateTimeFilter

In [5]:
# Build the lower date bound for the WQP query from "1/1/1970", using the
# "%m-%d-%Y" pattern (the same MM-DD-YYYY form as WQPInput's default end date).
# NOTE(review): assumes filter() populates .values with the formatted
# date string(s) — confirm against DateTimeFilter.
date_time_1 = DateTimeFilter(pattern="%m-%d-%Y",start="1/1/1970")
date_time_1.filter()
# only a start was given, so take the first (presumably only) value
start_date = date_time_1.values[0]

In [6]:
# Build the upper date bound for the WQP query from "8/7/2020", using the
# same "%m-%d-%Y" pattern as the lower bound.
# NOTE(review): assumes filter() populates .values with the formatted
# date string(s) — confirm against DateTimeFilter.
date_time_2 = DateTimeFilter(pattern="%m-%d-%Y",start="8/7/2020")
date_time_2.filter()
# only a start was given, so take the first (presumably only) value
end_date = date_time_2.values[0]

In [17]:
from wqpinput.GeoEDF.connector.input.WQPInput import WQPInput

In [8]:
# Fetch the results for site USGS-03206000 within the date range computed
# in the cells above.
wqp = WQPInput(site_id="USGS-03206000",start_date=start_date,end_date=end_date)
# write into the current directory; the next cell reads ./USGS-03206000.csv
# NOTE(review): set_output_path comes from the GeoEDFPlugin base class and
# presumably overrides target_path — confirm.
wqp.set_output_path('.')
wqp.get()

In [9]:
import pandas as pd
# load the CSV produced by the WQPInput cell above into a DataFrame
results = pd.read_csv ("./USGS-03206000.csv")

In [10]:
# list every column present in the downloaded CSV
results.columns

Index(['OrganizationIdentifier', 'OrganizationFormalName',
       'ActivityIdentifier', 'ActivityTypeCode', 'ActivityMediaName',
       'ActivityMediaSubdivisionName', 'ActivityStartDate',
       'ActivityStartTime/Time', 'ActivityStartTime/TimeZoneCode',
       'ActivityEndDate', 'ActivityEndTime/Time',
       'ActivityEndTime/TimeZoneCode',
       'ActivityDepthHeightMeasure/MeasureValue',
       'ActivityDepthHeightMeasure/MeasureUnitCode',
       'ActivityDepthAltitudeReferencePointText',
       'ActivityTopDepthHeightMeasure/MeasureValue',
       'ActivityTopDepthHeightMeasure/MeasureUnitCode',
       'ActivityBottomDepthHeightMeasure/MeasureValue',
       'ActivityBottomDepthHeightMeasure/MeasureUnitCode', 'ProjectIdentifier',
       'ActivityConductingOrganizationText', 'MonitoringLocationIdentifier',
       'ActivityCommentText', 'SampleAquifer', 'HydrologicCondition',
       'HydrologicEvent', 'SampleCollectionMethod/MethodIdentifier',
       'SampleCollectionMethod/MethodIden

In [11]:
# show only the characteristic name, its measured value, and the activity start date
results[['CharacteristicName','ResultMeasureValue','ActivityStartDate']]

Unnamed: 0,CharacteristicName,ResultMeasureValue,ActivityStartDate
0,"Temperature, water",12.00,1972-11-07
1,"Temperature, air, deg C",13.00,1972-11-07
2,"Stream flow, instantaneous",71300.00,1972-11-07
3,"Stream flow, instantaneous",2020.00,1972-11-07
4,"Temperature, water",3.00,1971-01-25
...,...,...,...
57,"Hardness, Ca, Mg",40.00,1971-06-03
58,"Hardness, non-carbonate",1.00,1971-06-03
59,Calcium,8.00,1971-06-03
60,Chloride,15.00,1971-06-03
