### USGS water data scraping & analysis

In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd

### Method 1: Using wget to acquire data

In [2]:
def unpack_dataset():
    """Download the USGS instantaneous-values data for Hawaii into ``data/``.

    Runs three shell commands through IPython's ``!`` syntax: ``wget`` saves
    the WaterML 2.0 response as ``Hawaii.xml`` in the working directory,
    ``mkdir -p`` creates ``data`` if it is missing, and ``mv`` relocates the
    download. Because of the ``!`` lines this only runs inside
    IPython/Jupyter, and it needs ``wget`` to be available in the shell.
    """
    # Query: stateCd=hi, parameter codes 00060 and 00065, sites of every status.
    ! wget "https://waterservices.usgs.gov/nwis/iv/?format=waterml,2.0&stateCd=hi&parameterCd=00060,00065&siteStatus=all" --output-document=Hawaii.xml
    ! mkdir -p data
    # NOTE: the glob moves every .xml file in the working directory, not only Hawaii.xml.
    ! mv *.xml data

In [3]:
# Run the wget-based download; the result ends up as data/Hawaii.xml.
unpack_dataset()

--2020-06-17 08:50:43--  https://waterservices.usgs.gov/nwis/iv/?format=waterml,2.0&stateCd=hi&parameterCd=00060,00065&siteStatus=all
Resolving waterservices.usgs.gov (waterservices.usgs.gov)... 137.227.241.74, 137.227.232.147, 137.227.252.8
Connecting to waterservices.usgs.gov (waterservices.usgs.gov)|137.227.241.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/xml]
Saving to: ‘Hawaii.xml’

Hawaii.xml              [    <=>             ] 790.49K  1.17MB/s    in 0.7s    

2020-06-17 08:50:44 (1.17 MB/s) - ‘Hawaii.xml’ saved [809462]



### Method 2: Using `requests` to download data

In [4]:
import os

import requests

# Same query as Method 1: Hawaii, parameter codes 00060 and 00065, all site statuses.
URL = "https://waterservices.usgs.gov/nwis/iv/?format=waterml,2.0&stateCd=hi&parameterCd=00060,00065&siteStatus=all"
response = requests.get(URL)
# Stop on a 4xx/5xx answer instead of saving the error body as if it were data.
response.raise_for_status()

# Method 2 should work on its own, even if Method 1 (which creates data/) was skipped.
os.makedirs('data', exist_ok=True)
with open('data/hawaii2.xml', 'wb') as file:
    file.write(response.content)

### Write it as a function (e.g. `get_state_data()`) that takes a state abbreviation

In [5]:
def get_state_data(state):
    """Download USGS instantaneous-values data for one state to ``data/<state>.xml``.

    The request asks for WaterML 2.0 output, parameter codes 00060 and 00065,
    and sites of every status. A confirmation line is printed once the file
    has been written.

    Parameters
    ----------
    state : str
        State abbreviation in lower case, e.g. ``'ca'`` for California.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status. No file is written
        in that case.
    """
    import os  # local import: this notebook has no top-level ``import os``

    URL = "https://waterservices.usgs.gov/nwis/iv/?format=waterml,2.0&stateCd=%s&parameterCd=00060,00065&siteStatus=all" % state
    response = requests.get(URL)
    # Fail loudly rather than saving an HTTP error page as if it were data.
    response.raise_for_status()

    # Do not rely on an earlier cell having created the output directory.
    os.makedirs('data', exist_ok=True)
    with open('data/%s.xml' % state, 'wb') as file:
        file.write(response.content)
    print('%s data donwloaded' % state)

In [6]:
# Download the Alabama data; it is saved as data/al.xml.
get_state_data('al')

al data donwloaded


### Convert xml file to csv

In [7]:
from xml.etree import ElementTree
import csv

In [8]:
# Parse the Hawaii XML file (data/Hawaii.xml, the file produced by Method 1)
# into an ElementTree.
tree = ElementTree.parse('data/Hawaii.xml')

In [9]:
# Create the output CSV file and a writer bound to it.
# newline='' lets the csv module control line endings itself.
# NOTE(review): the handle is opened without a `with` block and no close() is
# visible in this section — make sure `hawaii.close()` runs once writing is done.
hawaii = open('data/Hawaii.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(hawaii)

### General functions to extract data

This function downloads USGS water data for a given state, optionally restricted to a time period, a site type, and a site status.

In [10]:
def _query_part(key, value):
    """Return ``'&key=value'``, or an empty string when the filter is not set."""
    return '&{}={}'.format(key, value) if value else ''


def get_data(state, start_date=None, end_date=None, site_type=None, status='all'):
    """Download filtered USGS instantaneous-values data to ``data/<state>.xml``.

    The request always asks for WaterML 2.0 output and parameter codes 00060
    and 00065. Optional filters that are ``None`` (or otherwise falsy) are
    left out of the query. A confirmation line is printed once the file has
    been written.

    Parameters
    ----------
    state : str
        State abbreviation in lower case, e.g. ``'ca'`` for California.
    start_date, end_date : str, optional
        Period limits in ``yyyy-mm-dd`` format (sent as ``startDT`` / ``endDT``).
    site_type : str, optional
        Site type code, e.g. ``'ST-TS'`` for a tidal stream (sent as ``siteType``).
    status : str, optional
        One of ``'all'``, ``'active'`` or ``'inactive'`` (sent as ``siteStatus``).

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status. No file is written
        in that case.
    """
    import os  # local import: this notebook has no top-level ``import os``

    # The parameter order matches the URL this function has always produced.
    URL = ''.join([
        'https://waterservices.usgs.gov/nwis/iv/?format=waterml,2.0',
        '&stateCd={}'.format(state),
        _query_part('startDT', start_date),
        _query_part('endDT', end_date),
        '&parameterCd=00060,00065',
        _query_part('siteType', site_type),
        _query_part('siteStatus', status),
    ])

    response = requests.get(URL)
    # Fail loudly rather than saving an HTTP error page as if it were data.
    response.raise_for_status()

    # Do not rely on an earlier cell having created the output directory.
    os.makedirs('data', exist_ok=True)
    with open('data/%s.xml' % state, 'wb') as file:
        file.write(response.content)
    print('%s data donwloaded' % state)

In [11]:
# Example: active tidal-stream (ST-TS) sites in California, 2018-06-23 to 2018-06-28.
get_data('ca', '2018-06-23', '2018-06-28','ST-TS', 'active')

ca data donwloaded


### Wrap into a Python package