In [46]:
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from html_table_parser import parser_functions as parse
from time import sleep
import pandas as pd
import urllib.request
import os

### Get a list of all counties in WI

In [33]:
# Get the list of all counties in WI
county_url = "https://dnr.wi.gov/lakes/clmn/"
county_html = requests.get(county_url)

In [34]:
# .text returns the request content in Unicode
county_html.text[:500]

'\r\n\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n\r\n<head><meta http-equiv="content-type" content="text/html;charset=utf-8" /><link href="http://dnr.wi.gov/favicon.ico" type="image/x-icon" rel="icon" /><link href="http://dnr.wi.gov/favicon.ico" type="image/x-icon" rel="shortcut icon" /><title>\r\n\tCitizen Lake Monitoring Network\r\n</title><!-- BEGIN global_head.inc ( /includes/global_head'

In [35]:
county_soup = bs(county_html.text, 'html.parser')

In [336]:
county_soup.find_all(name='li', attrs={'class':'multiColList'})

[<li class="multiColList"><a href="Stations.aspx?location=1">
                 Adams County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=2">
                 Ashland County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=3">
                 Barron County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=4">
                 Bayfield County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=5">
                 Brown County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=6">
                 Buffalo County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=7">
                 Burnett County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=8">
                 Calumet County
             </a></li>,
 <li class="multiColList"><a href="Stations.aspx?location=9"

In [361]:
# Extract list of all counties, correctly formatted 
counties = []
for entry in county_soup.find_all(name='li', attrs={'class':'multiColList'}):
    counties.append(entry.text.replace('\r\n', '').replace('                ','').replace('            ','').\
                    replace(' County','').replace('Fond du Lac','Fond Du Lac').replace('Saint Croix','St. Croix')) 
    
counties

['Adams',
 'Ashland',
 'Barron',
 'Bayfield',
 'Brown',
 'Buffalo',
 'Burnett',
 'Calumet',
 'Chippewa',
 'Clark',
 'Columbia',
 'Crawford',
 'Dane',
 'Dodge',
 'Door',
 'Douglas',
 'Dunn',
 'Eau Claire',
 'Florence',
 'Fond Du Lac',
 'Forest',
 'Grant',
 'Green',
 'Green Lake',
 'Iowa',
 'Iron',
 'Jackson',
 'Jefferson',
 'Juneau',
 'Kenosha',
 'Kewaunee',
 'La Crosse',
 'Lafayette',
 'Langlade',
 'Lincoln',
 'Manitowoc',
 'Marathon',
 'Marinette',
 'Marquette',
 'Menominee',
 'Milwaukee',
 'Monroe',
 'Oconto',
 'Oneida',
 'Outagamie',
 'Ozaukee',
 'Pepin',
 'Pierce',
 'Polk',
 'Portage',
 'Price',
 'Racine',
 'Richland',
 'Rock',
 'Rusk',
 'St. Croix',
 'Sauk',
 'Sawyer',
 'Shawano',
 'Sheboygan',
 'Taylor',
 'Trempealeau',
 'Vernon',
 'Vilas',
 'Walworth',
 'Washburn',
 'Washington',
 'Waukesha',
 'Waupaca',
 'Waushara',
 'Winnebago',
 'Wood']

### Get all reports containing CHLA readings

Get all station numbers

In [38]:
# find station number in from table column station ID 
# All counties: https://dnr.wi.gov/lakes/clmn/Stations.aspx?location=0 
# By county: https://dnr.wi.gov/lakes/clmn/Stations.aspx?location=1 


stationIDs = []

def stationID_extract(pageNumber):
    driver = webdriver.Firefox()
    station_url = 'https://dnr.wi.gov/lakes/clmn/Stations.aspx?location=0'
    driver.get(station_url)
    
    for i in range(pageNumber):
        station = driver.page_source
        station_soup = bs(station, "html.parser")
        station_table = station_soup.find('table',  { "class" : "greysuitsyou" })
        station_twod_array = parse.make2d(station_table)
        for n in range(2,len(station_twod_array)-1):
            stationIDs.append(station_twod_array[n][1])
        #sleep(1)
        # go to next page
        driver.find_element_by_id("ctl00_ctl00_LeftPageContent_gvStationTable_ctl01_LinkButton3").click()
    return stationIDs
    driver.close()
    


In [39]:
stationID_extract(40)

['504001',
 '443052',
 '643047',
 '703060',
 '183082',
 '184002',
 '443053',
 '553068',
 '10040059',
 '643220',
 '163120',
 '643042',
 '433364',
 '493229',
 '643444',
 '10031332',
 '643401',
 '493104',
 '213142',
 '10051075',
 '433035',
 '433120',
 '013159',
 '10028945',
 '10028946',
 '013037',
 '10031430',
 '10031431',
 '683297',
 '023124',
 '093053',
 '10021451',
 '10039551',
 '10049055',
 '10022318',
 '663045',
 '493056',
 '493058',
 '493057',
 '583164',
 '043122',
 '673119',
 '10012421',
 '10020976',
 '443373',
 '043174',
 '433354',
 '033182',
 '443324',
 '073067',
 '073122',
 '663112',
 '453282',
 '643121',
 '383209',
 '353077',
 '494002',
 '563058',
 '10051069',
 '663050',
 '693104',
 '10049162',
 '663128',
 '213186',
 '443187',
 '033139',
 '433372',
 '693041',
 '433247',
 '493122',
 '443121',
 '693106',
 '593067',
 '143311',
 '033137',
 '143122',
 '033134',
 '033131',
 '143034',
 '033130',
 '033133',
 '10048426',
 '10049214',
 '033132',
 '10048427',
 '083044',
 '233117',
 '10039

Report url example: stationNo, year1=1950&year2=2017 (regardless of most recent data)
http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=013159&year1=1950&year2=2017&format=csv
http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=10021087&year1=1950&year2=2017&format=csv
http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=013178&year1=1950&year2=2017&format=csv
http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=273120&year1=1950&year2=2017&format=csv

Detail page examples:
https://dnr.wi.gov/lakes/CLMN/Station.aspx?id=013159

In [92]:
def clean_reports(number_of_stations):
    """
    This module takes reports from each station and returns cleaned reports
    
    """
    # get the reports
    for i in range(number_of_stations):
        try:
            report_url = 'http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=' + str(stationIDs[i]) + '&year1=1950&year2=2017&format=csv'
            #print(report_url)
            original_report_name = './data/wi-lakes/report_'+str(stationIDs[i])+'.csv'

            if os.path.isfile(original_report_name):
                continue
            else:
                urllib.request.urlretrieve(report_url, original_report_name)  

            # clean report 
            f = open(original_report_name,'r')
            lines = f.readlines()
            f.close()
            goodlines = []
            start = 0
            for n in range(len(lines)):
                if "Chlorophyll" in lines[n]:
                    start = n
                if start > 0:
                    if lines[n].strip().replace(",","") == "":
                        goodlines = lines[start:n]
                        break


            # get the water body ID which corresponds with satellite paths 
            wbic = lines[4].split(',')[2]

            # get rid of unnecessary columns 
            cleanlines = []
            for l in range(len(goodlines)):
                cleanlines.append(",".join(goodlines[l].split(',')[:10])+'\n')

            #print("\n".join(cleanlines))

            # save clean report 
            clean_report_name = './data/wi-lakes/report_'+str(stationIDs[i])+'_clean.csv'   
            f = open(clean_report_name,'w')      
            f.writelines(cleanlines)
            f.close()
            
        except:
            print('There is an issue with Station with ID ' + str(stationIDs[i]))

    

In [82]:
clean_reports(len(stationIDs))

In [314]:
wi_lakes = []
def reports_to_df(number_of_stations):
    for i in range(number_of_stations):
        try:
            clean_report_name = './data/wi-lakes/wi_lakes_reports_05_22_2018/report_'+str(stationIDs[i])+'_clean.csv'   
            original_report_name = './data/wi-lakes/wi_lakes_reports_05_22_2018/report_'+str(stationIDs[i])+'.csv'

            f = open(original_report_name,'r')
            lines = f.readlines()
            wbic = str(lines[4].split(',')[2])
            lake_name = lines[4].split(',')[0]
            county = lines[4].split(',')[1]

            df = pd.read_csv(clean_report_name,sep=",")
            # clean column names
            df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '_').str.replace(')', '').str.replace('/', '_').str.replace('?', '')
            df['wbic'] = wbic
            df['lake_name'] = lake_name
            df['county'] = county
            df['station_id'] = stationIDs[i]

            wi_lakes.append(df)
        except: 
            print('There is no report for Station with ID ' + str(stationIDs[i]))

   # wi_lakes_df = pd.concat(wi_lakes, ignore_index = True)
    #return wi_lakes_df
    #wi_lakes_df.to_csv('./data/wi-lakes/wi_lakes_all.csv',index=False)
        

In [315]:
reports_to_df(len(stationIDs))

There is no report for Station with ID 253124
There is no report for Station with ID 553072
There is no report for Station with ID 10049493
There is no report for Station with ID 553071
There is no report for Station with ID 043126
There is no report for Station with ID 073096
There is no report for Station with ID 343120
There is no report for Station with ID 643320
There is no report for Station with ID 143123
There is no report for Station with ID 493081
There is no report for Station with ID 443043
There is no report for Station with ID 553069
There is no report for Station with ID 10007674
There is no report for Station with ID 10047299
There is no report for Station with ID 013038
There is no report for Station with ID 643326
There is no report for Station with ID 10031658
There is no report for Station with ID 553070
There is no report for Station with ID 043087
There is no report for Station with ID 643095
There is no report for Station with ID 443453
There is no report for Sta

In [316]:
#pd.read_csv('./data/wi-lakes/wi_lakes_all.csv').dtypes

### Obtain the path and coordinates by water body IDs


Paths by county: https://dnr.wi.gov/lakes/CLMN/remotesensing/paths.aspx?county=Wood

In [362]:
counties

['Adams',
 'Ashland',
 'Barron',
 'Bayfield',
 'Brown',
 'Buffalo',
 'Burnett',
 'Calumet',
 'Chippewa',
 'Clark',
 'Columbia',
 'Crawford',
 'Dane',
 'Dodge',
 'Door',
 'Douglas',
 'Dunn',
 'Eau Claire',
 'Florence',
 'Fond Du Lac',
 'Forest',
 'Grant',
 'Green',
 'Green Lake',
 'Iowa',
 'Iron',
 'Jackson',
 'Jefferson',
 'Juneau',
 'Kenosha',
 'Kewaunee',
 'La Crosse',
 'Lafayette',
 'Langlade',
 'Lincoln',
 'Manitowoc',
 'Marathon',
 'Marinette',
 'Marquette',
 'Menominee',
 'Milwaukee',
 'Monroe',
 'Oconto',
 'Oneida',
 'Outagamie',
 'Ozaukee',
 'Pepin',
 'Pierce',
 'Polk',
 'Portage',
 'Price',
 'Racine',
 'Richland',
 'Rock',
 'Rusk',
 'St. Croix',
 'Sauk',
 'Sawyer',
 'Shawano',
 'Sheboygan',
 'Taylor',
 'Trempealeau',
 'Vernon',
 'Vilas',
 'Walworth',
 'Washburn',
 'Washington',
 'Waukesha',
 'Waupaca',
 'Waushara',
 'Winnebago',
 'Wood']

In [363]:
# For GEE, need a list of [longitude, Latitude]
lat = []
lon = []
wbic = []

def clean_coord(raw_coord):
    '''
    Turn ill-formated coordinates in str format (e.g. '44 1 48.99') to the right format
    '''
    a=raw_coord.split()
    coord = a[0]+'.'+a[1]+a[2].replace('.','')
    return coord
    
def coord_extract():
    '''
    extract coordinates 
    '''
    
    for i in range(len(counties)): 
        try: 
            wbic_url = 'https://dnr.wi.gov/lakes/CLMN/remotesensing/paths.aspx?county='+counties[i]
            wbic_html = requests.get(wbic_url)
            wbic_soup = bs(wbic_html.text, "html.parser")
            wbic_table = wbic_soup.find('table',  { "class" : "greysuitsyou" })
            wbic_twod_array = parse.make2d(wbic_table)
            #print(wbic_twod_array)
            for n in range(1,len(wbic_twod_array)):
                wbic.append(wbic_twod_array[n][1])
                lat.append(wbic_twod_array[n][2])
                lon.append(wbic_twod_array[n][3])
        except:
            print('There is an issue with ' + str(counties[i]))

In [364]:
coord_extract()

In [366]:
len(wbic)

6997

In [367]:
wbic

['176000',
 '1374300',
 '1377700',
 '1374800',
 '176600',
 '102600',
 '102800',
 '1343600',
 '1301300',
 '103400',
 '1352000',
 '103600',
 '178300',
 '104000',
 '1378100',
 '1377900',
 '178500',
 '179100',
 '106500',
 '106600',
 '178700',
 '107000',
 '1017300',
 '1377400',
 '109900',
 '110100',
 '110300',
 '111300',
 '177200',
 '1300400',
 '1300600',
 '1343100',
 '1351300',
 '1351600',
 '1376400',
 '1376600',
 '117500',
 '2754000',
 '2410400',
 '2892100',
 '2755100',
 '2403200',
 '1834500',
 '2916200',
 '2935400',
 '2916700',
 '2751600',
 '2752000',
 '2407900',
 '1838600',
 '1838700',
 '2938000',
 '2932700',
 '2430100',
 '1842600',
 '1842900',
 '2430300',
 '2428700',
 '2760600',
 '2936800',
 '2429000',
 '2935800',
 '2914800',
 '2935600',
 '1850200',
 '2406500',
 '2291800',
 '2892400',
 '2917300',
 '2936400',
 '2935500',
 '2765900',
 '2915800',
 '2285900',
 '1861100',
 '2767000',
 '2934800',
 '2767300',
 '2767500',
 '1864300',
 '1865200',
 '2934000',
 '2935300',
 '1866000',
 '2916900',


In [368]:
#print("\n".join(lon))
lon

['-89 36 44.14',
 '-89 53 7.15',
 '-89 51 19.55',
 '-89 50 7.74',
 '-89 38 30.2',
 '-89 36 23.08',
 '-89 38 47.42',
 '-89 48 16.03',
 '-89 48 53.34',
 '-89 35 59.56',
 '-89 48 54.59',
 '-89 38 10.44',
 '-89 36 40.3',
 '-89 38 47.23',
 '-89 46 31.81',
 '-89 48 21.11',
 '-89 37 34.69',
 '-89 38 43.05',
 '-89 38 12.02',
 '-89 39 36.08',
 '-89 38 29.08',
 '-89 36 17.38',
 '',
 '-89 53 26.5',
 '-89 37 35.59',
 '-89 37 35.81',
 '-89 39 58.35',
 '-89 37 31.63',
 '-89 39 22.9',
 '-89 47 6.28',
 '-89 46 30.22',
 '-89 50 59.37',
 '-89 50 6.42',
 '-89 46 31.08',
 '-90 1 4.19',
 '-89 59 35.75',
 '-89 37 16.37',
 '-90 41 0.87',
 '-90 27 35.91',
 '-90 38 48.3',
 '-90 47 7.73',
 '-90 48 50.31',
 '-90 55 17.86',
 '-90 53 55.82',
 '-90 37 31.17',
 '-90 48 21.69',
 '-90 40 35.75',
 '-90 35 14.32',
 '-90 34 45.32',
 '-90 26 39.84',
 '-90 36 20.33',
 '-90 33 34.18',
 '-90 42 10.71',
 '-90 52 57.96',
 '-90 49 5.17',
 '-90 25 45.76',
 '-90 54 46.95',
 '-90 51 31.8',
 '-90 36 20.14',
 '-90 34 30.58',
 '-90 5

In [369]:
lat

['43 39 5.38',
 '44 1 48.99',
 '44 12 41.27',
 '44 3 19.91',
 '43 40 11.14',
 '43 46 19.95',
 '43 46 46.7',
 '43 50 17.34',
 '43 43 21.03',
 '43 51 48.13',
 '43 58 33.01',
 '43 47 51.63',
 '43 47 25.24',
 '43 44 36.44',
 '44 12 16.3',
 '44 12 15.53',
 '43 47 25.41',
 '43 50 30.04',
 '43 46 33.56',
 '43 53 6.81',
 '43 47 12.68',
 '43 53 6.53',
 '',
 '44 12 2.76',
 '43 46 20.3',
 '43 45 15.52',
 '43 44 36.51',
 '43 49 24.43',
 '43 44 23.38',
 '43 42 28.98',
 '43 42 29.33',
 '43 52 28.14',
 '43 59 25.91',
 '44 1 23.93',
 '44 3 6.58',
 '44 3 20.34',
 '43 47 38.33',
 '46 35 45.71',
 '46 10 27.52',
 '46 38 9',
 '46 21 42.97',
 '45 59 36.65',
 '46 9 23.24',
 '46 18 12.35',
 '46 16 57.88',
 '46 19 45.79',
 '46 48 55.04',
 '46 51 8.94',
 '46 3 12.01',
 '46 10 1.21',
 '46 3 14.4',
 '46 16 6.56',
 '46 16 20.28',
 '46 9 49.26',
 '45 59 10.33',
 '46 5 14.84',
 '46 10 43.23',
 '46 9 9.52',
 '46 16 31.84',
 '46 13 18.22',
 '46 11 8.53',
 '46 16 57.94',
 '46 18 55.54',
 '46 17 10.15',
 '46 5 58.14',
 

In [370]:
len(lat)

6997

In [371]:
coord_all  = []

def coord_list():
    for c in range(len(lat)):
        try:
            longitude = clean_coord(lon[c])
        except:
            longtitude = 'NaN'
        try:
            latitude = clean_coord(lat[c])
        except:
            latitude = 'NaN'

        coordinates = longitude+','+latitude

        coordinates_list = []
        coordinates_list.append(float(coordinates.split(',')[0]))
        coordinates_list.append(float(coordinates.split(',')[1]))

        coord_all.append(coordinates_list)
    return coord_all

In [372]:
coord_list()

[[-89.364414, 43.39538],
 [-89.53715, 44.14899],
 [-89.511955, 44.124127],
 [-89.50774, 44.31991],
 [-89.38302, 43.401114],
 [-89.362308, 43.461995],
 [-89.384742, 43.46467],
 [-89.481603, 43.501734],
 [-89.485334, 43.432103],
 [-89.355956, 43.514813],
 [-89.485459, 43.583301],
 [-89.381044, 43.475163],
 [-89.36403, 43.472524],
 [-89.384723, 43.443644],
 [-89.463181, 44.12163],
 [-89.482111, 44.121553],
 [-89.373469, 43.472541],
 [-89.384305, 43.503004],
 [-89.381202, 43.463356],
 [-89.393608, 43.53681],
 [-89.382908, 43.471268],
 [-89.361738, 43.53653],
 [-89.361738, nan],
 [-89.53265, 44.12276],
 [-89.373559, 43.46203],
 [-89.373581, 43.451552],
 [-89.395835, 43.443651],
 [-89.373163, 43.492443],
 [-89.39229, 43.442338],
 [-89.47628, 43.422898],
 [-89.463022, 43.422933],
 [-89.505937, 43.522814],
 [-89.50642, 43.592591],
 [-89.463108, 44.12393],
 [-90.1419, 44.3658],
 [-89.593575, 44.32034],
 [-89.371637, 43.473833],
 [-90.41087, 46.354571],
 [-90.273591, 46.102752],
 [-90.38483, 46.

In [373]:
len(coord_all)

6997

In [374]:
coord_dict = dict(zip(wbic, coord_all))
coord_dict

{'176000': [-89.364414, 43.39538],
 '1374300': [-89.53715, 44.14899],
 '1377700': [-89.511955, 44.124127],
 '1374800': [-89.50774, 44.31991],
 '176600': [-89.38302, 43.401114],
 '102600': [-89.362308, 43.461995],
 '102800': [-89.384742, 43.46467],
 '1343600': [-89.481603, 43.501734],
 '1301300': [-89.485334, 43.432103],
 '103400': [-89.355956, 43.514813],
 '1352000': [-89.485459, 43.583301],
 '103600': [-89.381044, 43.475163],
 '178300': [-89.36403, 43.472524],
 '104000': [-89.384723, 43.443644],
 '1378100': [-89.463181, 44.12163],
 '1377900': [-89.482111, 44.121553],
 '178500': [-89.373469, 43.472541],
 '179100': [-89.384305, 43.503004],
 '106500': [-89.381202, 43.463356],
 '106600': [-89.393608, 43.53681],
 '178700': [-89.382908, 43.471268],
 '107000': [-89.361738, 43.53653],
 '1017300': [-89.361738, nan],
 '1377400': [-89.53265, 44.12276],
 '109900': [-89.373559, 43.46203],
 '110100': [-89.373581, 43.451552],
 '110300': [-89.395835, 43.443651],
 '111300': [-89.373163, 43.492443],
 '

In [375]:
wi_lakes_df = pd.concat(wi_lakes, ignore_index = True)

In [376]:
wi_lakes_df

Unnamed: 0,group_seq_no,start_date,secchi__feet,secchi_hit_bottom,secchi__meters,chlorophyll_ug_l,total_phosphorus_ug_l,secchi_tsi,total_phosphorus_tsi,chlorophyll_tsi,wbic,lake_name,county,station_id
0,1,08/07/1979,,,,3.04,11.0,,47.0,43.0,267800,Adams Lake,Portage,504001
1,81853857,06/19/2013,25.00,NO,7.60,,,31.0,,,267800,Adams Lake,Portage,504001
2,1,07/17/2017,,,,3.38,26.3,,53.0,44.0,267800,Adams Lake,Portage,504001
3,18138029,07/17/2017,6.00,NO,1.80,,,51.0,,,267800,Adams Lake,Portage,504001
4,18138029,08/07/2017,7.75,NO,2.40,,,48.0,,,267800,Adams Lake,Portage,504001
5,1,08/08/2017,,,,2.99,22.8,,52.0,43.0,267800,Adams Lake,Portage,504001
6,18138029,09/13/2017,7.75,NO,2.40,5.88,26.2,48.0,53.0,48.0,267800,Adams Lake,Portage,504001
7,1,08/16/1979,,,,,,,,,967400,Aldridge Lake,Oneida,443052
8,7000068,08/29/2001,,,,5.80,,,,48.0,967400,Aldridge Lake,Oneida,443052
9,8188599,08/29/2001,3.00,NO,0.90,,,61.0,,,967400,Aldridge Lake,Oneida,443052


In [377]:
wi_lakes_df['coordinates'] = wi_lakes_df['wbic'].map(coord_dict)

In [379]:
wi_lakes_df.coordinates.isnull().sum()

1034

In [381]:
# some in-situ data have no corresponding water body id/coordinates 
wi_lakes_df[wi_lakes_df.coordinates.isnull()]

Unnamed: 0,group_seq_no,start_date,secchi__feet,secchi_hit_bottom,secchi__meters,chlorophyll_ug_l,total_phosphorus_ug_l,secchi_tsi,total_phosphorus_tsi,chlorophyll_tsi,wbic,lake_name,county,station_id,coordinates
6367,1,07/10/1973,,,,,70.0,,61.0,,2881200,Bark Bay Slough,Bayfield,043122,
6368,1,01/30/1974,,,,,20.0,,51.0,,2881200,Bark Bay Slough,Bayfield,043122,
6369,1,05/14/1974,,,,,10.0,,46.0,,2881200,Bark Bay Slough,Bayfield,043122,
6370,1,08/13/1974,,,,,30.0,,54.0,,2881200,Bark Bay Slough,Bayfield,043122,
6371,1,10/24/1974,,,,,110.0,,65.0,,2881200,Bark Bay Slough,Bayfield,043122,
6372,7000102,09/16/2004,5.90,NO,1.8,3.77,,52.0,,45.0,2881200,Bark Bay Slough,Bayfield,043122,
6373,130962896,08/18/2016,1.50,N,0.5,8.23,32.8,71.0,55.0,51.0,2881200,Bark Bay Slough,Bayfield,043122,
8459,8190506,07/21/1990,5.00,NO,1.5,,,54.0,,,5551281,Bass Lake,Waupaca,693104,
8460,8190506,08/26/1990,8.25,NO,2.5,,,47.0,,,5551281,Bass Lake,Waupaca,693104,
8461,8190506,09/22/1990,8.50,NO,2.6,,,46.0,,,5551281,Bass Lake,Waupaca,693104,


In [None]:
coordinates = []
wbic_valid = []
wbic_failed = []

# GEE takes Lon, Lat

def lake_info_extract():
    '''
    extract information about lakes
    '''
    
    for i in range(len(wbic)): 
        try:
        
        
            wbic_url = 'https://dnr.wi.gov/lakes/lakepages/LakeDetail.aspx?wbic='+wbic[i]+'&page=facts'
            wbic_html = requests.get(wbic_url)
            wbic_soup = bs(wbic_html.text, "html.parser")
            wbic_table = wbic_soup.find('table',  { "class" : "tableLeft" })
            wbic_twod_array = parse.make2d(wbic_table)


            if 'Latitude, Longitude' in wbic_twod_array[9]:
                #print (str(wbic[i])+" has coordinates in line 9")
                #pp.pprint(wbic_twod_array)
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[9][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[1]))
                coord.append(float(coord_string[0]))
                coordinates.append(coord)
                wbic_valid.append(wbic[i])

            if 'Latitude, Longitude' in wbic_twod_array[10]: 
                #print (str(wbic[i])+" has coordinates in line 10")
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[10][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[1]))
                coord.append(float(coord_string[0]))
                coordinates.append(coord)
                wbic_valid.append(wbic[i])

            if 'Latitude, Longitude' in wbic_twod_array[11]:
                #print (str(wbic[i])+" has coordinates in line 11")
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[11][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[1]))
                coord.append(float(coord_string[0]))
                coordinates.append(coord)
                wbic_valid.append(wbic_twod_array[2][1])
        except:
            print('There is an issue with water body ' + str(wbic[i]))
            wbic_failed.append(wbic[i])


### extras

In [None]:
'''
            ## if file exists, delete it ##
            if os.path.isfile(original_report_name):
                os.remove(original_report_name)
            else:    ## Show an error ##
                print("Error: %s file not found" % original_report_name)

            ## if file exists, delete it ##
            if os.path.isfile(clean_report_name):
                os.remove(clean_report_name)
            else:    ## Show an error ##
                print("Error: %s file not found" % clean_report_name)
            '''

In [None]:
# get rid of messy content 
cleanlines=[]
for l in range(len(goodlines)):
    cleanlines.append(goodlines[l].replace('1-Beautiful, could not be nicer', ' ')

In [None]:
from html_table_parser import parser_functions as parse

station_url = 'https://dnr.wi.gov/lakes/clmn/Stations.aspx?location=0'
station_html = requests.get(station_url)

station_soup = bs(station_html.text, "html.parser")
station_table = station_soup.find('table',  { "class" : "greysuitsyou" })
station_twod_array = parse.make2d(station_table)

# print 2D array
print(len(station_twod_array))

stations=[]
for i in range(2,len(station_twod_array)-1):
    stations.append(station_twod_array[i][1])

print(stations)

In [None]:
# get station ID
stationIDs = []
def access_chla_reports():
    for i in range(0,len(counties)):
        station_url = 'https://dnr.wi.gov/lakes/clmn/Stations.aspx?location='+str(i+1)
        #print(station_url)
        station_html = requests.get(station_url)
        station_soup = bs(station_html.text, "html.parser")
        station_table = station_soup.find('table',  { "class" : "greysuitsyou" })
        if station_table == None:
            print('location '+str(i+1) + ' ' + counties[i] + ' has no lakes.')
        else:
            station_twod_array = parse.make2d(station_table)
            for n in range(2,len(station_twod_array)-1):
                stationIDs.append(station_twod_array[n][1])
            #print('location '+str(i+1) + ' ' + counties[i] + ' has ' + str(len(station_twod_array)) + ' stations.')

In [None]:
# Get the reports

report_url = 'http://dnrx.wisconsin.gov/swims/public/reporting.do?type=58&action=post&stationNo=' + \
             str(stationIDs[1]) + '&year1=1950&year2=2017&format=csv'
print(report_url)


original_report_name = './data/wi-lakes/report_'+str(stationIDs[1])+'.csv'
urllib.request.urlretrieve(report_url, original_report_name)  

In [None]:
# Clean report file

f = open(original_report_name,'r')
lines = f.readlines()
f.close()
goodlines = []
start = 0
for i in range(len(lines)):
    if "Chlorophyll" in lines[i]:
        start = i
    if start > 0:
        if lines[i].strip().replace(",","") == "":
            goodlines = lines[start:i]
            break
            
            
cleanlines=[]
for i in range(len(goodlines)):
    cleanlines.append(goodlines[i].replace('1-Beautiful, could not be nicer', '1-beautiful'))
print("\n".join(cleanlines))

clean_report_name = './data/wi-lakes/report_'+str(stationIDs[1])+'_clean.csv'   

f = open(clean_report_name,'w')      
f.writelines(cleanlines)
f.close()

In [None]:
df = pd.read_csv(clean_report_name,sep=",")

# clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '_').str.replace(')', '')
df['wbic'] = wbic
df['station_id'] = stationIDs[1]
df.head()

In [None]:
coordinates = []
area_acre = []
depth_max_ft = []
depth_mean_ft = []
trophic_status = []
water_body_type = []
hydrologic_type = []
wbic_valid = []

def lake_info_extract():
    '''
    extract information about lakes
    '''
    
    for i in range(0,10): 
        
        try:
                
            wbic_url = 'https://dnr.wi.gov/lakes/lakepages/LakeDetail.aspx?wbic='+wbic[i]+'&page=facts'
            wbic_html = requests.get(wbic_url)
            wbic_soup = bs(wbic_html.text, "html.parser")
            wbic_table = wbic_soup.find('table',  { "class" : "tableLeft" })
            wbic_twod_array = parse.make2d(wbic_table)


            if 'Latitude, Longitude' in wbic_twod_array[9]:
                print (str(wbic[i])+" has coordinates in line 9")
                #pp.pprint(wbic_twod_array)
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[9][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[0]))
                coord.append(float(coord_string[1]))

                coordinates.append(coord)
                wbic_valid.append(wbic_twod_array[2][1])
                area_acre.append(wbic_twod_array[3][1].lower().replace('\r\n                    acres',''))
                depth_max_ft.append(float(wbic_twod_array[4][1].lower().replace(' feet','')))
                depth_mean_ft.append('NaN')
                trophic_status.append(wbic_twod_array[18][1].lower())
                water_body_type.append(wbic_twod_array[5][1].lower())
                hydrologic_type.append(wbic_twod_array[6][1].lower())

            if 'Latitude, Longitude' in wbic_twod_array[10]: 
                print (str(wbic[i])+" has coordinates in line 10")
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[10][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[0]))
                coord.append(float(coord_string[1]))
                #pp.pprint(wbic_twod_array)
                
                coordinates.append(coord)
                wbic_valid.append(wbic_twod_array[2][1])
                area_acre.append(wbic_twod_array[3][1].replace('\r\n                    ACRES',''))
                depth_max_ft.append(float(wbic_twod_array[4][1].lower().replace(' feet','')))
                depth_mean_ft.append('NaN')
                trophic_status.append(wbic_twod_array[20][1].lower())
                water_body_type.append(wbic_twod_array[6][1].lower())
                hydrologic_type.append(wbic_twod_array[7][1].lower())

            if 'Latitude, Longitude' in wbic_twod_array[11]:
                print (str(wbic[i])+" has coordinates in line 11")
                # turn coordinates into list of floats
                coord = []
                coord_string = wbic_twod_array[11][1].replace('\r\n                    ','').split(',')
                coord.append(float(coord_string[0]))
                coord.append(float(coord_string[1]))
                #pp.pprint(wbic_twod_array)
                
                coordinates.append(coord)
                wbic_valid.append(wbic_twod_array[2][1])
                area_acre.append(wbic_twod_array[3][1].replace('\r\n                    ACRES',''))
                depth_max_ft.append(float(wbic_twod_array[4][1].lower().replace(' feet','')))
                depth_mean_ft.append(float(wbic_twod_array[5][1].lower().replace(' feet','')))
                trophic_status.append(wbic_twod_array[22][1].lower())
                water_body_type.append(wbic_twod_array[7][1].lower())
                hydrologic_type.append(wbic_twod_array[8][1].lower())
                
        except:
            print('There is an issue with water body ' + str(wbic[i]))
            