In [1]:
import pandas as pd
import numpy as np
import os
import requests
import datetime as dt
from config import noaa_token as token

In [2]:
#variables
# Root URL of the NOAA Climate Data Online (CDO) v2 web service and the
# headers sent with every request (the token is read from the local config).
base = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/'
header = {'token':token,
          'Content-Type':'application/json'}

# State name -> two-letter postal abbreviation.
# Indiana is 'IN'; 'IA' is the abbreviation for Iowa.
state_iso = {'Illinois':'IL',
            'Indiana':'IN',
            'Michigan':'MI',
            'Minnesota':'MN',
            'New York':'NY',
            'Ohio':'OH',
            'Pennsylvania':'PA',
            'Wisconsin':'WI',}

# Lake name -> abbreviations of the states whose stations are assigned to it.
# The codes must match the values of state_iso.
# NOTE(review): Erie lists Indiana but not Michigan - confirm this is the
# intended grouping.
lake_st = {'Superior':['MI','MN','WI'],
          'Michigan':['IN','IL','MI','WI'],
          'Huron':['MI'],
          'Erie':['IN','NY','OH','PA'],
          'Ontario':['NY']}

In [3]:
#get count
def get_c(json):
    """Return the ``count`` value stored under ``metadata -> resultset`` of a decoded response."""
    resultset = json['metadata']['resultset']
    return resultset['count']

def gather(endpt,load):
    """Download every page of results for an endpoint into one DataFrame.

    Parameters
    ----------
    endpt : str
        Endpoint path appended to ``base`` (for example ``'stations'``).
    load : dict
        Query parameters for the request. The dict is copied, so the caller's
        object is never modified. When it has no ``'limit'`` key, 1000 records
        per page are requested.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the combined ``results`` lists; empty when the
        response contains no ``results``.

    Raises
    ------
    requests.HTTPError
        If any page is answered with a 4xx/5xx status.
    ValueError
        If ``limit`` is not a positive integer.
    """
    # Work on a copy: the paging loop adds an 'offset' key.
    params = dict(load)
    params.setdefault('limit',1000)
    # Step through the result set by the page size that is actually requested.
    page_size = int(params['limit'])
    if page_size < 1:
        raise ValueError('limit must be a positive integer')

    def fetch_page(query):
        # Fail loudly on HTTP errors instead of on a missing key further down.
        response = requests.get(base+endpt,headers=header,params=query)
        response.raise_for_status()
        return response.json()

    first_page = fetch_page(params)
    records = list(first_page.get('results',[]))
    # A response without metadata is treated as complete after its first page.
    count = get_c(first_page) if 'metadata' in first_page else len(records)

    # NOTE(review): confirm whether the service counts offsets from 0 or 1;
    # if it is 1-based, consecutive pages requested here overlap by one record.
    offset = page_size
    while offset < count:
        params['offset'] = offset
        records += fetch_page(params).get('results',[])
        offset += page_size
    return pd.DataFrame(records)

In [4]:
#Return only rows in daterange
def time_range(df, start='1972-12-19', end='2016-05-05'):
    """Keep the rows whose ``mindate``..``maxdate`` span covers ``start``..``end``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``mindate`` and ``maxdate`` columns.
    start : optional
        A row is kept only if its ``mindate`` is on or before this value.
        Defaults to the previously hard-coded '1972-12-19'.
    end : optional
        A row is kept only if its ``maxdate`` is on or after this value.
        Defaults to the previously hard-coded '2016-05-05'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy both conditions.
    """
    # The bounds are compared as given, so they must be comparable with the
    # values stored in the two columns.
    starts_early_enough = df['mindate'] <= start
    ends_late_enough = df['maxdate'] >= end
    return df[starts_early_enough & ends_late_enough]

In [5]:
# Ask the locations endpoint for the 'ST' category and show the HTTP status.
state_query = {'locationcategoryid':'ST','limit':100}
location_ids = requests.get(base+'locations',headers=header,params=state_query)
location_ids.status_code

200

In [6]:
# Turn the 'results' list of the locations response into a table.
state_records = location_ids.json()['results']
all_states = pd.DataFrame(state_records)
all_states.head()

Unnamed: 0,mindate,maxdate,name,datacoverage,id
0,1888-02-01,2020-01-12,Alabama,1,FIPS:01
1,1893-09-01,2020-01-12,Alaska,1,FIPS:02
2,1867-08-01,2020-01-12,Arizona,1,FIPS:04
3,1871-07-01,2020-01-12,Arkansas,1,FIPS:05
4,1850-10-01,2020-01-12,California,1,FIPS:06


In [7]:
# Map each state name in state_iso to the id of the first all_states row with that name.
state_ids = {
    name: all_states.loc[all_states['name']==name,'id'].values[0]
    for name in state_iso
}
state_ids

{'Illinois': 'FIPS:17',
 'Indiana': 'FIPS:18',
 'Michigan': 'FIPS:26',
 'Minnesota': 'FIPS:27',
 'New York': 'FIPS:36',
 'Ohio': 'FIPS:39',
 'Pennsylvania': 'FIPS:42',
 'Wisconsin': 'FIPS:55'}

In [8]:
# Download the station list of every state, tag each row with the state's
# abbreviation and stack the pieces into one table.
state_frames = []
for state,st_id in state_ids.items():
    data = gather('stations',{'locationid':st_id,'limit':'1000'})
    print(state + ': ' + str(len(data)))
    data['location_id_state'] = state_iso[state]
    state_frames.append(data)
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated table on every iteration.
state_data = pd.concat(state_frames,ignore_index=True)
print('Total: '+str(len(state_data)))

Illinois: 2387
Indiana: 2030
Michigan: 1833
Minnesota: 1792
New York: 2313
Ohio: 1570
Pennsylvania: 2031
Wisconsin: 1431
Total: 15387


In [9]:
# Index the stations by their id.
# Re-running this cell fails: after the first run 'id' is no longer a column.
state_data = state_data.set_index(keys='id')
state_data.head(5)

Unnamed: 0_level_0,elevation,mindate,maxdate,latitude,name,datacoverage,elevationUnit,longitude,location_id_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
COOP:110050,131.1,1990-10-01,2015-11-01,38.5411,"ALBERS 1 W, IL US",0.9967,METERS,-89.6289,IL
COOP:110055,161.5,1942-06-01,2006-04-01,38.3777,"ALBION, IL US",0.9518,METERS,-88.0569,IL
COOP:110072,222.5,1931-01-01,2015-11-01,41.1977,"ALEDO, IL US",0.9951,METERS,-90.7447,IL
COOP:110082,207.3,1948-07-01,2015-11-01,41.0579,"ALEXIS 1 SW, IL US",0.0964,METERS,-90.5654,IL
COOP:110137,132.6,1943-09-01,2015-11-01,38.86702,"ALTON MELVIN PRICE LOCK AND DAM, IL US",0.9389,METERS,-90.14886,IL


In [10]:
# Parse both date columns into datetimes.
for date_col in ('mindate','maxdate'):
    state_data[date_col] = pd.to_datetime(state_data[date_col])

In [11]:
# Keep only the stations that have an elevation value.
has_elevation = state_data['elevation'].notna()
state_nonull = state_data[has_elevation]

In [12]:
# Preview the first five rows.
state_nonull.iloc[:5]

Unnamed: 0_level_0,elevation,mindate,maxdate,latitude,name,datacoverage,elevationUnit,longitude,location_id_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
COOP:110050,131.1,1990-10-01,2015-11-01,38.5411,"ALBERS 1 W, IL US",0.9967,METERS,-89.6289,IL
COOP:110055,161.5,1942-06-01,2006-04-01,38.3777,"ALBION, IL US",0.9518,METERS,-88.0569,IL
COOP:110072,222.5,1931-01-01,2015-11-01,41.1977,"ALEDO, IL US",0.9951,METERS,-90.7447,IL
COOP:110082,207.3,1948-07-01,2015-11-01,41.0579,"ALEXIS 1 SW, IL US",0.0964,METERS,-90.5654,IL
COOP:110137,132.6,1943-09-01,2015-11-01,38.86702,"ALTON MELVIN PRICE LOCK AND DAM, IL US",0.9389,METERS,-90.14886,IL


In [13]:
# np.unique returns the sorted distinct ids together with the position of the
# first occurrence of each, so the result holds one row per id, ordered by id.
station_ids = state_nonull.index
unique_station_ids, state_nodups_id = np.unique(station_ids, return_index=True)
state_nodups = state_nonull.iloc[state_nodups_id]

In [14]:
# Number of stations left after removing duplicate ids.
state_nodups.shape[0]

15109

In [15]:
# Summary statistics of the three positional columns.
coord_columns = ['elevation','latitude','longitude']
coords_st = state_nodups[coord_columns]
coords_st.describe()

Unnamed: 0,elevation,latitude,longitude
count,15109.0,15109.0,15109.0
mean,264.811305,42.131709,-84.570233
std,114.776699,2.258785,6.098804
min,0.0,35.34472,-98.41306
25%,195.1,40.43333,-88.8533
50%,248.7,41.78333,-85.662348
75%,316.4,43.46667,-79.08334
max,1482.9,49.31833,-71.1375


In [16]:
# Load the lake statistics and keep the name, elevation and box-corner columns.
# NOTE(review): in the displayed table Erie's 'right-down' corner has the same
# negative number in both slots, unlike every other corner - check the CSV.
lake_stats_path = os.path.join('..','data_files','clean_data','lake_stats.csv')
lake_columns = ['Lakes','Elevationa (m)','left-up', 'lef-down', 'right-down', 'right-up']
lake_coords = pd.read_csv(lake_stats_path)[lake_columns]
lake_coords

Unnamed: 0,Lakes,Elevationa (m),left-up,lef-down,right-down,right-up
0,Superior,183,"(48.99057766, -92.24770595)","(46.43517183, -92.16469881)","(46.47459248, -84.32534402)","(48.93016539, -84.4071318)"
1,Michigan,176,"(46.22897845, -88.20521498)","(41.60750601, -88.19972181)","(41.64217639, -85.00819349)","(46.2305618, -85.00251263)"
2,Huron,176,"(46.40526359, -84.763874)","(43.07614816, -84.80171647)","(43.06979478, -79.87373863)","(46.09853892, -79.93355375)"
3,Erie,173,"(42.80865899, -83.43929641)","(41.34261326, -83.4881252)","(-78.92269619, -78.92269619)","(42.89423587, -78.83968905)"
4,Ontario,74,"(44.18597098, -79.75643777)","(43.20266513, -79.77230758)","(43.22835316, -76.18466177)","(44.19877154, -76.2444769)"


In [17]:
def lake_box(lake):
    """Return the corner latitudes, corner longitudes and elevation of a lake.

    The lake is looked up by name in the module-level ``lake_coords`` table.
    Every string cell of its row that starts with ``'('`` is parsed as a
    ``(lat, lng)`` pair; a numeric cell is taken as the elevation.

    Parameters
    ----------
    lake : str
        Value to match in the ``'Lakes'`` column.

    Returns
    -------
    tuple
        ``(lat_ls, lng_ls, ele)``: two lists of floats rounded to five
        decimals, and the elevation (``0`` if the row has no numeric cell).

    Raises
    ------
    IndexError
        If no row matches ``lake``.
    """
    matches = lake_coords[lake_coords['Lakes']==lake]
    # Take the first match by position so a non-default index cannot mislead us.
    row = matches.iloc[0]
    lat_ls = []
    lng_ls = []
    ele = 0
    for e in row:
        if isinstance(e,str):
            if e.startswith('('):
                lat,lng = e.strip('()').split(',')[:2]
                lat_ls.append(round(float(lat),5))
                lng_ls.append(round(float(lng),5))
        # Numbers read from a DataFrame row may be NumPy scalars, which are
        # not instances of the built-in int.
        elif isinstance(e,(int,float,np.integer,np.floating)):
            ele = e
    return lat_ls,lng_ls,ele

In [18]:
def check_coords(lake,station_id,coord_tol=.0001,ele_tol=3):
    """Tell whether a station lies inside a lake's bounding box and near its elevation.

    Parameters
    ----------
    lake : str
        Lake name passed to ``lake_box``.
    station_id
        Index label of the station in the module-level ``state_nodups`` table.
    coord_tol : float, optional
        Slack added to the upper latitude and longitude bounds.
    ele_tol : float, optional
        Largest allowed absolute difference between the station elevation and
        the lake elevation.

    Returns
    -------
    bool
        True when the station's longitude and latitude fall between the
        smallest and largest corner values and its elevation is within
        ``ele_tol`` of the lake's.
    """
    lat,lng,ele = lake_box(lake)
    s_lat,s_lng,s_ele = state_nodups.loc[station_id,['latitude','longitude','elevation']]
    # The upper bounds must be "<=": with ">=" on both ends a point had to lie
    # beyond the box's maximum, so nothing inside the box could ever match.
    in_lng = min(lng) <= s_lng <= max(lng)+coord_tol
    in_lat = min(lat) <= s_lat <= max(lat)+coord_tol
    near_surface = abs(s_ele-ele) <= ele_tol
    return bool(in_lng and in_lat and near_surface)

In [19]:
# Select the stations that check_coords accepts for Lake Superior.
# A boolean mask replaces the row-by-row DataFrame.append (removed in
# pandas 2.0) and keeps the original column order and dtypes.
superior_mask = [check_coords('Superior',station_id) for station_id in state_nodups.index]
state_inside = state_nodups.loc[superior_mask]
len(state_inside)

0

In [20]:
# For every lake, list the ids of the stations whose state code is in that lake's list.
lake_stations = {
    lake: state_nodups.index[state_nodups['location_id_state'].isin(states)].tolist()
    for lake,states in lake_st.items()
}

In [21]:
# Write one CSV of stations per lake.
# The ids in lake_stations come from state_nodups, so the rows are selected
# from that same frame; selecting them from state_nonull would bring back
# every duplicate row that shares one of those ids.
for key,val in lake_stations.items():
    state_nodups.loc[val].to_csv(os.path.join('..','data_files','prep_data',f'{key}_stations.csv'))

In [22]:
# Save the de-duplicated station table.
stations_csv = os.path.join('..','data_files','prep_data','stations.csv')
state_nodups.to_csv(stations_csv)

In [23]:
# Row count of the de-duplicated station table.
state_nodups.shape[0]

15109

In [24]:
# Stations whose record spans the date window checked by time_range.
state_convert = state_nodups.pipe(time_range)
state_convert.shape[0]

1043