### Additional Weather Data Collection ###

A huge thanks to [Iowa Environmental Mesonet](https://mesonet.agron.iastate.edu/request/download.phtml?network=IL_ASOS#), especially Daryl Herzmann, for providing a solid archive and [script](https://github.com/akrherz/iem/blob/master/scripts/asos/iem_scraper_example.py) to obtain the data.

Approximate Runtime: 10 minutes

In [1]:
"""
Example script that scrapes data from the IEM ASOS download service
"""
from __future__ import print_function
import json
import time
import datetime
import pandas as pd
import numpy as np
# Python 2 and 3: alternative 4
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

# Maximum number of tries download_data() makes for a single URL before
# giving up and returning an empty string
MAX_ATTEMPTS = 6
# Base URL of the IEM ASOS request service; query parameters are appended to it.
# Plain HTTP is used on purpose: HTTPS can be problematic for installs that
# don't have the Let's Encrypt CA
SERVICE = "http://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"

In [2]:
def download_data(uri):
    """Fetch the data from the IEM
    The IEM download service has some protections in place to keep the number
    of inbound requests in check.  This function implements an exponential
    backoff to keep individual downloads from erroring: after every failed
    attempt (a raised exception or a response body starting with 'ERROR') it
    waits 5, 10, 20, ... seconds before trying again.
    Args:
      uri (string): URL to fetch
    Returns:
      string data, or an empty string once MAX_ATTEMPTS tries have failed
    """
    for attempt in range(MAX_ATTEMPTS):
        try:
            response = urlopen(uri, timeout=300)
            try:
                data = response.read().decode('utf-8')
            finally:
                # Explicit close works for both urllib2 (Python 2) and
                # urllib.request (Python 3) response objects.
                response.close()
            if not data.startswith('ERROR'):
                return data
            print("download_data(%s) returned an error response" % (uri, ))
        except Exception as exp:
            print("download_data(%s) failed with %s" % (uri, exp))
        # Back off before the next try; no point sleeping after the last one.
        if attempt < MAX_ATTEMPTS - 1:
            time.sleep(5 * 2 ** attempt)

    print("Exhausted attempts to download, returning empty data")
    return ""


def get_stations_from_filelist(filename):
    """Build a listing of stations from a simple file listing the stations.
    The file should simply have one station per line.  Surrounding whitespace
    is stripped and blank lines are skipped, so a trailing newline or empty
    line never yields an empty station identifier.
    Args:
      filename (string): path of the text file to read
    Returns:
      list of station identifier strings, in file order
    """
    # The context manager guarantees the file handle is closed.
    with open(filename) as handle:
        return [line.strip() for line in handle if line.strip()]

In [3]:
# timestamps in UTC to request data for
startts = datetime.datetime(2017, 1, 1)
endts = datetime.datetime(2019, 8, 27)

# Query string shared by every request: the fields to fetch plus output options
service = (SERVICE
           + "&data=tmpf&data=dwpf&data=relh&data=feel&data=drct&data=sped&data=alti&data=mslp&data=p01i&data=vsby&data=gust_mph&"
           + "tz=Etc%2FUTC&format=onlycomma&latlon=no&missing=null&trace=null&direct=no&report_type=2&")

service += startts.strftime('year1=%Y&month1=%m&day1=%d&')
service += endts.strftime('year2=%Y&month2=%m&day2=%d&')

# Two examples of how to specify a list of stations
#stations = get_stations_from_filelist("airport_list.txt")
stations = ['CAK', 'AKC', 'ALB', 'ABQ', 'ANC', 'ATW', 'ATL', 'ACY', 'AUS', 'BWI', 'BGR', 'BHM', 'BOI', 'BOS', 'BUF', 'CLT',
            'CHS', 'MDW', 'ORD', 'CVG', 'CLE', 'CMH', 'LCK', 'DFW', 'DAY', 'DEN', 'DSM', 'DTW', 'FAI', 'FLL', 'RSW', 'FAT',
            'BDL', 'GRB', 'GSO', 'ITO', 'HNL', 'IAH', 'HOU', 'HSV', 'IND', 'JAX', 'JNU', 'MCI', 'KTN', 'EYW', 'KOA', 'TYS',
            'LAL', 'LAN', 'LAS', 'LIT', 'LAX', 'SDF', 'MLB', 'MEM', 'MIA', 'MAF', 'MKE', 'MSP', 'MYR', 'BNA', 'MSY', 'JFK',
            'LGA', 'EWR', 'SWF', 'ORF', 'OAK', 'OKC', 'OMA', 'ONT', 'SNA', 'MCO', 'SFB', 'PSP', 'ECP', 'PNS', 'PHL', 'PHX',
            'AZA', 'PIT', 'PWM', 'PDX', 'PVD', 'RDU', 'RNO', 'RIC', 'RST', 'ROC', 'RFD', 'SMF', 'SLC', 'SAT', 'SBD', 'SAN',
            'SFO', 'SJC', 'SRQ', 'SAV', 'LKE', 'BFI', 'SEA', 'PAE', 'GEG', 'STL', 'PIE', 'SYR', 'TLH', 'TPA', 'DCA', 'IAD',
            'PBI', 'AVP', 'ILM']
# Drop any non-ASCII characters from the first entry
stations[0] = ''.join([c for c in stations[0] if ord(c) < 128])
stations.sort()

# Collect one frame per station and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
station_frames = []
for station in stations:
    uri = '%s&station=%s' % (service, station)
    print('Downloading: %s' % (station, ))
    raw = download_data(uri)

    # Uncomment the code below if you'd like to store the raw data within a file
    # with open('weather_full_unaggregated.csv', 'a') as out:
    #     out.write(raw)

    # First row is the CSV header; blank lines (e.g. the trailing newline)
    # are skipped so they do not turn into empty rows.
    rows = [line.split(',') for line in raw.split('\n') if line.strip()]
    if len(rows) > 1:
        station_frames.append(pd.DataFrame(rows[1:], columns=rows[0]))
    print('Finished: %s' % (station, ))

# pd.concat raises on an empty list, so fall back to an empty frame
weather_df = pd.concat(station_frames) if station_frames else pd.DataFrame()

Downloading: ABQ
Finished: ABQ
Downloading: ACY
Finished: ACY
Downloading: AKC
Finished: AKC
Downloading: ALB
Finished: ALB
Downloading: ANC
Finished: ANC
Downloading: ATL
Finished: ATL
Downloading: ATW
Finished: ATW
Downloading: AUS
Finished: AUS
Downloading: AVP
Finished: AVP
Downloading: AZA
Finished: AZA
Downloading: BDL
Finished: BDL
Downloading: BFI
Finished: BFI
Downloading: BGR
Finished: BGR
Downloading: BHM
Finished: BHM
Downloading: BNA
Finished: BNA
Downloading: BOI
Finished: BOI
Downloading: BOS
Finished: BOS
Downloading: BUF
Finished: BUF
Downloading: BWI
Finished: BWI
Downloading: CAK
Finished: CAK
Downloading: CHS
Finished: CHS
Downloading: CLE
Finished: CLE
Downloading: CLT
Finished: CLT
Downloading: CMH
Finished: CMH
Downloading: CVG
Finished: CVG
Downloading: DAY
Finished: DAY
Downloading: DCA
Finished: DCA
Downloading: DEN
Finished: DEN
Downloading: DFW
Finished: DFW
Downloading: DSM
Finished: DSM
Downloading: DTW
Finished: DTW
Downloading: ECP
Finished: ECP
Download

In [4]:
# If you saved the data within a csv, load it in using this line
# weather = pd.read_csv('weather_full_unaggregated.csv')

# Otherwise:
# Discard repeated header rows (where the station field holds the literal
# text 'station') and any row with a missing value. Chaining dropna() builds
# a new frame instead of mutating a filtered slice in place, which avoids
# pandas' SettingWithCopyWarning.
weather = weather_df[weather_df.station != 'station'].dropna(how='any')
# Preview only the first rows rather than dumping the whole frame
weather.head(10)

Unnamed: 0,station,valid,tmpf,dwpf,relh,feel,drct,sped,alti,mslp,p01i,vsby,gust_mph
0,ABQ,2017-01-01 00:52,42.10,37.90,84.93,35.63,180.00,11.50,29.87,1011.00,0.00,10.00,
1,ABQ,2017-01-01 01:52,39.90,37.90,92.48,33.84,160.00,9.20,29.89,1011.60,0.00,10.00,
2,ABQ,2017-01-01 02:52,39.00,36.00,88.87,35.59,120.00,4.60,29.89,1012.10,0.00,10.00,
3,ABQ,2017-01-01 03:52,37.00,35.10,92.76,34.40,170.00,3.45,29.89,1012.50,0.00,10.00,
4,ABQ,2017-01-01 04:52,37.00,35.10,92.76,30.27,190.00,9.20,29.89,1011.90,0.00,10.00,
5,ABQ,2017-01-01 05:52,37.90,35.10,89.53,32.64,160.00,6.90,29.90,1012.20,0.00,10.00,
6,ABQ,2017-01-01 06:52,37.00,35.10,92.76,33.26,180.00,4.60,29.89,1011.00,0.00,10.00,
7,ABQ,2017-01-01 07:52,37.00,35.10,92.76,32.33,130.00,5.75,29.88,1010.30,0.00,10.00,
8,ABQ,2017-01-01 08:52,37.90,35.10,89.53,33.40,170.00,5.75,29.88,1010.00,0.00,10.00,
9,ABQ,2017-01-01 09:52,36.00,35.10,96.49,30.35,130.00,6.90,29.88,1010.20,0.00,10.00,


In [5]:
# Map the terse IEM field codes onto descriptive column headings
readable_names = {
    'station': 'Airport',
    'valid': 'Date',
    'tmpf': 'Temperature',
    'dwpf': 'Dew Point Temp',
    'relh': 'Relative Humidity %',
    'feel': 'Apparent Temperature',
    'drct': 'Wind Direction (degrees from N)',
    'sped': 'Wind Speed',
    'alti': 'Pressure Altimeter',
    'mslp': 'Sea Level Pressure',
    'p01i': 'One Hour Precipitation',
    'vsby': 'Visibility',
    'gust_mph': 'Gust',
}
weather = weather.rename(columns=readable_names)
# Convert the observation timestamp column to datetimes so the .dt accessor works later
weather['Date'] = weather['Date'].astype('datetime64[ns]')
weather.head()

Unnamed: 0,Airport,Date,Temperature,Dew Point Temp,Relative Humidity %,Apparent Temperature,Wind Direction (degrees from N),Wind Speed,Pressure Altimeter,Sea Level Pressure,One Hour Precipitation,Visibility,Gust
0,ABQ,2017-01-01 00:52:00,42.1,37.9,84.93,35.63,180.0,11.5,29.87,1011.0,0.0,10.0,
1,ABQ,2017-01-01 01:52:00,39.9,37.9,92.48,33.84,160.0,9.2,29.89,1011.6,0.0,10.0,
2,ABQ,2017-01-01 02:52:00,39.0,36.0,88.87,35.59,120.0,4.6,29.89,1012.1,0.0,10.0,
3,ABQ,2017-01-01 03:52:00,37.0,35.1,92.76,34.4,170.0,3.45,29.89,1012.5,0.0,10.0,
4,ABQ,2017-01-01 04:52:00,37.0,35.1,92.76,30.27,190.0,9.2,29.89,1011.9,0.0,10.0,


In [6]:
# Measurement columns that will be averaged per day
numeric_fields = [
    'Temperature',
    'Dew Point Temp',
    'Relative Humidity %',
    'One Hour Precipitation',
    'Wind Speed',
    'Pressure Altimeter',
    'Sea Level Pressure',
]
# Values that cannot be parsed as numbers become NaN (errors='coerce')
weather_a = weather[numeric_fields].apply(pd.to_numeric, errors='coerce')
# Put the identifying columns back in front: Airport first, then Date
weather_a.insert(0, 'Date', weather['Date'])
weather_a.insert(0, 'Airport', weather['Airport'])

In [7]:
# Module-level list of airport codes that aggregation() iterates over
airports = stations
def aggregation(weather, airport_codes=None):
    """Average the numeric weather columns per airport and calendar day.

    Args:
      weather (DataFrame): observations with an 'Airport' column, a
        datetime 'Date' column and the numeric columns to average.
      airport_codes (iterable of string, optional): airports to aggregate,
        in the order they should appear in the result.  Defaults to the
        module-level ``airports`` list.
    Returns:
      DataFrame indexed by calendar date with 'Airport' as the first column
      followed by the daily means rounded to two decimals; an empty
      DataFrame when there are no airports to process.
    """
    if airport_codes is None:
        airport_codes = airports

    # Build one frame per airport and concatenate once, instead of growing
    # the result with a concat on every loop iteration.
    daily_frames = []
    for code in airport_codes:
        each_airport = weather[weather['Airport'] == code]
        # numeric_only=True keeps the string 'Airport' and datetime 'Date'
        # columns out of the mean; recent pandas raises on them otherwise.
        daily = (each_airport
                 .groupby(each_airport['Date'].dt.date)
                 .mean(numeric_only=True)
                 .round(2))
        daily.insert(0, 'Airport', code)
        daily_frames.append(daily)

    if not daily_frames:
        return pd.DataFrame()
    return pd.concat(daily_frames)

In [8]:
# After daily averaging, the hourly precipitation column holds a mean, so relabel it
precip_label = {'One Hour Precipitation': 'Average Precipitation'}
weather_aggregated = aggregation(weather_a).rename(columns=precip_label)
weather_aggregated.head()

Unnamed: 0_level_0,Airport,Temperature,Dew Point Temp,Relative Humidity %,Average Precipitation,Wind Speed,Pressure Altimeter,Sea Level Pressure
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01,ABQ,40.18,34.54,81.85,0.0,7.24,29.84,1009.46
2017-01-02,ABQ,37.2,33.07,86.5,0.0,4.6,29.92,1012.88
2017-01-03,ABQ,39.66,26.78,62.01,0.0,3.79,30.12,1019.37
2017-01-04,ABQ,42.87,26.41,54.85,0.0,6.81,30.06,1017.0
2017-01-05,ABQ,48.69,31.58,52.02,0.0,12.75,29.85,1007.69


In [9]:
# Save the daily per-airport averages to disk
output_csv = 'weather_full_aggregated.csv'
weather_aggregated.to_csv(output_csv)