In [5]:
import datetime
from os import rename
from os.path import splitext
from calendar import month_name
from httplib import HTTPConnection
from time import clock
from urllib import urlretrieve
import urllib2
from zipfile import ZipFile
import numpy as np
from bs4 import BeautifulSoup

from data_scraper import *

import os
import errno
import humanize
import sys
import json
import pandas as pd

In [6]:
def parseHistoryTable(table):
    """Extract daily weather values from a parsed history table.

    Every ``<tr>`` after the first one is inspected: the text of its first
    ``<td>`` selects which value (if any) is read from its second ``<td>``.
    Rows without any ``<td>`` and rows with an unknown label are ignored.

    Args:
        table: BeautifulSoup-like element offering ``find_all('tr')``; each
            row must offer ``find_all('td')``.

    Returns:
        dict with the keys ``temperature`` (degree F), ``events`` (free text,
        e.g. rain), ``humidity`` (%), ``precipitation`` (inch), ``snowfall``
        (inch), ``snowdepth`` (inch), ``sealevelpressure`` (inch),
        ``visibility`` (miles) and ``windspeed`` (miles per hour).

        A field whose row never shows up keeps its default (``0``, or ``''``
        for ``events``).  A field whose row is present but cannot be read or
        converted gets a per-field fallback: ``9999`` for temperature, ``'?'``
        for events, ``0.`` for snowfall and snow depth, ``-1`` for the rest.
    """
    def _span_text(cell):
        # Measurements are wrapped in <span class="wx-value">...</span>.
        return cell.find('span', attrs={"class": "wx-value"}).text

    def _cell_text(cell):
        return cell.text

    def _identity(value):
        return value

    # row label -> (result key, reader, converter, fallback, skip first row)
    # "skip first row": the first row carrying that label is ignored and only
    # later ones are read (presumably the first one is a section heading --
    # confirm against the page layout).
    fields = {
        'Mean Temperature': ('temperature', _span_text, int, 9999, False),
        'Average Humidity': ('humidity', _cell_text, int, -1, False),
        'Precipitation': ('precipitation', _span_text, float, -1., True),
        'Snow': ('snowfall', _span_text, float, 0., True),
        'Snow Depth': ('snowdepth', _span_text, float, 0., False),
        'Sea Level Pressure': ('sealevelpressure', _span_text, float, -1., True),
        'Visibility': ('visibility', _span_text, float, -1., False),
        'Wind Speed': ('windspeed', _span_text, float, -1., False),
        'Events': ('events', _cell_text, _identity, '?', False),
    }

    result = {
        'temperature': 0,
        'events': '',
        'humidity': 0,
        'precipitation': 0,
        'snowfall': 0,
        'snowdepth': 0,
        'sealevelpressure': 0,
        'visibility': 0,
        'windspeed': 0,
    }

    # Failures that mean "value missing or malformed": no second cell, no
    # wx-value span, or text that is not a number.  Anything else (notably
    # KeyboardInterrupt) is allowed to propagate.
    recoverable = (IndexError, AttributeError, TypeError, ValueError)

    # Labels flagged "skip first row" that have already been seen once.
    already_seen = set()

    # Find all the <tr> tag pairs, skip the first one, then for each.
    for row in table.find_all('tr')[1:]:
        col = row.find_all('td')
        if not col:
            # Row without data cells: nothing to read.
            continue

        spec = fields.get(col[0].text)
        if spec is None:
            continue
        name, read, convert, fallback, skip_first = spec

        if skip_first and name not in already_seen:
            already_seen.add(name)
            continue

        try:
            result[name] = convert(read(col[1]))
        except recoverable:
            result[name] = fallback

    return result

def getWeather(year, month, day, airportcode):
    """Download and parse one day's weather history page for an airport.

    Args:
        year, month, day: calendar date; substituted into the query string.
        airportcode: code substituted into the ``code=`` query parameter
            (e.g. ``'KOAK'``).

    Returns:
        The dictionary built by ``parseHistoryTable`` from the element with
        id ``historyTable`` of the downloaded page.

    Raises:
        Errors from ``urllib2.urlopen`` / ``read`` are not caught here, nor
        are errors raised by ``parseHistoryTable``.
    """
    url = 'http://www.wunderground.com/cgi-bin/findweather/getForecast?airportorwmo=query&historytype=DailyHistory&backurl=%2Fhistory%2Findex.html&code={airportcode}&month={month}&day={day}&year={year}'
    response = urllib2.urlopen(url.format(year=year, day=day, month=month, airportcode=airportcode))
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    soup = BeautifulSoup(html, "html.parser")

    # NOTE: `table` is None when the page has no such element; it is handed
    # to parseHistoryTable unchanged.
    table = soup.find("table", attrs={"id": "historyTable"})

    return parseHistoryTable(table)

In [7]:
# Sample request parameters.
# NOTE: year/month/day are defined here, but the call below passes literal
# values (2014, 1, 3) rather than these variables.
airportcode = 'KOAK'
year, month, day = 2015, 10, 3

getWeather(2014, 1, 3, airportcode)

{'events': u'\n\t\xa0\n',
 'humidity': 60,
 'precipitation': 0.0,
 'sealevelpressure': 30.02,
 'snowdepth': 0,
 'snowfall': 0,
 'temperature': 52,
 'visibility': 10.0,
 'windspeed': 3.0}

In [8]:
# Path of the JSON file that caches already-downloaded weather data.
weatherFile = os.path.join('..', 'cache', 'weather_data.json')

# Start with an empty cache; replace it with the stored one when the cache
# file is present, so earlier results are reused.
weatherDict = {}
if file_exists(weatherFile):
    with open(weatherFile) as infile:
        weatherDict = json.load(infile)

# Airport list: a CSV without a header row.
airports_csv = os.path.join('..', 'data', 'airports.csv')
dfairports = pd.read_csv(airports_csv, header=None)

In [None]:
from datetime import timedelta, date

# take 2014
year = 2014
start_date = date(year, 1, 1)
end_date = date(year, 12, 31)
delta = timedelta(days=1)

# Number of airports (rows); used only for the progress message.
# len() is used because DataFrame.count() returns one value per column.
num_airports = len(dfairports)

# iterate over all airports
for pos, (row_index, item) in enumerate(dfairports.iterrows(), 1):
    # the airport code is the first column of the row
    airport = item.values[0]

    # Parenthesised single-argument print: same output under the Python 2
    # print statement.
    print('processing %s (%d/%d)...' % (airport, pos, num_airports))

    # Continue from whatever is already cached for this airport.
    rows = weatherDict.get(airport, [])
    date_keys = set(el['date'] for el in rows)

    # iterate over one year to get the data from it
    d = start_date
    while d <= end_date:
        key = '%04d%02d%02d' % (d.year, d.month, d.day)

        # data already requested? --> skip!
        if key not in date_keys:
            print('GET %s' % key)
            try:
                rows.append({'data': getWeather(d.year, d.month, d.day, airport), 'date': key})
            except Exception:
                # Best effort: a failed day is reported and left out, so a
                # later run requests it again.  `Exception` (instead of a
                # bare except) lets a kernel interrupt stop the loop.
                print('error for %s' % key)
        d += delta
    weatherDict[airport] = rows

    # save current JSON after every airport, so progress survives an abort
    with open(weatherFile, 'wb') as outfile:
        json.dump(weatherDict, outfile)

# save JSON! (the only write that happens when the airport list is empty)
with open(weatherFile, 'wb') as outfile:
    json.dump(weatherDict, outfile)

processing JFK (1/40)...
GET 20140101
GET 20140102
GET 20140103
GET 20140104
GET 20140105
GET 20140106
GET 20140107
GET 20140108
GET 20140109
GET 20140110
GET 20140111
GET 20140112
GET 20140113
GET 20140114
GET 20140115
GET 20140116
GET 20140117
GET 20140118
GET 20140119
GET 20140120
GET 20140121
GET 20140122
GET 20140123
GET 20140124
GET 20140125
GET 20140126
GET 20140127
GET 20140128
GET 20140129
GET 20140130
GET 20140131
GET 20140201
GET 20140202
GET 20140203
GET 20140204
GET 20140205
GET 20140206
GET 20140207
GET 20140208
GET 20140209
GET 20140210
GET 20140211
GET 20140212
GET 20140213
GET 20140214
GET 20140215
GET 20140216
GET 20140217
GET 20140218
GET 20140219
GET 20140220
GET 20140221
GET 20140222
GET 20140223
GET 20140224
GET 20140225
GET 20140226
GET 20140227
GET 20140228
GET 20140301
GET 20140302
GET 20140303
GET 20140304
GET 20140305
GET 20140306
GET 20140307
GET 20140308
GET 20140309
GET 20140310
GET 20140311
GET 20140312
GET 20140313
GET 20140314
GET 20140315
GET 20140316
