In [1]:
import os
import pickle
import time
from collections import namedtuple
from datetime import datetime, timedelta

import pandas as pd
import requests

import matplotlib.pyplot as plt
from pyprind import ProgBar

%matplotlib inline

In [2]:
# Wunderground API key. Prefer supplying it through the WUNDERGROUND_API_KEY
# environment variable so the credential does not have to live in the notebook;
# the literal is only a fallback kept so existing runs behave as before.
# NOTE(review): a key committed inside a notebook should be treated as leaked.
API_KEY = os.environ.get('WUNDERGROUND_API_KEY', '9b47f685a716d06d')

# URL template: the first placeholder takes the API key, the second takes the
# requested date formatted as YYYYMMDD.
BASE_URL = 'http://api.wunderground.com/api/{}/history_{}/q/TX/Round_Rock.json'

In [3]:
# Field names of one daily weather record. Every name except "date" is also
# the key under which the value is read from the API's daily summary payload.
features = [
    "date",
    "meantempm",
    "meandewptm",
    "meanpressurem",
    "maxhumidity",
    "minhumidity",
    "maxtempm",
    "mintempm",
    "maxdewptm",
    "mindewptm",
    "maxpressurem",
    "minpressurem",
    "precipm",
]

# Lightweight immutable record type with one attribute per feature.
DailySummary = namedtuple(typename='DailySummary', field_names=features)

In [4]:
def extract_weather_data(url, api_key, target_date, days):
    """Fetch consecutive daily weather summaries from the Wunderground API.

    Starting at ``target_date``, one history request is issued per day for
    ``days`` consecutive days, pausing between requests.

    Args:
        url: URL template with two ``{}`` placeholders, filled in order with
            the API key and the date formatted as ``YYYYMMDD``.
        api_key: API key substituted into ``url``.
        target_date: ``datetime`` of the first day to fetch.
        days: Number of consecutive days to request.

    Returns:
        A list of ``DailySummary`` records in date order. A day is skipped
        when its request does not return HTTP 200 or when the response has
        no usable daily summary, so the list may hold fewer than ``days``
        entries.
    """
    # Every DailySummary field except "date" is read from the payload under
    # the same key, so derive the key list from the record type itself.
    measurement_fields = [name for name in DailySummary._fields if name != 'date']

    records = []
    bar = ProgBar(days)
    for _ in range(days):
        # Build the request from the arguments (not the module-level
        # BASE_URL / API_KEY) so callers can target another endpoint or key.
        request = url.format(api_key, target_date.strftime('%Y%m%d'))
        response = requests.get(request)
        if response.status_code == 200:
            try:
                data = response.json()['history']['dailysummary'][0]
                measurements = {name: data[name] for name in measurement_fields}
            except (KeyError, IndexError):
                # A 200 response without a complete daily summary is treated
                # like a failed request: skip the day instead of aborting and
                # losing every record gathered so far in this run.
                pass
            else:
                records.append(DailySummary(date=target_date, **measurements))
        # Fixed pause between requests -- presumably to stay under the API's
        # rate limit; confirm against the account's plan.
        time.sleep(6)
        bar.update()
        target_date += timedelta(days=1)
    return records

In [5]:
# Do not run this cell when collecting data on day 2
def get_target_date():
    """Compute the collection start date: the current moment minus 1000 days."""
    return datetime.now() - timedelta(days=1000)

target_date = get_target_date()

In [6]:
records = extract_weather_data(BASE_URL, API_KEY, target_date, 100)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:10:50


In [27]:
# Look at five records starting at index 400
records[400:405]

[DailySummary(date=datetime.datetime(2016, 10, 4, 0, 0), meantempm='33', meandewptm='27', meanpressurem='1010', maxhumidity='100', minhumidity='54', maxtempm='43', mintempm='23', maxdewptm='32', mindewptm='22', maxpressurem='1011', minpressurem='1008', precipm='0.00'),
 DailySummary(date=datetime.datetime(2016, 10, 5, 0, 0), meantempm='41', meandewptm='32', meanpressurem='1012', maxhumidity='100', minhumidity='62', maxtempm='50', mintempm='33', maxdewptm='32', mindewptm='32', maxpressurem='1014', minpressurem='1011', precipm='0.00'),
 DailySummary(date=datetime.datetime(2016, 10, 6, 0, 0), meantempm='38', meandewptm='30', meanpressurem='1013', maxhumidity='100', minhumidity='57', maxtempm='48', mintempm='28', maxdewptm='32', mindewptm='28', maxpressurem='1015', minpressurem='1011', precipm='0.00'),
 DailySummary(date=datetime.datetime(2016, 10, 7, 0, 0), meantempm='29', meandewptm='24', meanpressurem='1017', maxhumidity='100', minhumidity='55', maxtempm='42', mintempm='17', maxdewptm='

In [28]:
len(records)

500

In [24]:
# Inspect last record to date; next target date should be plus one day
records[-1]

DailySummary(date=datetime.datetime(2016, 10, 3, 0, 0), meantempm='24', meandewptm='18', meanpressurem='1013', maxhumidity='94', minhumidity='41', maxtempm='33', mintempm='17', maxdewptm='22', mindewptm='15', maxpressurem='1016', minpressurem='1011', precipm='0.00')

In [25]:
# set new target date based on date above plus one day
target_date = datetime(2016, 10, 4)

In [26]:
# run this 4 separate times after setting the target date
records += extract_weather_data(BASE_URL, API_KEY, target_date, 100)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:11:14


In [29]:
# save records list
with open('records_pt1.pkl', 'wb') as f:
    pickle.dump(records, f)

In [None]:
# load records list - still need to run cells 1-4
with open('records_pt1.pkl', 'rb') as fp:
    records = pickle.load(fp)

In [None]:
# Inspect last record to date; next target date should be plus one day
records[-1]

In [None]:
# set new target date based on date above plus one day
target_date = datetime(2018, 2, 9)

In [None]:
# run this 5 separate times after setting the target date
records += extract_weather_data(BASE_URL, API_KEY, target_date, 101)

In [None]:
with open('records_pt2.pkl', 'wb') as f:
    pickle.dump(records, f)

In [None]:
# load records list - still need to run cells 1 and 3
with open('records_pt2.pkl', 'rb') as fp:
    records = pickle.load(fp)

In [None]:
df = pd.DataFrame(records, columns=features).set_index('date')

In [None]:
# Take an independent copy of a small two-column sample. A later cell adds a
# column to `tmp`; without .copy() that assignment targets a slice of `df`
# and pandas emits SettingWithCopyWarning.
tmp = df[['meantempm', 'meandewptm']].head(10).copy()
tmp

In [None]:
# 1 day prior
N = 1

# target measurement of mean temperature
feature = 'meantempm'

# total number of rows
rows = tmp.shape[0]

# a list representing Nth prior measurements of feature
nth_prior_measurements = tmp[feature].shift(periods=N)

# make a new column name of feature_N and add to DataFrame
col_name = f'{feature}_{N}'
tmp[col_name] = nth_prior_measurements
tmp

In [None]:
def derive_nth_day_feature(df, feature, N):
    """Add a lagged copy of ``feature`` to ``df`` in place.

    The new column is named ``f'{feature}_{N}'`` and holds the values of
    ``df[feature]`` shifted by ``N`` rows, so for a positive ``N`` each row
    carries the value from ``N`` rows earlier and the first ``N`` rows are
    missing.

    Args:
        df: DataFrame containing the column ``feature``; it is modified.
        feature: Name of the column to lag.
        N: Number of rows to shift by.

    Returns:
        None. The result is written into ``df``.
    """
    df[f'{feature}_{N}'] = df[feature].shift(periods=N)

In [None]:
# Add 1-, 2- and 3-row lagged columns for every feature except 'date'
for feature in features:
    if feature == 'date':
        continue
    for N in (1, 2, 3):
        derive_nth_day_feature(df, feature, N)

In [None]:
df.columns

In [None]:
# Original features to drop: everything except the three temperature columns
to_remove = [name for name in features
             if name not in ['meantempm', 'mintempm', 'maxtempm']]

# Every remaining column (the temperature columns plus all derived ones) stays
to_keep = [col for col in df.columns if col not in to_remove]

# Restrict df to the retained columns and show what is left
df = df[to_keep]
df.columns

In [None]:
df.info()

In [None]:
df = df.apply(pd.to_numeric, errors='coerce')
df.info()

In [None]:
# Summary statistics, transposed so each feature is a row (there are many columns)
spread = df.describe().T

# Interquartile range per feature, reused in both outlier bounds below
IQR = spread['75%'] - spread['25%']

# Flag a feature when its minimum lies more than 3 IQRs below the first
# quartile or its maximum lies more than 3 IQRs above the third quartile
spread['outliers'] = (
    (spread['min'] < (spread['25%'] - (3 * IQR)))
    | (spread['max'] > (spread['75%'] + 3 * IQR))
)

# Show only the features flagged as containing extreme outliers
spread.loc[spread.outliers, ]

In [None]:
fig, ax = plt.subplots(figsize = (14, 8))
ax.hist(df.maxhumidity_1)
ax.set_title('Distribution of maxhumidity_1')
ax.set_xlabel('maxhumidity_1')
ax.grid()

In [None]:
fig, ax = plt.subplots(figsize = (14, 8))
ax.hist(df.minpressurem_1)
ax.set_title('Distribution of minpressurem_1')
ax.set_xlabel('minpressurem_1')
ax.grid()

In [None]:
# iterate over the precip columns
for precip_col in ['precipm_1', 'precipm_2', 'precipm_3']:
    # boolean mask marking the rows where this column is NaN
    missing_vals = pd.isnull(df[precip_col])
    # Set the missing values to 0 with a single .loc assignment. Chained
    # indexing (df[col][mask] = 0) raises SettingWithCopyWarning and, under
    # pandas copy-on-write, writes to a temporary object and leaves df unchanged.
    df.loc[missing_vals, precip_col] = 0

In [None]:
df = df.dropna()

In [None]:
# import pickle
with open('end-part1_df.pkl', 'wb') as f:
    pickle.dump(df, f)