In [1]:
from collections import namedtuple
from datetime import datetime, timedelta
import os
import pickle
import time

import matplotlib.pyplot as plt
import pandas as pd
from pyprind import ProgBar
import requests

In [2]:
# Wunderground API key. Supply it through the WUNDERGROUND_API_KEY environment
# variable so the credential does not have to live in the notebook; the literal
# is only a fallback that keeps existing runs working.
# NOTE(review): this key is stored in plain text - rotate it and drop the fallback.
API_KEY = os.environ.get('WUNDERGROUND_API_KEY', '9b47f685a716d06d')

# URL template: the first placeholder takes the API key, the second the date
# formatted as YYYYMMDD.
BASE_URL = 'http://api.wunderground.com/api/{}/history_{}/q/TX/Round_Rock.json'

In [3]:
# Names of the values kept for each day, in namedtuple field order.
features = [
    "date",
    "meantempm", "meandewptm", "meanpressurem",
    "maxhumidity", "minhumidity",
    "maxtempm", "mintempm",
    "maxdewptm", "mindewptm",
    "maxpressurem", "minpressurem",
    "precipm",
]

# Record type for one day's summary: one field per entry in `features`.
DailySummary = namedtuple('DailySummary', features)

In [4]:
# To do - add print function calls to this function to show progress
def extract_weather_data(url, api_key, target_date, days):
    """Download one daily weather summary per day for a run of consecutive days.

    Args:
        url: URL template with two ``{}`` placeholders, filled in order with
            the API key and the date formatted as ``YYYYMMDD``.
        api_key: API key inserted into ``url``.
        target_date: First day to request (a ``datetime``); every iteration
            advances it by one day.
        days: Number of consecutive days to request.

    Returns:
        A list of ``DailySummary`` records, one for each day whose request
        came back with HTTP status 200. Days with any other status are
        skipped, so the list can hold fewer than ``days`` entries.
    """
    records = []
    bar = ProgBar(days)
    # Every DailySummary field except 'date' is read straight from the response.
    measured_fields = [name for name in DailySummary._fields if name != 'date']
    for _ in range(days):
        # Build the request from the arguments, not from the notebook-level
        # BASE_URL / API_KEY, so callers can target another URL or key.
        request = url.format(api_key, target_date.strftime('%Y%m%d'))
        response = requests.get(request)
        if response.status_code == 200:
            data = response.json()['history']['dailysummary'][0]
            records.append(DailySummary(
                date=target_date,
                **{name: data[name] for name in measured_fields}))
        # Pause after every request, successful or not.
        # NOTE(review): presumably there to respect the API rate limit - confirm.
        time.sleep(6)
        bar.update()
        target_date += timedelta(days=1)
    return records

In [5]:
# Do not run this cell when collecting data on day 2
def get_target_date():
    """Return the date and time exactly 1000 days before the moment of the call."""
    lookback = timedelta(days=1000)
    return datetime.now() - lookback

# Starting day handed to extract_weather_data for the first batch.
target_date = get_target_date()

In [6]:
# Collect the first 100 days starting at target_date (slow: each request is followed by a 6 s pause)
records = extract_weather_data(BASE_URL, API_KEY, target_date, 100)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:12:39


In [None]:
# Look at the first five records (slicing avoids dumping the whole list)
records[:5]

In [8]:
# Number of daily summaries currently held in records
len(records)

1000

In [None]:
# save records list
# with open('records_pt1.pkl', 'wb') as f:
#     pickle.dump(records, f)

In [None]:
# Load the records list saved in records_pt1.pkl - still need to run cells 1-4 first.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('records_pt1.pkl', 'rb') as fp:
    records = pickle.load(fp)

In [None]:
# Inspect the most recent record collected so far; the next batch should start
# one day after its date
records[-1]

In [None]:
# Set the new target date: the date of the last record (shown above) plus one day.
# NOTE(review): hard-coded value - it has to be edited by hand for each new batch.
target_date = datetime(2018, 2, 9)

In [None]:
# Fetch the next 101 days and append them to records.
# Run this 5 separate times after setting the target date.
# NOTE(review): this cell does not advance target_date itself, so re-running it
# without updating target_date would presumably request the same dates again - confirm.
records += extract_weather_data(BASE_URL, API_KEY, target_date, 101)

In [None]:
# Save the records list to records_pt2.pkl so it can be reloaded without calling the API again
with open('records_pt2.pkl', 'wb') as f:
    pickle.dump(records, f)

In [7]:
# Load the records list saved in records_pt2.pkl - still need to run cells 1 and 3 first.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('records_pt2.pkl', 'rb') as fp:
    records = pickle.load(fp)

In [9]:
# Build a DataFrame with one row per record and use the 'date' field as the index
df = pd.DataFrame(records, columns=features).set_index('date')

In [10]:
# Take a small sample to experiment on. .copy() makes tmp independent of df,
# so adding a column to tmp later does not raise SettingWithCopyWarning.
tmp = df[['meantempm', 'meandewptm']].head(10).copy()
tmp

Unnamed: 0_level_0,meantempm,meandewptm
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-08-24,30,21
2015-08-25,30,21
2015-08-26,28,20
2015-08-27,28,18
2015-08-28,28,17
2015-08-29,27,18
2015-08-30,28,19
2015-08-31,28,19
2015-09-01,27,21
2015-09-02,27,19


In [11]:
# 1 day prior
N = 1

# target measurement of mean temperature
feature = 'meantempm'

# total number of rows
rows = tmp.shape[0]

# a list representing Nth prior measurements of feature
# notice that the front of the list needs to be padded with N
# None values to maintain the consistent rows length for each N
# (.iloc makes the lookup explicitly positional: i-N is a row offset,
# not a label of the date index)
nth_prior_measurements = [None]*N + [tmp[feature].iloc[i-N] for i in range(N, rows)]

# make a new column name of feature_N and add to DataFrame
col_name = f'{feature}_{N}'
tmp[col_name] = nth_prior_measurements
tmp

Unnamed: 0_level_0,meantempm,meandewptm,meantempm_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-08-24,30,21,
2015-08-25,30,21,30.0
2015-08-26,28,20,30.0
2015-08-27,28,18,28.0
2015-08-28,28,17,28.0
2015-08-29,27,18,28.0
2015-08-30,28,19,27.0
2015-08-31,28,19,28.0
2015-09-01,27,21,28.0
2015-09-02,27,19,27.0


In [None]:
def derive_nth_day_feature(df, feature, N):
    """Add a column holding the value ``feature`` had ``N`` rows earlier.

    The column is named ``f'{feature}_{N}'`` and is added to ``df`` in place;
    nothing is returned.

    Args:
        df: DataFrame to extend; it must contain the column ``feature``.
        feature: Name of the source column.
        N: Number of rows to look back.

    The first ``N`` rows have no earlier value and are left missing (NaN).
    If ``N`` is at least the number of rows, the whole column is missing.
    """
    # shift() moves values down by N positions whatever the index labels are.
    # This avoids integer lookups on a date-indexed Series and a per-row
    # Python loop. The shifted Series keeps df's index, so the assignment
    # lines up row for row.
    df[f'{feature}_{N}'] = df[feature].shift(N)

In [None]:
# Add 1-, 2- and 3-row lagged copies of every entry in features except 'date'
# (which was made the index when df was built and is therefore not a column).
for feature in (name for name in features if name != 'date'):
    for N in (1, 2, 3):
        derive_nth_day_feature(df, feature, N)

In [None]:
# Show the column labels after adding the derived columns
df.columns

In [None]:
# Original (un-lagged) feature names to drop: everything in features except
# the three temperature columns.
to_remove = [name for name in features
             if name not in ['meantempm', 'mintempm', 'maxtempm']]

# Columns that survive: every current column not marked for removal.
to_keep = list(filter(lambda col: col not in to_remove, df.columns))

# Narrow df to the surviving columns and show what is left.
df = df[to_keep]
df.columns

In [None]:
# Print a summary of df's columns (dtypes and non-null counts)
df.info()

In [None]:
# Convert every column to a numeric dtype; errors='coerce' turns values that
# cannot be parsed into NaN instead of raising
df = df.apply(pd.to_numeric, errors='coerce')
df.info()

In [None]:
# Call describe on df and transpose it due to the large number of columns
spread = df.describe().T

# precalculate interquartile range for ease of use in next calculation
IQR = spread['75%'] - spread['25%']

# create an outliers column that is True when the minimum lies more than
# 3 IQRs below the first quartile or the maximum lies more than 3 IQRs
# above the third quartile
spread['outliers'] = (spread['min']<(spread['25%']-(3*IQR)))|(spread['max'] > (spread['75%']+3*IQR))

# just display the features containing extreme outliers
# (.loc with a boolean mask replaces .ix, which was removed from pandas)
spread.loc[spread.outliers]

In [None]:
%matplotlib inline
# Default figure size (width 14, height 8) for this and later plots
plt.rcParams['figure.figsize'] = [14, 8]

# Histogram of the 1-row-lagged maximum humidity
df['maxhumidity_1'].hist()
plt.title('Distribution of maxhumidity_1')
plt.xlabel('maxhumidity_1')
plt.show()

In [None]:
# Histogram of the 1-row-lagged minimum pressure
df['minpressurem_1'].hist()
plt.title('Distribution of minpressurem_1')
plt.xlabel('minpressurem_1')
plt.show()

In [None]:
# Replace missing values in the lagged precipitation columns with 0
for precip_col in ['precipm_1', 'precipm_2', 'precipm_3']:
    # boolean mask marking the rows where this column is missing
    missing_vals = pd.isnull(df[precip_col])
    # Write through a single .loc call: chained indexing (df[col][mask] = ...)
    # can assign to a temporary copy and leave df unchanged.
    df.loc[missing_vals, precip_col] = 0

In [None]:
# Drop every row that still contains at least one missing value
df = df.dropna()

In [None]:
# Save the cleaned DataFrame to end-part1_df.pkl (pickle is imported in the first cell)
with open('end-part1_df.pkl', 'wb') as f:
    pickle.dump(df, f)