In [1]:
import json
import requests
import pandas as pd
import datetime
import copy
# raise pandas' display limits so up to 500 rows and 500 columns are rendered
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# The initial analysis covers 1/1/2017 - 11/20/2019; forecasts from this API
# only began getting collected on 11/19/2019.
# In reality the time series analysis would need to rely on forecasts for
# future periods, but historical forecasts aren't available, so historical
# periods will have the unfair advantage of using true weather data.
# The free tier of the API service is capped at 1000 calls/day, so the API
# calls are broken into batches that each stay under that cap.
DAY_LENGTH = 24 * 60 * 60  # seconds in one day
FIRST_START_DAY = 1483272000  # Unix time for 2017-01-01 12:00:00 UTC
TOTAL_DAY_COUNT = 1054
FIRST_PASS_DAY_COUNT = 730
# the second batch begins FIRST_PASS_DAY_COUNT days after the first one
# and covers whatever remains of TOTAL_DAY_COUNT
second_pass_day_count = TOTAL_DAY_COUNT - FIRST_PASS_DAY_COUNT
second_start_day = FIRST_START_DAY + DAY_LENGTH * FIRST_PASS_DAY_COUNT
# 2015 and 2016 data: that span includes a leap year, hence the extra day
# when stepping back from FIRST_START_DAY
older_pass_day_count = FIRST_PASS_DAY_COUNT
older_start_day = FIRST_START_DAY - DAY_LENGTH * (FIRST_PASS_DAY_COUNT + 1)

In [3]:
# one timestamp per day: FIRST_START_DAY followed by FIRST_PASS_DAY_COUNT
# further days, each DAY_LENGTH seconds after the previous one
# NOTE(review): the final entry equals second_start_day, which is also the
# first entry of second_pass_times -- confirm that overlapping day is intended
first_pass_times = [
    FIRST_START_DAY + offset * DAY_LENGTH
    for offset in range(FIRST_PASS_DAY_COUNT + 1)
]

In [4]:
# one timestamp per day: second_start_day followed by second_pass_day_count
# further days, each DAY_LENGTH seconds after the previous one
second_pass_times = [
    second_start_day + offset * DAY_LENGTH
    for offset in range(second_pass_day_count + 1)
]

In [5]:
# one timestamp per day: older_start_day followed by older_pass_day_count
# further days, each DAY_LENGTH seconds after the previous one
older_pass_times = [
    older_start_day + offset * DAY_LENGTH
    for offset in range(older_pass_day_count + 1)
]

In [6]:
# the API key is read from a JSON file stored at a hidden location;
# only the value under the 'api_key' entry is kept, coerced to a string
with open("/Users/natha/.secret/dark_sky_api.json") as secret_file:
    api_key = str(json.loads(secret_file.read())['api_key'])

In [7]:
# string pieces that are joined together to form each API request URL
# NOTE(review): location looks like a "latitude,longitude" pair -- confirm
location = '38.8483,-77.0342'
url_base = 'https://api.darksky.net/forecast/'

In [8]:
def label_historicalType_and_precipType(api_json_data):
    """
    Tag every hourly record as historical and guarantee a 'precipType'.

    Each record under api_json_data['hourly']['data'] is modified in
    place: its 'type' is set to 'historical', and 'precipType' is filled
    with 'none' when that key is absent (which occurs when there was no
    precipitation at that time). An existing 'precipType' is left as is.

    Parameters
    ----------
    api_json_data : dict
        Parsed API response holding an 'hourly' mapping whose 'data'
        entry is a list of per-hour record dicts.

    Returns
    -------
    list
        The same list object stored at api_json_data['hourly']['data'],
        with its records updated.

    Raises
    ------
    KeyError
        If 'hourly' or 'data' is missing from the response.
    """
    data_records = api_json_data['hourly']['data']
    for record in data_records:
        record['type'] = 'historical'
        # setdefault writes only when the key is missing; this replaces a
        # bare try/except that could hide unrelated errors
        record.setdefault('precipType', 'none')
    return data_records

In [9]:
def api_dataframe_conversion(json_data, hourly_records, column_headers):
    """
    Build a time-indexed dataframe from one response's hourly records.

    The records' 'time' values are converted to datetimes (interpreted as
    seconds since the Unix epoch), and the response-level 'latitude',
    'longitude' and 'timezone' values are broadcast to every row.

    Parameters
    ----------
    json_data : dict
        Parsed API response; must provide 'latitude', 'longitude' and
        'timezone'.
    hourly_records : list of dict
        Hourly records, each containing a 'time' entry.
    column_headers : list of str
        Columns to keep, in output order; must include 'time', which
        becomes the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'time' with the remaining column_headers as
        columns. A requested column that no record supplies is present
        and filled with NaN.

    Raises
    ------
    KeyError
        If the records carry no 'time' field or json_data lacks one of
        the three location keys.
    """
    data_frame = pd.DataFrame(hourly_records).assign(
        time=lambda frame: pd.to_datetime(frame['time'], unit='s'),
        latitude=json_data['latitude'],
        longitude=json_data['longitude'],
        timezone=json_data['timezone'],
    )
    # reindex (rather than data_frame[column_headers]) so a field missing
    # from every record of this response becomes a NaN column instead of
    # raising KeyError
    return data_frame.reindex(columns=column_headers).set_index('time')

In [10]:
def historical_dataframe_from_api_calls(list_of_times, url_base, api_key, location):
    """
    Fetch hourly data for each requested time and combine it into one frame.

    One API request is made per entry of list_of_times. Each response is
    labelled with label_historicalType_and_precipType, converted with
    api_dataframe_conversion, and the per-request frames are concatenated
    in request order.

    Parameters
    ----------
    list_of_times : iterable
        Time values; each is converted with str() and placed in the
        request URL.
    url_base : str
        Leading part of the request URL, placed directly before the key.
    api_key : str
        API key inserted into the URL.
    location : str
        Location string inserted into the URL ahead of the time value.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'time'. When list_of_times is empty, an empty
        frame carrying the expected columns is returned.

    Raises
    ------
    requests.HTTPError
        If a request returns an error status code.
    """
    column_headers = ['time', 'latitude', 'longitude', 'timezone', 'type', 'summary', 'icon',
                      'precipIntensity', 'precipProbability', 'precipType', 'temperature',
                      'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed',
                      'windGust', 'windBearing', 'cloudCover', 'uvIndex', 'visibility']
    # collect the per-request frames and concatenate once at the end;
    # growing a frame inside the loop is quadratic, and DataFrame.append
    # no longer exists in pandas 2.0+
    daily_frames = []
    for timestamp in list_of_times:
        url = url_base+api_key+'/'+location+','+str(timestamp)+'?exclude=currently,minutely,daily,alerts,flags'
        # a timeout keeps one stalled request from hanging the whole batch
        response = requests.get(url, timeout=30)
        # surface HTTP failures (bad key, quota exceeded) here instead of
        # as a confusing KeyError on 'hourly' further down
        response.raise_for_status()
        data = response.json()
        hourly_data = label_historicalType_and_precipType(data)
        daily_frames.append(api_dataframe_conversion(data, hourly_data, column_headers))
    if not daily_frames:
        # nothing requested: return the empty, correctly shaped frame
        return pd.DataFrame(columns=column_headers).set_index('time')
    return pd.concat(daily_frames, sort=False)

In [11]:
# def label_forecastType_and_precipType(api_json_data):
#     """
#     Function loops through the hourly records in the input
#     json data to label the data as a forecast 'type',
#     and to populate the 'precipType' with 'none' if this
#     key-value pair is not present, which occurs when there
#     was no precipitation at that time.
#     """
#     data_records = api_json_data['hourly']['data']
#     for record in data_records:
#         record.update({'type': 'forecast'})
#         try:
#             record.update({'precipType': record['precipType']})
#         except:
#             record.update({'precipType': 'none'})
#     return data_records

In [12]:
# def forecast_dataframe_from_api_calls(list_of_times):
#     """
#     Function loops through the list of times provided and
#     returns a dataframe with hourly data from the date when
#     each time occurs.
#     """
#     # initializing the final dataframe
#     column_headers = ['time', 'latitude', 'longitude', 'timezone', 'type', 'summary', 'icon',
#                       'precipIntensity', 'precipProbability', 'precipType', 'temperature',
#                       'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed',
#                       'windGust', 'windBearing', 'cloudCover', 'uvIndex', 'visibility']
#     forecast_data_frame = pd.DataFrame(columns=column_headers)
#     forecast_data_frame.set_index('time', inplace=True)
#     # looping through the list of times
#     for time in list_of_times:
#         url = url_base+api_key+'/'+location+','+str(time)+'?exclude=currently,minutely,daily,alerts,flags'
#         response = requests.get(url)
#         data = response.json()
#         hourly_data = label_forecastType_and_precipType(data)
#         time_data_frame = api_dataframe_conversion(data, hourly_data, column_headers)
#         forecast_data_frame = forecast_data_frame.append(time_data_frame, sort=False)
#     return forecast_data_frame

In [13]:
# df1 = historical_dataframe_from_api_calls(first_pass_times, url_base, api_key, location)

In [14]:
# df1.to_csv('data/KDCA_weather_data_2017-2018.csv')

In [15]:
# df2 = historical_dataframe_from_api_calls(second_pass_times, url_base, api_key, location)

In [16]:
# df2.to_csv('data/KDCA_weather_data_2019-20191121.csv')

In [17]:
# fetch the hourly records for every timestamp in older_pass_times
df2 = historical_dataframe_from_api_calls(
    list_of_times=older_pass_times,
    url_base=url_base,
    api_key=api_key,
    location=location,
)

In [18]:
# persist the fetched frame so the API does not have to be queried again
df2.to_csv(path_or_buf='data/KDCA_weather_data_2015-2016.csv')