In [1]:
import pandas as pd
import csv
import numpy as np
import os


In [2]:
from tqdm import tqdm
tqdm.pandas(desc="Progress:")

  from pandas import Panel


Create the lambda function that cleans up floating point data.

`29.37s` => `29.37`

In [3]:
import glob

data_path = 'LCD_Data/WI/data/*.csv'
clean_float = lambda x: ''.join(filter(str.isdigit or x == '.', x))

raw_data = pd.concat([pd.read_csv(file_path,
                            usecols=['STATION', 'DATE', 'HourlySkyConditions', 'HourlyStationPressure',
                                'HourlyVisibility', 'HourlyPresentWeatherType', 'HourlyPrecipitation'],
                            parse_dates=['DATE'],
                            dtype={'HourlySkyConditions': str, 'HourlyPresentWeatherType': str},
                            converters={'HourlyStationPressure': clean_float, 'HourlyVisibility': clean_float,
                                        'HourlyPrecipitation': clean_float})
                         for file_path in glob.iglob(data_path)])
raw_data.shape

(1265787, 7)

Sample data for faster development

In [4]:
w_data = raw_data
#w_data = w_data.sample(n=10000)
w_data.head()

Unnamed: 0,STATION,DATE,HourlyPrecipitation,HourlyPresentWeatherType,HourlySkyConditions,HourlyStationPressure,HourlyVisibility
0,72640594869,2010-01-01 00:15:00,,,,2937,1000
1,72640594869,2010-01-01 00:35:00,,,,2937,1000
2,72640594869,2010-01-01 00:55:00,,,,2938,1000
3,72640594869,2010-01-01 01:15:00,,,,2938,1000
4,72640594869,2010-01-01 01:36:00,,,,2938,1000


Convert station ID to shorter form

In [5]:
w_data['STATION'] = w_data['STATION'].map(lambda x: int(str(x)[-5:]))
w_data['STATION'].head()

0    94869
1    94869
2    94869
3    94869
4    94869
Name: STATION, dtype: int64

Convert weather codes to a more less specific form that can be used by the forcast api later. Codes are in the format `AU | AW | MW`. They need to be catagroized into `freezing_rain_heavy, freezing_rain, freezing_rain_light, freezing_drizzle, ice_pellets_heavy, ice_pellets, ice_pellets_light, snow_heavy, snow, snow_light, flurries, tstorm, rain_heavy, rain, rain_light, drizzle, fog_light, fog, cloudy, mostly_cloudy, partly_cloudy, mostly_clear, clear`.

In [6]:
import json
with open('weather_lookup_converter.json') as json_file:
    lookup = json.load(json_file)

def weather_decoder(value):
    if pd.notnull(value):
        for weather_type in lookup:
            for code in lookup[weather_type]:
                if code in value:
                    return weather_type

w_data['weather_type'] = w_data.progress_apply(lambda row: weather_decoder(row.HourlyPresentWeatherType), axis=1)

Progress:: 100%|██████████| 1265787/1265787 [00:31<00:00, 39686.25it/s]


Breaking up hourly sky conditions column to a cloud string, cloud base height, and cloud cover using another lookup table

In [7]:
with open('cloud_lookup.json') as json_file:
    lookup = json.load(json_file)

def cloud_decoder(value):
    if pd.notnull(value):
        for cloud_type in lookup:
            if cloud_type in value:
                return lookup[cloud_type]
    return lookup["CLR"]
            
w_data['cloud_type'] = w_data.progress_apply(lambda row: cloud_decoder(row.HourlySkyConditions)["cloud_str"], axis=1)
w_data['cloud_cover'] = w_data.progress_apply(lambda row: cloud_decoder(row.HourlySkyConditions)["cloud_cover"], axis=1)

Progress:: 100%|██████████| 1265787/1265787 [00:30<00:00, 42165.39it/s]
Progress:: 100%|██████████| 1265787/1265787 [00:29<00:00, 42306.91it/s]


Combine station data (lat, long, elevation) with historic weather data.

In [None]:
station_meta_lookup = pd.read_csv('LCD_Data/WI/stations.csv',
                                usecols=["STATION_ID", "STATION", "LATITUDE", "LONGITUDE", "ELEVATION_(M)"],
                               dtype={"LATITUDE": float, "LONGITUDE": float})

def get_matching_row(value):
    for i, row in enumerate(station_meta_lookup["STATION_ID"]):
        if int(row[5:]) == value:
            return i

w_data["latitude"] = w_data.progress_apply(lambda row:
                                      station_meta_lookup["LATITUDE"][get_matching_row(row.STATION)], axis=1)
w_data["longitude"] = w_data.progress_apply(lambda row:
                                      station_meta_lookup["LONGITUDE"][get_matching_row(row.STATION)], axis=1)
w_data["elevation"] = w_data.progress_apply(lambda row:
                                      station_meta_lookup["ELEVATION_(M)"][get_matching_row(row.STATION)], axis=1)
w_data.head()

Progress::  59%|█████▊    | 740834/1265787 [00:50<00:32, 16139.21it/s]

Use altair to visulize where the statations are. Each stattion has an average of about 10 mi for range. We will need to use spatial interpolation to fill the data.

In [None]:
import altair as alt

state = alt.topo_feature('LCD_Data/WI/WI.geo.json', 'collection')

# US states background
background = alt.Chart(state).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa')

# airport positions on background
points = alt.Chart(station_meta_lookup).mark_circle(
    size=50,
    color='red'
).encode(
    longitude='LONGITUDE:Q',
    latitude='LATITUDE:Q',
    tooltip=['STATION:N'])

background + points

Load the NREL api to get solar data for each point.

In [None]:
def create_url(lat, lon, year, api_key, attributes, your_name, reason_for_use, your_affiliation, your_email):
    return 'http://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT({lon}%20{lat})&names={year}&leap_day=false&interval=60&utc=false&full_name={name}&email={email}&affiliation={affiliation}&mailing_list=false&reason={reason}&api_key={api}&attributes={attributes}'\
    .format(year=year, lat=lat, lon=lon,
        name=your_name, email=your_email,
        affiliation=your_affiliation,
        reason=reason_for_use, api=api_key,
        attributes=attributes)

import time  
data_year = '2010'
for step, point in tqdm(enumerate(station_meta_lookup.iterrows())):    
    url = create_url(lat=str(point[1][2]),
                     lon=str(point[1][3]),
                     api_key=os.getenv('MY_VAR'),
                     attributes="air_temperature,dew_point,dhi,dni,ghi,relative_humidity,wind_direction,wind_speed",
                     year=data_year,
                     your_name='Gage+Krumbach',
                     reason_for_use='demo',
                     your_affiliation='my+institution',
                     your_email='gkrumbac@redhat.com')
    output = pd.read_csv(url, skiprows=2)
    output["STATION"] = int(point[1][0][5:])
    output["DATE"] =pd.date_range('1/1/{yr}'.format(yr=data_year),
                                                    freq='60'+'Min',
                                                    periods=525600/60)
    
    if step==0:
        solar_data = output
    else:
        solar_data = solar_data.append(output)
    
    time.sleep(3)

In [None]:
df_merge_asof = pd.merge_asof(solar_data.sort_values(by=['DATE']), w_data.sort_values(by=['DATE']),
              on='DATE',
              by='STATION',
             direction='nearest')

In [None]:
is_station = df_merge_asof['STATION']==94929
df_merge_asof[is_station].loc[(df_merge_asof[is_station]['DATE'] > '01-01-2010') & (df_merge_asof[is_station]['DATE'] < '01-2-2010')]

In [None]:
df_merge_asof = df_merge_asof[['STATION', 'DATE', 'latitude', 'longitude', 'elevation',
                               'Temperature', 'Dew Point', 'Relative Humidity', 'HourlyStationPressure',
                               'Wind Direction', 'Wind Speed', 'HourlyVisibility', 'weather_type',
                               'cloud_type', 'cloud_cover', 'DHI', 'DNI', 'GHI'
                              ]]
df_merge_asof.head()