In [95]:
import pandas as pd
import csv
import numpy as np

Create the lambda function that cleans up floating point data.

`29.37s` => `29.37`

In [96]:
def is_float_element(x):
    return x.isnumeric or x == '.'

clean_float = lambda x: ''.join(filter(is_float_element, x))

In [97]:
import glob
data_path = 'LCD_Data/WI/data/*.csv'
all_data = pd.concat([pd.read_csv(file_path,
                            usecols=['STATION', 'DATE', 'HourlySkyConditions', 'HourlyStationPressure',
                                'HourlyVisibility', 'HourlyPresentWeatherType'],
                            parse_dates=['DATE'],
                            dtype={'HourlySkyConditions': str, 'HourlyPresentWeatherType': str},
                            converters={'HourlyStationPressure': clean_float, 'HourlyVisibility': clean_float})
                         for file_path in glob.iglob(data_path)])
all_data.head()

Unnamed: 0,STATION,DATE,HourlyPresentWeatherType,HourlySkyConditions,HourlyStationPressure,HourlyVisibility
0,72640594869,2010-01-01 00:15:00,,,29.37,10.0
1,72640594869,2010-01-01 00:35:00,,,29.37,10.0
2,72640594869,2010-01-01 00:55:00,,,29.38,10.0
3,72640594869,2010-01-01 01:15:00,,,29.38,10.0
4,72640594869,2010-01-01 01:36:00,,,29.38,10.0


Convert station ID to shorter form

In [98]:
all_data['STATION'] = all_data['STATION'].map(lambda x: int(str(x)[-5:]))
all_data['STATION'].head()

0    94869
1    94869
2    94869
3    94869
4    94869
Name: STATION, dtype: int64

Convert weather codes to a more less specific form that can be used by the forcast api later. Codes are in the format `AU | AW | MW`. They need to be catagroized into `freezing_rain_heavy, freezing_rain, freezing_rain_light, freezing_drizzle, ice_pellets_heavy, ice_pellets, ice_pellets_light, snow_heavy, snow, snow_light, flurries, tstorm, rain_heavy, rain, rain_light, drizzle, fog_light, fog, cloudy, mostly_cloudy, partly_cloudy, mostly_clear, clear`.

In [99]:
import json
with open('weather_lookup_converter.json') as json_file:
    lookup = json.load(json_file)

def weather_decoder(value):
    if pd.notnull(value):
        for weather_type in lookup:
            for code in lookup[weather_type]:
                if code in value:
                    return weather_type

all_data['weather_type'] = all_data.apply(lambda row: weather_decoder(row.HourlyPresentWeatherType), axis=1)

Breaking up hourly sky conditions column to a cloud string, cloud base height, and cloud cover using another lookup table

In [100]:
with open('cloud_lookup.json') as json_file:
    lookup = json.load(json_file)

def cloud_decoder(value):
    if pd.notnull(value):
        for cloud_type in lookup:
            if cloud_type in value:
                return lookup[cloud_type]
    return lookup["CLR"]
            
all_data['cloud_type'] = all_data.apply(lambda row: cloud_decoder(row.HourlySkyConditions)["cloud_str"], axis=1)
all_data['cloud_cover'] = all_data.apply(lambda row: cloud_decoder(row.HourlySkyConditions)["cloud_cover"], axis=1)

Combine station data (lat, long, elevation) with historic weather data.

In [112]:
station_meta_lookup = pd.read_csv('LCD_Data/WI/stations.csv',
                                usecols=["STATION_ID", "LATITUDE", "LONGITUDE", "ELEVATION_(M)"],
                               dtype={"LATITUDE": float, "LONGITUDE": float})
def get_matching(value):
    for i, row in enumerate(station_meta_lookup["STATION_ID"]):
        if int(row[5:]) == value:
            return i

all_data["LATITUDE"] = all_data.apply(lambda row: get_matching(row.STATION), axis=1)

In [113]:
all_data.head()

Unnamed: 0,STATION,DATE,HourlyPresentWeatherType,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,weather_type,cloud_type,cloud_cover,LATITUDE
0,94869,2010-01-01 00:15:00,,,29.37,10.0,,clear,0.0,32
1,94869,2010-01-01 00:35:00,,,29.37,10.0,,clear,0.0,32
2,94869,2010-01-01 00:55:00,,,29.38,10.0,,clear,0.0,32
3,94869,2010-01-01 01:15:00,,,29.38,10.0,,clear,0.0,32
4,94869,2010-01-01 01:36:00,,,29.38,10.0,,clear,0.0,32
