In [None]:
!pip install PyGithub pandas

In [None]:
import numpy as np 
import pandas as pd 
import re 
import sys

In [None]:
# Cap how many columns and rows pandas renders when a frame is displayed.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 50)

In [None]:
# Read the raw weather export and parse the DATE column into timestamps,
# then preview the first rows.
raw_data = pd.read_csv(
    'data/noaa-weather-data-jfk-airport/jfk_weather.csv',
    parse_dates=['DATE'],
)
raw_data.head()

In [None]:
# Show the dtype pandas inferred for every column of the raw frame.
raw_data.dtypes

In [None]:
# Columns kept for the hourly dataset: the timestamp plus the hourly measurements.
column_subset = [
    'DATE',
    'HOURLYVISIBILITY',
    'HOURLYDRYBULBTEMPF',
    'HOURLYWETBULBTEMPF',
    'HOURLYDewPointTempF',
    'HOURLYRelativeHumidity',
    'HOURLYWindSpeed',
    'HOURLYWindDirection',
    'HOURLYStationPressure',
    'HOURLYPressureTendency',
    'HOURLYSeaLevelPressure',
    'HOURLYPrecip',
    'HOURLYAltimeterSetting'
]

# Select the subset, index it by timestamp, remove the now-redundant DATE
# column and turn the '*' placeholder into a real missing value.
hourly_data = (
    raw_data[column_subset]
    .set_index(pd.DatetimeIndex(raw_data['DATE']))
    .drop(columns=['DATE'])
    .replace(to_replace='*', value=np.nan)
)



In [None]:
# Show the column dtypes of the hourly subset before the numeric conversion below.
hourly_data.dtypes


In [None]:
# List the distinct raw values of the precipitation column; useful for
# spotting entries that are not plain numbers.
hourly_data['HOURLYPrecip'].unique()

In [None]:
# Normalise the two special precipitation entries:
#   'T'          -> '0.00'  (presumably the trace-amount marker; confirm against the data docs)
#   '0.020.01s'  -> NaN     (malformed value that cannot be parsed as a number)
# The result is assigned back to the column explicitly. Calling
# replace(..., inplace=True) on hourly_data['HOURLYPrecip'] acts on a column
# selection, which raises a warning on recent pandas and has no effect on the
# frame when Copy-on-Write is enabled.
hourly_data['HOURLYPrecip'] = (
    hourly_data['HOURLYPrecip']
    .replace(to_replace='T', value='0.00')
    .replace('0.020.01s', np.nan)
)


In [None]:
def _keep_numeric_chars(value):
    """Strip every character except digits, ',', '.' and '-' from strings.

    Non-string values (e.g. floats or NaN) are returned unchanged.
    """
    if type(value) is str:
        return re.sub('[^0-9,.-]', '', value)
    return value


# Every column except DATE may contain stray flag characters next to the number.
messy_columns = column_subset[1:]
for column in messy_columns:
    stripped = hourly_data[column].apply(_keep_numeric_chars)
    # Strings that held no numeric characters at all become missing values.
    stripped = stripped.replace('', np.nan)
    hourly_data[column] = stripped.astype('float32')

In [None]:
# Summary statistics of the cleaned numeric columns.
hourly_data.describe()

In [None]:
# Print index range, non-null counts and dtypes of the cleaned frame.
hourly_data.info()

In [None]:
# Show the dtypes after cleaning (the loop above casts each column to float32).
hourly_data.dtypes

In [None]:
# Preview the raw rows for comparison with the cleaned frame.
raw_data.head()

In [None]:
# Preview the first rows of the cleaned hourly frame.
hourly_data.head()

In [None]:
# Accepted pressure-tendency values: the integer codes 0..8, or missing.
accepted_codes = list(range(0, 9)) + [np.nan]
has_bad_code = ~hourly_data['HOURLYPressureTendency'].isin(accepted_codes)

# Number of rows whose code is neither an accepted value nor missing.
cond = len(hourly_data[has_bad_code])

print('Hourly Pressure Tendency should be between 0 and 8: {}'.format(cond == 0))

In [None]:
# Show the rows whose visibility lies outside the 0..10 range.
hourly_data[(hourly_data['HOURLYVISIBILITY'] < 0) | (hourly_data['HOURLYVISIBILITY'] > 10)]

In [None]:
# Treat visibility readings above 10 as missing.
above_limit = hourly_data['HOURLYVISIBILITY'] > 10
hourly_data.loc[above_limit, 'HOURLYVISIBILITY'] = np.nan

# Re-check the whole 0..10 range on the updated column.
visibility = hourly_data['HOURLYVISIBILITY']
outside_range = (visibility < 0) | (visibility > 10)
cond = len(hourly_data[outside_range])
print('Hourly Visibility should be between 0 and 10: {}'.format(cond  == 0))

In [None]:
# Count rows whose timestamp repeats an earlier index entry.
# The count must be stored in `cond`, the name the print below reads; the
# earlier spelling `conda` left `cond` holding the result of the previous
# (visibility) check, so this cell never reported on duplicates.
cond = len(hourly_data[hourly_data.index.duplicated()])
print('Date index contains no duplicate entries: {}'.format(cond == 0))

In [None]:
# is_monotonic_increasing also accepts equal neighbouring values, so on its own
# it only proves "non-decreasing". Requiring a unique index as well makes the
# reported property match the message: strictly increasing.
print('Date index is strictly increasing: {}'
      .format(hourly_data.index.is_monotonic_increasing
              and hourly_data.index.is_unique))

In [None]:
# Put the observations on a regular 60-minute grid: keep the last reading in
# each bin, then move every row one period later.
hourly_data = (
    hourly_data
    .resample('60min')
    .last()
    .shift(periods=1)
)

In [None]:
# Pressure tendency holds discrete codes, so gaps are filled by carrying the
# last known code forward instead of interpolating between codes.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
hourly_data['HOURLYPressureTendency'] =\
    hourly_data['HOURLYPressureTendency'].ffill()

# Remaining gaps in the continuous measurements are filled linearly.
hourly_data = hourly_data.interpolate(method='linear')

# Drop the first row: the one-period shift applied above leaves it empty.
hourly_data = hourly_data.drop(hourly_data.index[0])

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
hourly_data.info()
print()
hourly_data.head()

In [None]:
# Convert the wind direction from degrees to radians once, then store its
# sine and cosine as two new columns in place of the raw angle.
wind_direction_rad = hourly_data['HOURLYWindDirection'] * (2. * np.pi / 360)
hourly_data['HOURLYWindDirectionSin'] = np.sin(wind_direction_rad)
hourly_data['HOURLYWindDirectionCos'] = np.cos(wind_direction_rad)
del hourly_data['HOURLYWindDirection']

In [None]:
# Split the pressure-tendency code into three float32 indicator columns
# (1.0 when the code belongs to the group, 0.0 otherwise):
#   Incr  -> codes 0, 1, 2, 3
#   Decr  -> codes 5, 6, 7, 8
#   Const -> code 4
tendency_code = hourly_data['HOURLYPressureTendency']
hourly_data['HOURLYPressureTendencyIncr'] =\
    tendency_code.isin([0, 1, 2, 3]).astype('float32')
hourly_data['HOURLYPressureTendencyDecr'] =\
    tendency_code.isin([5, 6, 7, 8]).astype('float32')
hourly_data['HOURLYPressureTendencyConst'] =\
    tendency_code.eq(4).astype('float32')

# The original code column is no longer needed once the indicators exist.
del hourly_data['HOURLYPressureTendency']

In [None]:
# List the current column names ahead of the rename below.
hourly_data.columns

In [None]:
# Explicit old -> new column names. Keying the mapping on the source name
# (instead of pairing a list of new names with the columns by position) keeps
# every column correctly labelled even if the column order changes.
# NOTE(review): 'tedency' looks like a typo for 'tendency'. It is kept as-is
# because these names are written to the output CSV and may be relied on
# elsewhere; confirm before renaming.
columns_name_map = {
    'HOURLYVISIBILITY': 'visibility',
    'HOURLYDRYBULBTEMPF': 'dry_bulb_temp_f',
    'HOURLYWETBULBTEMPF': 'wet_bulb_temp_f',
    'HOURLYDewPointTempF': 'dew_point_temp_f',
    'HOURLYRelativeHumidity': 'relative_humidity',
    'HOURLYWindSpeed': 'wind_speed',
    'HOURLYStationPressure': 'station_pressure',
    'HOURLYSeaLevelPressure': 'sea_level_pressure',
    'HOURLYPrecip': 'precip',
    'HOURLYAltimeterSetting': 'altimeter_setting',
    'HOURLYWindDirectionSin': 'wind_direction_sin',
    'HOURLYWindDirectionCos': 'wind_direction_cos',
    'HOURLYPressureTendencyIncr': 'pressure_tedency_incr',
    'HOURLYPressureTendencyDecr': 'pressure_tedency_decr',
    'HOURLYPressureTendencyConst': 'pressure_tedency_const',
}

# Kept for backward compatibility with code that reads the list of new names.
columns_new_name = list(columns_name_map.values())

# Fail loudly instead of exporting a frame with a mix of old and new names.
unmapped_columns = [c for c in hourly_data.columns if c not in columns_name_map]
if unmapped_columns:
    raise ValueError(
        'No new name defined for columns: {}'.format(unmapped_columns))

hourly_data_renamed = hourly_data.rename(columns=columns_name_map)
    

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
hourly_data_renamed.info()
print()
# A bare last expression lets the notebook render the preview as a table.
hourly_data_renamed.head()

In [None]:
# Gather the summary figures once, then report them.
first_stamp = hourly_data_renamed.index[0]
last_stamp = hourly_data_renamed.index[-1]
span_days = (last_stamp - first_stamp).days
size_megabytes = round(sys.getsizeof(hourly_data_renamed) / 1000000, 2)
n_observations, n_features = hourly_data_renamed.shape

print('# of megabytes held by dataframe: {}'.format(size_megabytes))
print('# of features: {}'.format(n_features))
print('# of observations: {}'.format(n_observations))
print('Start date: {}'.format(str(first_stamp)))
print('End date: {}'.format(str(last_stamp)))
print('# of days: {}'.format(span_days))
# Months and years are approximated as 30 and 365 days respectively.
print('# of months: {}'.format(round(span_days / 30, 2)))
print('# of years: {}'.format(round(span_days / 365, 2)))

In [None]:
# Write the cleaned, renamed frame to disk. float_format='%g' formats floats
# with C-style %g (at most six significant digits, no trailing zeros).
hourly_data_renamed.to_csv(
    path_or_buf="data/noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv",
    float_format='%g',
)