# Holiday Dataset
For this, we will extract holiday information using the `workalendar` Python library.

In [1]:
# Import our libraries
import pandas as pd
from workalendar.usa import NewYork

In [2]:
# Hard-coded for just the 2021 and 2022 holiday schedules.
# A single calendar instance is enough to serve both years.
de_calendar = NewYork()

# The "observed" entries are kept on purpose: they represent a public
# holiday's effect on a weekday when the holiday itself is scheduled for the
# weekend.
holidays1 = pd.DataFrame(de_calendar.holidays(2021), columns=["date", "holiday"])
holidays2 = pd.DataFrame(de_calendar.holidays(2022), columns=["date", "holiday"])

# ignore_index=True renumbers the rows 0..n-1. Without it both yearly frames
# contribute their own 0-based labels, so the combined frame would carry
# duplicate index labels (and write them out again when saved with its index).
holiday_data = pd.concat([holidays1, holidays2], ignore_index=True)

In [3]:
# Individual holidays occur too rarely to be modelled one by one, so every
# entry is collapsed into a plain truthiness flag.
holiday_data['holiday'] = holiday_data['holiday'].map(bool)

In [4]:
# Persist the curated holiday table.
# Possible follow-up: convert the flag into either public holiday/Christmas
# or New Year's categories instead of a single value.
holiday_csv_path = '../data/curated/holiday_data.csv'
holiday_data.to_csv(holiday_csv_path)

# Weather
This dataset was requested and downloaded from the https://www.ncdc.noaa.gov/cdo-web/search website. It contains hourly recordings from JFK INTERNATIONAL AIRPORT, NY US, between September 2021 and the end of March.

In [5]:
# Read the raw hourly climate export into a DataFrame.
hourly_climate_csv = '../data/datasets/Hourly_climate.csv'
climate_data = pd.read_csv(hourly_climate_csv)

In [6]:
# Only these four columns are used further down; everything else is dropped.
climate_columns_to_keep = ['DATE', 'DEW', 'TMP', 'WND']
climate_data = climate_data.loc[:, climate_columns_to_keep]

In [7]:
# Functions 
#####################
# This is to reformat TMP and DEW attribute 
def tmp_convert(temp):
    """Convert a signed, scaled-by-ten reading such as ``'+0123'`` to a float.

    The first character is read as the sign and the remaining characters as
    the magnitude in tenths, so ``'+0123'`` becomes ``12.3`` and ``'-0050'``
    becomes ``-5.0``. Any leading character other than ``'+'`` is treated as
    a minus sign.

    Parameters
    ----------
    temp : str
        Encoded reading. Missing cells (``None``/NaN) and empty strings are
        accepted and reported as missing.

    Returns
    -------
    float
        The decoded value, or ``np.nan`` when the input is missing or its
        magnitude is the sentinel ``'9999'``.

    Raises
    ------
    ValueError
        If the magnitude part is not a valid number.
    """
    # Empty cells come out of pandas as NaN/None rather than strings; slicing
    # them would raise, so report them as missing up front.
    if pd.isna(temp) or temp == '':
        return np.nan

    magnitude = temp[1:]
    if magnitude == '9999':
        return np.nan

    sign = 1 if temp[0] == '+' else -1
    return float(magnitude) * sign / 10
# this is to reformat WND attribute 
def wnd_convert(wnd):
    """Decode the second-to-last comma-separated field of a WND entry.

    The field is divided by ten, so ``'160,1,N,0046,1'`` yields ``4.6``.
    NOTE(review): that field is presumably the wind speed - confirm against
    the NOAA field layout.

    Parameters
    ----------
    wnd : str
        Comma-separated WND entry with at least two fields. Missing cells
        (``None``/NaN) and empty strings are accepted and reported as missing.

    Returns
    -------
    float
        The decoded value, or ``np.nan`` when the input is missing or the
        field is the sentinel ``'9999'``.

    Raises
    ------
    IndexError
        If the entry has fewer than two comma-separated fields.
    ValueError
        If the selected field is not a valid number.
    """
    # Empty cells come out of pandas as NaN/None rather than strings; calling
    # .split on them would raise, so report them as missing up front.
    if pd.isna(wnd) or wnd == '':
        return np.nan

    field = wnd.split(',')[-2]
    if field == '9999':
        return np.nan
    return float(field) / 10
#################

# Reshape the raw readings into the layout used by the taxi dataset attributes.
import numpy as np

# The raw DATE strings separate date and time with a 'T'; swap it for a space
# and parse the result into timestamps.
parsed_dates = climate_data['DATE'].str.replace(r'T', ' ', regex=True).astype('datetime64[ns]')

# Hourly key: every observation rounded to its nearest full hour.
climate_data['hourly_timestamp'] = parsed_dates.dt.round('h')

# DATE itself keeps only the calendar-day part of the timestamp.
climate_data['DATE'] = parsed_dates.dt.date

# TMP and DEW go through the same two steps: remove each comma together with
# the character that follows it, then decode the remaining signed value.
for signed_column in ('TMP', 'DEW'):
    stripped = climate_data[signed_column].str.replace(r',.', '', regex=True)
    climate_data[signed_column] = stripped.apply(tmp_convert)

# WND is decoded straight from its raw comma-separated form.
climate_data['WND'] = climate_data['WND'].apply(wnd_convert)

In [8]:
# Keep a single row per rounded hour: the first one encountered wins.
climate_data = climate_data.drop_duplicates(subset='hourly_timestamp', keep='first')

In [9]:
# True if any cell in the frame is missing. When this shows False there are
# no null values, so no imputation is needed.
climate_data.isna().to_numpy().any()

False

In [10]:
# Persist the processed hourly climate table.
climate_csv_path = '../data/curated/Hourly_climate_processed.csv'
climate_data.to_csv(climate_csv_path)

# Covid Dataset

In [11]:
# Read the raw daily COVID-19 counts; only the 7-day average is of interest here.
covid_counts_csv = '../data/datasets/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv'
covid_7_avg = pd.read_csv(covid_counts_csv)

In [12]:
# Keep only the date and the 7-day average case count.
covid_columns_to_keep = ['date_of_interest', 'ALL_CASE_COUNT_7DAY_AVG']
covid_7_avg = covid_7_avg.loc[:, covid_columns_to_keep]

In [13]:
# Turn the date column into real timestamps so it can be compared by date.
covid_7_avg = covid_7_avg.astype({'date_of_interest': 'datetime64[ns]'})

In [14]:
# Remove every recording dated before 2021-09-01.
on_or_after_cutoff = covid_7_avg['date_of_interest'] >= '2021-09-01'
covid_7_avg = covid_7_avg[on_or_after_cutoff]

In [15]:
# Persist the filtered 7-day average table as CSV.
covid_csv_path = '../data/curated/COVID-19_7-AVG.csv'
covid_7_avg.to_csv(covid_csv_path)