# NOAA Weather Data Scrape

In this section, we retrieve and query weather data for New York City from NOAA (the National Oceanic and Atmospheric Administration).

### Aim: 
- Count and categorise available buildings based on the Zones provided from TLC dataset.

### Data dictionary:
- Can be retrieved from this link: [PLUTO Data Dictionary](https://data.cityofnewyork.us/api/views/64uk-42ks/files/4236b586-c55e-4358-a21a-3647735dd58b?download=true&filename=pluto_datadictionary.pdf) 

In [1]:
# Token: prFURygHhcjchMdwFdWXiQwJyTzpWoDf
# GHCND:USW00094728

In [3]:
import datetime
import json
import os

import numpy as np
import pandas as pd
import requests

# API token: taken from the NOAA_TOKEN environment variable when it is set, so
# the secret does not have to live in the notebook.
# FIXME: the literal fallback is kept only so existing runs keep working;
# rotate this token and delete the fallback.
TOKEN = os.environ.get('NOAA_TOKEN', 'prFURygHhcjchMdwFdWXiQwJyTzpWoDf')

# Station identifier. NOTE(review): presumably the NYC Central Park station
# that the CSV exports below come from - confirm.
STATION_ID = 'GHCND:USW00094728'

In [209]:
# Read both station exports in one pass (the 2021 frame first, then 2022).
# NOTE(review): these absolute paths only exist on one machine - point them at
# a shared data directory before re-running elsewhere.
df2021, df2022 = map(
    pd.read_csv,
    (
        "/Users/oliver/Downloads/72505394728-2.csv",
        "/Users/oliver/Downloads/72505394728.csv",
    ),
)

In [210]:
# Stack the 2021 and 2022 frames row-wise. Each frame keeps its own row labels
# (ignore_index is left at its default), so labels can repeat in the result.
df = pd.concat(objs=[df2021, df2022], axis=0)

In [211]:
# Inspect the column names of the combined frame
df.columns

Index(['STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL', 'WND', 'CIG',
       'VIS', 'TMP', 'DEW', 'SLP', 'AA1', 'AA2', 'AA3', 'AB1', 'AD1', 'AE1',
       'AH1', 'AH2', 'AH3', 'AH4', 'AH5', 'AH6', 'AI1', 'AI2', 'AI3', 'AI4',
       'AI5', 'AI6', 'AJ1', 'AK1', 'AM1', 'AN1', 'AT1', 'AT2', 'AT3', 'AT4',
       'AT5', 'AT6', 'AT7', 'AU1', 'AU2', 'AW1', 'AW2', 'AW3', 'AX1', 'AX2',
       'AX3', 'AX4', 'GA1', 'GA2', 'GA3', 'GD1', 'GD2', 'GD3', 'GE1', 'GF1',
       'GJ1', 'KA1', 'KA2', 'KB1', 'KB2', 'KB3', 'KC1', 'KC2', 'KD1', 'KD2',
       'KE1', 'KG1', 'KG2', 'MA1', 'MD1', 'MF1', 'MG1', 'MH1', 'MK1', 'MW1',
       'OC1', 'OD1', 'OE1', 'OE2', 'OE3', 'RH1', 'RH2', 'RH3', 'WA1', 'REM',
       'EQD', 'AL1'],
      dtype='object')

In [212]:
# Hours in seven consecutive months with the listed day counts (212 days x 24 h).
# NOTE(review): presumably Oct 2021 - Apr 2022, the period preprocess() filters
# to, used as a sanity check on the expected row count - confirm.
(31+30+31+31+28+31+30)*24

5088

In [213]:
def _parse_scaled(column, position, missing_code):
    """Pull one comma-separated sub-field out of a raw column and divide it by 10.

    Parameters
    ----------
    column : pandas.Series
        Raw values such as ``'+0161,1'``. NaN entries are passed through as NaN.
    position : int
        Index of the wanted sub-field after splitting each value on ``','``.
    missing_code : int
        Integer sentinel (before scaling) that marks a missing reading.

    Returns
    -------
    pandas.Series
        Float series aligned with ``column``; missing readings are NaN.

    Raises
    ------
    ValueError
        If the selected sub-field of a non-null value is not an integer.
    """
    def parse(value):
        if pd.isna(value):
            return np.nan
        number = int(value.split(',')[position])
        return np.nan if number == missing_code else number / 10

    # astype(float) guarantees a numeric dtype even for empty or all-NaN input.
    return column.apply(parse).astype(float)


def preprocess(hourly_data, start='2021-10-01', end='2022-05-01'):
    """Clean raw station observations into an hourly weather table.

    Keeps only ``FM-15`` reports, converts the ``TMP``, ``DEW``, ``SLP`` and
    ``AA1`` fields to floats (stored integer / 10, missing codes -> NaN),
    forward-fills gaps from the previous report, and restricts the rows to the
    requested period. ``hourly_data`` itself is not modified.

    Parameters
    ----------
    hourly_data : pandas.DataFrame
        Raw observations. Must contain ``REPORT_TYPE`` plus every column listed
        under *Returns*.
    start, end : str, optional
        Inclusive bounds compared against ``DATE`` as strings. The comparison
        is lexicographic, so with the default ``end`` a timestamp such as
        ``'2022-05-01T00:51:00'`` sorts after ``'2022-05-01'`` and is dropped.
        The defaults reproduce the originally hard-coded period.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TMP, DEW, SLP,
        AA1`` for the selected rows, keeping the input's row labels.

    Raises
    ------
    KeyError
        If a required column is absent from ``hourly_data``.
    ValueError
        If a weather sub-field cannot be parsed as an integer.
    """
    columns = ['DATE',
               'LATITUDE',
               'LONGITUDE',
               'ELEVATION',
               'NAME',
               'TMP',
               'DEW',
               'SLP',
               'AA1']

    # Get the hourly weather report type. Selecting with .loc and a column list
    # yields a new frame, so nothing below writes back into the caller's data.
    is_hourly = hourly_data['REPORT_TYPE'] == 'FM-15'
    hourly = hourly_data.loc[is_hourly, columns]

    # WND is not part of the returned columns, so it is no longer parsed here.
    # assign() replaces each raw text column with a proper float column.
    hourly = hourly.assign(
        TMP=_parse_scaled(hourly['TMP'], 0, 9999),
        DEW=_parse_scaled(hourly['DEW'], 0, 9999),
        SLP=_parse_scaled(hourly['SLP'], 0, 99999),
        # AA1 carries its amount in the second sub-field.
        AA1=_parse_scaled(hourly['AA1'], 1, 9999),
    )

    # Impute missing data using data from the report before. This runs before
    # the period filter so the first in-period row can borrow from an earlier one.
    hourly = hourly.ffill()

    # Filter data to the requested period (both bounds inclusive).
    in_period = hourly['DATE'].between(start, end)
    return hourly.loc[in_period, columns].copy()
    

In [214]:
# Run the cleaning step on the combined station data
df2 = preprocess(hourly_data=df)

In [215]:
# Display the pre-processed result (the notebook truncates long frames)
df2

Unnamed: 0,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TMP,DEW,SLP,AA1
8835,2021-10-01T00:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",16.1,6.7,1021.5,0.0
8836,2021-10-01T01:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",16.1,7.2,1022.1,0.0
8837,2021-10-01T02:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",14.4,7.8,1022.4,0.0
8838,2021-10-01T03:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",13.3,7.2,1022.3,0.0
8839,2021-10-01T04:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",12.8,7.8,1022.3,0.0
...,...,...,...,...,...,...,...,...,...
3871,2022-04-30T19:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",19.4,-8.3,1016.8,0.0
3872,2022-04-30T20:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",18.9,-9.4,1016.9,0.0
3873,2022-04-30T21:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",18.9,-8.3,1016.9,0.0
3874,2022-04-30T22:51:00,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",16.1,-3.9,1017.4,0.0
