# Preprocess Weather Data 

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [2]:
# Read NYC weather data 
wdata_2023 = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')
wdata_2024 = pd.read_csv('../data/landing/external/NYC_weather_2024.csv')

  wdata_2023 = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')


In [3]:
wdata_2023.head(5)

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,OD1,OE1,OE2,OE3,RH1,RH2,RH3,WA1,REM,EQD
0,72505394728,2023-01-01T00:06:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11812/31/22 19:06:03 SPECI KNYC 010006Z 000...,
1,72505394728,2023-01-01T00:18:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10612/31/22 19:18:03 SPECI KNYC 010018Z 030...,
2,72505394728,2023-01-01T00:31:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11112/31/22 19:31:03 SPECI KNYC 010031Z 000...,
3,72505394728,2023-01-01T00:44:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10212/31/22 19:44:03 SPECI KNYC 010044Z 000...,
4,72505394728,2023-01-01T00:49:00,6,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET09712/31/22 19:49:03 SPECI KNYC 010049Z 000...,


In [4]:
wdata_2024.head(5) 

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,OD1,OE1,OE2,OE3,RH1,RH2,RH3,WA1,REM,EQD
0,72505394728,2024-01-01T00:51:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-15,KNYC,V030,...,,,,,,,,,MET10012/31/23 19:51:03 METAR KNYC 010051Z 250...,
1,72505394728,2024-01-01T01:51:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-15,KNYC,V030,...,,,,,,,,,MET09612/31/23 20:51:03 METAR KNYC 010151Z 000...,
2,72505394728,2024-01-01T02:51:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-15,KNYC,V030,...,,,,,,,,,MET10212/31/23 21:51:03 METAR KNYC 010251Z VRB...,
3,72505394728,2024-01-01T03:51:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-15,KNYC,V030,...,,,,,,,,,MET09612/31/23 22:51:03 METAR KNYC 010351Z 250...,
4,72505394728,2024-01-01T04:51:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-15,KNYC,V030,...,,,,,,,,,MET11712/31/23 23:51:03 METAR KNYC 010451Z 260...,


In [5]:
# Combine the two datasets
wdata = pd.concat([wdata_2023, wdata_2024])

## Data Cleaning

In [6]:
# Convert the 'Date' column to datetime format
wdata['DATE'] = pd.to_datetime(wdata['DATE'])
# Filter the data between 2023-12-01 and 2024-5-31
wdata = wdata[(wdata['DATE'] >= '2023-12-01') & (wdata['DATE'] <= '2024-05-31')]

In [7]:
# Check columns
print(wdata.columns) 

Index(['STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL', 'WND', 'CIG',
       'VIS', 'TMP', 'DEW', 'SLP', 'AA1', 'AA2', 'AA3', 'AB1', 'AD1', 'AE1',
       'AH1', 'AH2', 'AH3', 'AH4', 'AH5', 'AH6', 'AI1', 'AI2', 'AI3', 'AI4',
       'AI5', 'AI6', 'AJ1', 'AK1', 'AL1', 'AM1', 'AN1', 'AT1', 'AT2', 'AT3',
       'AT4', 'AT5', 'AU1', 'AU2', 'AW1', 'AW2', 'AW3', 'AX1', 'AX2', 'AX3',
       'AX4', 'GA1', 'GA2', 'GA3', 'GD1', 'GD2', 'GD3', 'GE1', 'GF1', 'KA1',
       'KA2', 'KB1', 'KB2', 'KB3', 'KC1', 'KC2', 'KD1', 'KD2', 'KE1', 'KG1',
       'KG2', 'MA1', 'MD1', 'MF1', 'MG1', 'MH1', 'MK1', 'MW1', 'OC1', 'OD1',
       'OE1', 'OE2', 'OE3', 'RH1', 'RH2', 'RH3', 'WA1', 'REM', 'EQD'],
      dtype='object')


In [8]:
# Preserve only the columns we need 
wdata = wdata[['DATE', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']]
wdata.head(5)

Unnamed: 0,DATE,WND,VIS,TMP,DEW,SLP
10800,2023-12-01 00:51:00,"200,5,N,0041,5","016093,5,N,5",945,-285,102015
10801,2023-12-01 01:51:00,"999,9,V,0021,5","016093,5,N,5",895,-225,102005
10802,2023-12-01 02:51:00,"999,9,V,0031,5","016093,5,N,5",895,-225,102045
10803,2023-12-01 03:51:00,"999,9,V,0031,5","016093,5,N,5",835,-175,102075
10804,2023-12-01 04:51:00,"999,9,V,0031,5","016093,5,N,5",785,-175,102085


In [9]:
# Drop rows with missing values and duplicates
wdata = wdata.dropna()
wdata = wdata.drop_duplicates()

(6024, 6)
