# Preprocess Weather Data 
This notebook performs preprocessing for NYC weather dataset captured at Central Park

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [3]:
# Read NYC weather data 
wdata_2023 = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')
wdata_2024 = pd.read_csv('../data/landing/external/NYC_weather_2024.csv')

  wdata_2023 = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')


In [4]:
# Combine the two datasets
wdata = pd.concat([wdata_2023, wdata_2024])

In [5]:
wdata.head(5)

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,OD1,OE1,OE2,OE3,RH1,RH2,RH3,WA1,REM,EQD
0,72505394728,2023-01-01T00:06:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11812/31/22 19:06:03 SPECI KNYC 010006Z 000...,
1,72505394728,2023-01-01T00:18:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10612/31/22 19:18:03 SPECI KNYC 010018Z 030...,
2,72505394728,2023-01-01T00:31:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11112/31/22 19:31:03 SPECI KNYC 010031Z 000...,
3,72505394728,2023-01-01T00:44:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10212/31/22 19:44:03 SPECI KNYC 010044Z 000...,
4,72505394728,2023-01-01T00:49:00,6,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET09712/31/22 19:49:03 SPECI KNYC 010049Z 000...,


## Data Cleaning

In [6]:
# Convert the 'Date' column to datetime format
wdata['DATE'] = pd.to_datetime(wdata['DATE'])
# Filter the data between 2023-12-01 and 2024-5-31
wdata = wdata[(wdata['DATE'] >= '2023-12-01') & (wdata['DATE'] <= '2024-05-31')]

In [7]:
# Check columns
print(wdata.columns) 

Index(['STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL', 'WND', 'CIG',
       'VIS', 'TMP', 'DEW', 'SLP', 'AA1', 'AA2', 'AA3', 'AB1', 'AD1', 'AE1',
       'AH1', 'AH2', 'AH3', 'AH4', 'AH5', 'AH6', 'AI1', 'AI2', 'AI3', 'AI4',
       'AI5', 'AI6', 'AJ1', 'AK1', 'AL1', 'AM1', 'AN1', 'AT1', 'AT2', 'AT3',
       'AT4', 'AT5', 'AU1', 'AU2', 'AW1', 'AW2', 'AW3', 'AX1', 'AX2', 'AX3',
       'AX4', 'GA1', 'GA2', 'GA3', 'GD1', 'GD2', 'GD3', 'GE1', 'GF1', 'KA1',
       'KA2', 'KB1', 'KB2', 'KB3', 'KC1', 'KC2', 'KD1', 'KD2', 'KE1', 'KG1',
       'KG2', 'MA1', 'MD1', 'MF1', 'MG1', 'MH1', 'MK1', 'MW1', 'OC1', 'OD1',
       'OE1', 'OE2', 'OE3', 'RH1', 'RH2', 'RH3', 'WA1', 'REM', 'EQD'],
      dtype='object')


In [8]:
# Preserve only the columns we need 
wdata = wdata[['DATE', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']]
wdata.head(5)

Unnamed: 0,DATE,CIG,WND,VIS,TMP,DEW,SLP
10800,2023-12-01 00:51:00,"22000,5,9,N","200,5,N,0041,5","016093,5,N,5",945,-285,102015
10801,2023-12-01 01:51:00,"22000,5,9,N","999,9,V,0021,5","016093,5,N,5",895,-225,102005
10802,2023-12-01 02:51:00,"22000,5,9,N","999,9,V,0031,5","016093,5,N,5",895,-225,102045
10803,2023-12-01 03:51:00,"22000,5,9,N","999,9,V,0031,5","016093,5,N,5",835,-175,102075
10804,2023-12-01 04:51:00,"22000,5,9,N","999,9,V,0031,5","016093,5,N,5",785,-175,102085


In [9]:
# check data types
print(wdata.dtypes)

DATE    datetime64[ns]
CIG             object
WND             object
VIS             object
TMP             object
DEW             object
SLP             object
dtype: object


### Missing Data and duplicates

In [10]:
# Drop rows with missing values and duplicates
wdata = wdata.dropna()
wdata = wdata.drop_duplicates() 

In [11]:
import pandas as pd

# Function to convert string values to float, ignoring quality codes
def convert_to_float(value):
    try:
        # Split the string by commas and take the relevant part
        main_value = value.split(',')[0]
        
        # Check if the main value is not empty
        if main_value:
            # Handle positive/negative signs and leading zeros
            if main_value.startswith('+') or main_value.startswith('-'):
                return float(main_value) / 10
            else:
                return float(main_value.lstrip('0')) / 10
        else:
            return None  # Handle empty strings or invalid values
    except ValueError:
        return None  # Handle cases where conversion to float fails

# Function to extract the ceiling height from the CIG column
def extract_ceiling_height(cig_value):
    parts = cig_value.split(',')
    if len(parts) > 0 and parts[0]:
        return float(parts[0].lstrip('0'))  # Convert the first part to float (ceiling height)
    else:
        return None  # or return a default value like 0.0 if you prefer

# Safe function to extract wind speed and handle any possible format issues
def extract_wind_speed(wnd_value):
    parts = wnd_value.split(',')
    if len(parts) > 3 and parts[3]:
        return convert_to_float(parts[3])  # Wind speed is the 4th part
    else:
        return None  # or return a default value like 0.0 if you prefer

# Apply conversion to the relevant columns in your wdata DataFrame
wdata['CIG'] = wdata['CIG'].apply(extract_ceiling_height)  
wdata['WND'] = wdata['WND'].apply(extract_wind_speed)
wdata['VIS'] = wdata['VIS'].apply(convert_to_float)
wdata['TMP'] = wdata['TMP'].apply(convert_to_float)
wdata['DEW'] = wdata['DEW'].apply(convert_to_float)
wdata['SLP'] = wdata['SLP'].apply(convert_to_float)


## Feature Engineering

### Temporal Features

In [12]:
# Extract hour and date from the 'DATE' column
wdata['HOUR'] = wdata['DATE'].dt.hour
wdata['DATE'] = wdata['DATE'].dt.date

In [13]:
# rearrange columns
wdata = wdata[['DATE', 'HOUR', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']]
wdata.head(5)

Unnamed: 0,DATE,HOUR,CIG,WND,VIS,TMP,DEW,SLP
10800,2023-12-01,0,22000.0,4.1,1609.3,9.4,-2.8,1020.1
10801,2023-12-01,1,22000.0,2.1,1609.3,8.9,-2.2,1020.0
10802,2023-12-01,2,22000.0,3.1,1609.3,8.9,-2.2,1020.4
10803,2023-12-01,3,22000.0,3.1,1609.3,8.3,-1.7,1020.7
10804,2023-12-01,4,22000.0,3.1,1609.3,7.8,-1.7,1020.8


In [14]:
# check outliers
wdata.describe()

Unnamed: 0,HOUR,CIG,WND,VIS,TMP,DEW,SLP
count,6024.0,6024.0,5130.0,6024.0,6024.0,6024.0,6024.0
mean,11.215969,12094.48174,128.060468,4255.780362,38.766368,38.439741,3488.458715
std,6.818342,18679.030021,330.073027,16908.030531,169.777815,186.927909,4012.955824
min,0.0,61.0,1.5,40.2,-8.3,-19.4,982.1
25%,5.0,396.0,2.1,965.6,4.4,-3.3,1012.3
50%,11.0,1829.0,3.1,1609.3,8.3,3.3,1019.7
75%,17.0,22000.0,4.6,1609.3,13.3,8.3,9999.9
max,23.0,99999.0,999.9,99999.9,999.9,999.9,9999.9


## Export to raw data

In [25]:
# Export the cleaned data
wdata.to_csv('../data/raw/NYC_weather_raw.csv', index=False)