# Preprocess Weather Data 
This notebook performs preprocessing for NYC weather dataset captured at Central Park

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [2]:
# Read NYC weather data 
wdata = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')

  wdata = pd.read_csv('../data/landing/external/NYC_weather_2023.csv')


In [3]:
wdata.head(5)

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,OD1,OE1,OE2,OE3,RH1,RH2,RH3,WA1,REM,EQD
0,72505394728,2023-01-01T00:06:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11812/31/22 19:06:03 SPECI KNYC 010006Z 000...,
1,72505394728,2023-01-01T00:18:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10612/31/22 19:18:03 SPECI KNYC 010018Z 030...,
2,72505394728,2023-01-01T00:31:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET11112/31/22 19:31:03 SPECI KNYC 010031Z 000...,
3,72505394728,2023-01-01T00:44:00,7,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET10212/31/22 19:44:03 SPECI KNYC 010044Z 000...,
4,72505394728,2023-01-01T00:49:00,6,40.77898,-73.96925,42.7,"NY CITY CENTRAL PARK, NY US",FM-16,KNYC,V030,...,,,,,,,,,MET09712/31/22 19:49:03 SPECI KNYC 010049Z 000...,


In [4]:
# count the number of total entries
total = wdata.shape[0]

## Data Cleaning

In [5]:
# Convert the 'Date' column to datetime format
wdata['DATE'] = pd.to_datetime(wdata['DATE'])
# Filter the data between 2023-07-01 and 2023-12-31
wdata = wdata[(wdata['DATE'] >= '2023-07-01') & (wdata['DATE'] <= '2023-12-31')]

In [6]:
# Check columns
print(wdata.columns) 

Index(['STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
       'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL', 'WND', 'CIG',
       'VIS', 'TMP', 'DEW', 'SLP', 'AA1', 'AA2', 'AA3', 'AB1', 'AD1', 'AE1',
       'AH1', 'AH2', 'AH3', 'AH4', 'AH5', 'AH6', 'AI1', 'AI2', 'AI3', 'AI4',
       'AI5', 'AI6', 'AJ1', 'AK1', 'AL1', 'AM1', 'AN1', 'AT1', 'AT2', 'AT3',
       'AT4', 'AT5', 'AU1', 'AU2', 'AW1', 'AW2', 'AW3', 'AX1', 'AX2', 'AX3',
       'AX4', 'GA1', 'GA2', 'GA3', 'GD1', 'GD2', 'GD3', 'GE1', 'GF1', 'KA1',
       'KA2', 'KB1', 'KB2', 'KB3', 'KC1', 'KC2', 'KD1', 'KD2', 'KE1', 'KG1',
       'KG2', 'MA1', 'MD1', 'MF1', 'MG1', 'MH1', 'MK1', 'MW1', 'OC1', 'OD1',
       'OE1', 'OE2', 'OE3', 'RH1', 'RH2', 'RH3', 'WA1', 'REM', 'EQD'],
      dtype='object')


In [7]:
# Preserve only the columns we need 
wdata = wdata[['DATE', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']]
wdata.head(5)

Unnamed: 0,DATE,CIG,WND,VIS,TMP,DEW,SLP
5700,2023-07-01 00:51:00,"22000,5,9,N","999,9,N,9999,9","009656,5,N,5",2395,1335,101715
5701,2023-07-01 01:51:00,"22000,5,9,N","999,9,C,0000,5","009656,5,N,5",2335,1335,101765
5702,2023-07-01 02:51:00,"22000,5,9,N","999,9,C,0000,5","009656,5,N,5",2335,1285,101785
5703,2023-07-01 03:51:00,"22000,5,9,N","230,5,N,0031,5","009656,5,N,5",2285,1285,101775
5704,2023-07-01 04:51:00,"22000,5,9,N","210,5,N,0015,5","009656,5,N,5",2285,1175,101745


In [8]:
# check data types
print(wdata.dtypes)

DATE    datetime64[ns]
CIG             object
WND             object
VIS             object
TMP             object
DEW             object
SLP             object
dtype: object


### Missing Data and duplicates

In [11]:
# check number of entries now 
print(wdata.shape[0])
# record this as total
total = wdata.shape[0]

6110


In [14]:
# check for missing values
print(wdata.isnull().sum())

DATE    0
CIG     0
WND     0
VIS     0
TMP     0
DEW     0
SLP     0
dtype: int64


In [13]:
# Check percetage of data left
remaining = wdata.shape[0]
percentage = (remaining/total)*100
print(f'{percentage}% of the data remains')

100.0% of the data remains


In [11]:
import pandas as pd

# Function to convert string values to float, ignoring quality codes
def convert_to_float(value):
    try:
        # Split the string by commas and take the relevant part
        main_value = value.split(',')[0]
        
        # Check if the main value is not empty
        if main_value:
            # Handle positive/negative signs and leading zeros
            if main_value.startswith('+') or main_value.startswith('-'):
                return float(main_value) / 10
            else:
                return float(main_value.lstrip('0')) / 10
        else:
            return None  # Handle empty strings or invalid values
    except ValueError:
        return None  # Handle cases where conversion to float fails

# Function to extract the ceiling height from the CIG column
def extract_ceiling_height(cig_value):
    parts = cig_value.split(',')
    if len(parts) > 0 and parts[0]:
        return float(parts[0].lstrip('0'))  # Convert the first part to float (ceiling height)
    else:
        return None  # or return a default value like 0.0 if you prefer

# Safe function to extract wind speed and handle any possible format issues
def extract_wind_speed(wnd_value):
    parts = wnd_value.split(',')
    if len(parts) > 3 and parts[3]:
        return convert_to_float(parts[3])  # Wind speed is the 4th part
    else:
        return None  # or return a default value like 0.0 if you prefer

# Apply conversion to the relevant columns in your wdata DataFrame
wdata['CIG'] = wdata['CIG'].apply(extract_ceiling_height)  
wdata['WND'] = wdata['WND'].apply(extract_wind_speed)
wdata['VIS'] = wdata['VIS'].apply(convert_to_float)
wdata['TMP'] = wdata['TMP'].apply(convert_to_float)
wdata['DEW'] = wdata['DEW'].apply(convert_to_float)
wdata['SLP'] = wdata['SLP'].apply(convert_to_float)


In [12]:
# List of weather columns
weather_columns = ['CIG', 'VIS', 'TMP', 'DEW', 'SLP']

# Define the missing value conditions
missing_values = [99999.0, 99999.9, 99999.9, 9999.9, 9999.9, 999.9] 

# Calculate the percentage of missing values for each column
missing_percentage = wdata.apply(lambda col: (col.isin(missing_values).sum() / len(col)) * 100)

# Show the result
print(missing_percentage)

DATE     0.000000
CIG      3.191489
WND      7.135843
VIS      3.126023
TMP      2.995090
DEW      2.995090
SLP     28.363339
dtype: float64


## Imputation

In [13]:
# Replace specific values with NaN
replace_vals = [99999.0, 99999.9, 99999.9, 9999.9, 9999.9, 999.9] 
wdata.replace(replace_vals, np.nan, inplace=True)

# Impute missing values with the mean of each column
wdata['CIG'] = wdata['CIG'].fillna(wdata['CIG'].mean())
wdata['VIS'] = wdata['VIS'].fillna(wdata['VIS'].mean())
wdata['TMP'] = wdata['TMP'].fillna(wdata['TMP'].mean())
wdata['DEW'] = wdata['DEW'].fillna(wdata['DEW'].mean())
wdata['SLP'] = wdata['SLP'].fillna(wdata['SLP'].mean())
wdata['WND'] = wdata['WND'].fillna(wdata['WND'].mean())


In [14]:
# check for 99999.0, 99999.9, 99999.9, 9999.9, 9999.9, 999.9
missing_percentage = wdata.apply(lambda col: (col.isin(missing_values).sum() / len(col)) * 100)
print(missing_percentage) 

DATE    0.0
CIG     0.0
WND     0.0
VIS     0.0
TMP     0.0
DEW     0.0
SLP     0.0
dtype: float64


## Feature Engineering

### Temporal Features

In [15]:
# Extract hour and date from the 'DATE' column
wdata['HOUR'] = wdata['DATE'].dt.hour
wdata['DATE'] = wdata['DATE'].dt.date

In [16]:
# rearrange columns
wdata = wdata[['DATE', 'HOUR', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']]
wdata.head(5)

Unnamed: 0,DATE,HOUR,CIG,WND,VIS,TMP,DEW,SLP
5700,2023-07-01,0,22000.0,2.631967,965.6,23.9,13.3,1017.1
5701,2023-07-01,1,22000.0,2.631967,965.6,23.3,13.3,1017.6
5702,2023-07-01,2,22000.0,2.631967,965.6,23.3,12.8,1017.8
5703,2023-07-01,3,22000.0,3.1,965.6,22.8,12.8,1017.7
5704,2023-07-01,4,22000.0,1.5,965.6,22.8,11.7,1017.4


In [17]:
# check outliers
wdata.describe()

Unnamed: 0,HOUR,CIG,WND,VIS,TMP,DEW,SLP
count,6110.0,6110.0,6110.0,6110.0,6110.0,6110.0,6110.0
mean,11.297381,11279.596788,2.631967,1265.523822,17.229661,11.535499,1016.097624
std,6.812248,10371.788003,0.949606,502.053348,7.522985,8.571824,6.599489
min,0.0,61.0,1.5,40.2,-3.9,-15.6,983.6
25%,5.0,579.0,2.1,965.6,11.1,6.7,1012.8
50%,11.0,11279.596788,2.631967,1609.3,17.8,12.8,1016.097624
75%,17.0,22000.0,2.631967,1609.3,23.3,18.9,1018.5
max,23.0,22000.0,12.4,1609.3,33.9,24.4,1039.1


## Export to raw data

In [24]:
# Export the cleaned data
wdata.to_csv('../data/raw/NYC_weather_raw.csv', index=False)