# Weather Data Transformation

This notebook focuses on transforming the cleaned weather data through feature engineering, encoding, and preparation for modeling.

In [2]:
from datetime import datetime, timedelta
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Plot defaults: seaborn white-grid style and a 12x8 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load Cleaned Data

In [3]:
# Read the cleaned weather CSV produced by the previous stage
cleaned_csv = "../../data/processed/weather/NewYork_Weather 2023-01-01 to 2025-03-03_Cleaned.csv"
df = pd.read_csv(cleaned_csv)

# CSV stores timestamps as text, so parse them back into datetime64 columns
for ts_col in ('datetime', 'sunrise', 'sunset'):
    df[ts_col] = pd.to_datetime(df[ts_col])

print(f"Data shape: {df.shape}")
df.head()

Data shape: (793, 33)


Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,NewYork,2023-01-01,24.0,10.0,16.7,24.0,10.0,16.7,9.7,65.2,...,16.8,7,10,2023-01-01 07:06:52,2023-01-01 17:53:11,0.3,Clear,Clear conditions throughout the day.,clear-day,"42752099999,42662099999,42754099999,remote,VAID"
1,NewYork,2023-01-02,22.0,8.0,15.4,22.0,7.9,15.3,8.2,64.1,...,17.2,7,10,2023-01-02 07:07:10,2023-01-02 17:53:49,0.34,Clear,Clear conditions throughout the day.,clear-day,"42752099999,42662099999,42754099999,remote,VAID"
2,NewYork,2023-01-03,20.7,9.0,14.7,20.7,7.3,14.6,7.7,64.5,...,17.2,7,10,2023-01-03 07:07:28,2023-01-03 17:54:28,0.37,Clear,Clear conditions throughout the day.,clear-day,"42752099999,42662099999,42754099999,remote,VAID"
3,NewYork,2023-01-04,19.0,7.0,13.0,19.0,4.8,12.6,6.1,65.6,...,17.4,7,10,2023-01-04 07:07:44,2023-01-04 17:55:08,0.41,Clear,Clear conditions throughout the day.,clear-day,"42752099999,42662099999,42754099999,remote,VAID"
4,NewYork,2023-01-05,19.0,8.0,13.5,19.0,5.3,12.8,8.2,71.2,...,14.5,6,10,2023-01-05 07:07:58,2023-01-05 17:55:48,0.44,Clear,Clear conditions throughout the day.,clear-day,"42752099999,42662099999,42754099999,remote,VAID"


## 2. Temporal Feature Engineering

In [4]:
# Derive calendar features from the `datetime` column
stamp = df['datetime'].dt
df['year'] = stamp.year
df['month'] = stamp.month
df['day'] = stamp.day
df['day_of_week'] = stamp.dayofweek  # 0 = Monday ... 6 = Sunday
df['day_of_year'] = stamp.dayofyear

# Saturday (5) and Sunday (6) are flagged as weekend
weekend_days = [5, 6]
df['is_weekend'] = df['day_of_week'].isin(weekend_days).astype(int)
df['quarter'] = stamp.quarter

# Map a month number to a season label
def get_season(month):
    """Return the season name for a month number.

    Months 12, 1, 2 map to 'Winter'; 3-5 to 'Spring'; 6-8 to 'Summer'.
    Every other value (including 9-11) maps to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return 'Fall'

# Label each row with its season, derived from the month number
df['season'] = df['month'].map(get_season)

# Hours of daylight: time elapsed between sunrise and sunset
daylight = df['sunset'] - df['sunrise']
df['day_length'] = daylight.dt.total_seconds() / 3600

# Preview the engineered temporal columns
temporal_preview = ['datetime', 'month', 'day', 'day_of_week', 'is_weekend', 'season', 'day_length']
df[temporal_preview].head()

Unnamed: 0,datetime,month,day,day_of_week,is_weekend,season,day_length
0,2023-01-01,1,1,6,1,Winter,10.771944
1,2023-01-02,1,2,0,0,Winter,10.7775
2,2023-01-03,1,3,1,0,Winter,10.783333
3,2023-01-04,1,4,2,0,Winter,10.79
4,2023-01-05,1,5,3,0,Winter,10.797222


## 3. Weather Condition Features

In [5]:
# Create weather condition indicators: 1 when the (case-insensitive) regex
# matches the free-text `conditions` column, 0 otherwise.
# na=False treats a missing `conditions` value as "no match"; without it
# str.contains can yield NaN for that row and .astype(int) then raises.
condition_patterns = {
    'is_clear': 'Clear',
    'is_rainy': 'Rain',
    'is_cloudy': 'Cloud|Fog',
    'is_snowy': 'Snow',
    'is_stormy': 'Storm|Thunder',
}
for flag_name, pattern in condition_patterns.items():
    df[flag_name] = df['conditions'].str.contains(pattern, case=False, na=False).astype(int)

# Create precipitation indicators
df['has_precipitation'] = (df['precip'] > 0).astype(int)
# The threshold is the 90th percentile over ALL days (dry days included), so if
# roughly 90% or more of the days have zero precip the threshold is 0 and this
# flag coincides with has_precipitation.
df['heavy_precipitation'] = (df['precip'] > df['precip'].quantile(0.9)).astype(int)

# Create temperature indicators
# NOTE(review): the 0 threshold assumes `temp` is in degrees Celsius - confirm units.
df['freezing'] = (df['temp'] <= 0).astype(int)
# `hot` / `cold` are relative to this dataset: top / bottom 10% of daily mean temp
df['hot'] = (df['temp'] >= df['temp'].quantile(0.9)).astype(int)
df['cold'] = (df['temp'] <= df['temp'].quantile(0.1)).astype(int)

# Temperature range for the day
df['temp_range'] = df['tempmax'] - df['tempmin']

# Display new features
weather_cols = ['datetime', 'is_clear', 'is_rainy', 'is_cloudy', 'is_snowy', 'is_stormy', 
               'has_precipitation', 'heavy_precipitation', 'freezing', 'hot', 'cold', 'temp_range']
df[weather_cols].head(10)

Unnamed: 0,datetime,is_clear,is_rainy,is_cloudy,is_snowy,is_stormy,has_precipitation,heavy_precipitation,freezing,hot,cold,temp_range
0,2023-01-01,1,0,0,0,0,0,0,0,0,1,14.0
1,2023-01-02,1,0,0,0,0,0,0,0,0,1,14.0
2,2023-01-03,1,0,0,0,0,0,0,0,0,1,11.7
3,2023-01-04,1,0,0,0,0,0,0,0,0,1,12.0
4,2023-01-05,1,0,0,0,0,0,0,0,0,1,11.0
5,2023-01-06,1,0,0,0,0,0,0,0,0,1,14.0
6,2023-01-07,1,0,0,0,0,0,0,0,0,1,15.0
7,2023-01-08,1,0,0,0,0,0,0,0,0,1,14.0
8,2023-01-09,1,0,0,0,0,0,0,0,0,1,15.8
9,2023-01-10,1,0,0,0,0,0,0,0,0,1,17.2


## 4. Cyclic Features

In [6]:
# Create cyclic features for month, day of week and day of year.
# Encoding each value as a (sin, cos) pair on a circle keeps the end of a cycle
# adjacent to its start (e.g. December next to January).

# Month of the year (1-12) -> cyclic
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Day of the week (0-6) -> cyclic
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Day of the year (1-365, or 1-366 in leap years) -> cyclic.
# The period is the real length of each row's year: with a fixed 365, day 366
# of a leap year would wrap past a full cycle and get the same encoding as day 1.
days_in_year = np.where(df['datetime'].dt.is_leap_year, 366, 365)
day_of_year_angle = 2 * np.pi * df['day_of_year'] / days_in_year
df['day_of_year_sin'] = np.sin(day_of_year_angle)
df['day_of_year_cos'] = np.cos(day_of_year_angle)

# Display cyclic features
cyclic_cols = ['datetime', 'month', 'month_sin', 'month_cos', 'day_of_week', 
               'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos']
df[cyclic_cols].head()

Unnamed: 0,datetime,month,month_sin,month_cos,day_of_week,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos
0,2023-01-01,1,0.5,0.866025,6,-0.781831,0.62349,0.017213,0.999852
1,2023-01-02,1,0.5,0.866025,0,0.0,1.0,0.034422,0.999407
2,2023-01-03,1,0.5,0.866025,1,0.781831,0.62349,0.05162,0.998667
3,2023-01-04,1,0.5,0.866025,2,0.974928,-0.222521,0.068802,0.99763
4,2023-01-05,1,0.5,0.866025,3,0.433884,-0.900969,0.085965,0.996298


## 5. Moving Averages and Trends

In [7]:
# Trailing-window statistics for temperature, precipitation and humidity
window_sizes = [3, 7, 14]

# Rolling windows are row-based, so rows must be in chronological order first
df = df.sort_values('datetime')

for span in window_sizes:
    # min_periods=1 lets the first rows use however many observations exist so far
    temp_win = df['temp'].rolling(window=span, min_periods=1)
    precip_win = df['precip'].rolling(window=span, min_periods=1)
    humidity_win = df['humidity'].rolling(window=span, min_periods=1)

    # Window means
    df[f'temp_rolling_{span}d'] = temp_win.mean()
    df[f'precip_rolling_{span}d'] = precip_win.mean()
    df[f'humidity_rolling_{span}d'] = humidity_win.mean()

    # Accumulated precipitation over the window
    df[f'precip_sum_{span}d'] = precip_win.sum()

    # Temperature extremes over the window
    df[f'temp_max_{span}d'] = temp_win.max()
    df[f'temp_min_{span}d'] = temp_win.min()

# Preview a few of the rolling columns next to their source values
rolling_cols = [
    'datetime', 'temp', 'temp_rolling_3d', 'temp_rolling_7d',
    'precip', 'precip_sum_3d', 'precip_sum_7d',
]
df[rolling_cols].head(10)

Unnamed: 0,datetime,temp,temp_rolling_3d,temp_rolling_7d,precip,precip_sum_3d,precip_sum_7d
0,2023-01-01,16.7,16.7,16.7,0.0,0.0,0.0
1,2023-01-02,15.4,16.05,16.05,0.0,0.0,0.0
2,2023-01-03,14.7,15.6,15.6,0.0,0.0,0.0
3,2023-01-04,13.0,14.366667,14.95,0.0,0.0,0.0
4,2023-01-05,13.5,13.733333,14.66,0.0,0.0,0.0
5,2023-01-06,16.1,14.2,14.9,0.0,0.0,0.0
6,2023-01-07,17.8,15.8,15.314286,0.0,0.0,0.0
7,2023-01-08,17.4,17.1,15.414286,0.0,0.0,0.0
8,2023-01-09,16.7,17.3,15.6,0.0,0.0,0.0
9,2023-01-10,16.6,16.9,15.871429,0.0,0.0,0.0


## 6. Scaling and Normalization

In [8]:
# Numeric columns that receive scaled counterparts
numerical_features = [
    'temp', 'tempmax', 'tempmin', 'feelslike', 'humidity', 'precip',
    'precipprob', 'windspeed', 'cloudcover', 'visibility',
    'solarenergy', 'uvindex', 'temp_range', 'day_length',
]
numeric_block = df[numerical_features]

# Z-score version: each listed column is standardized by StandardScaler
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numerical_features] = scaler.fit_transform(numeric_block)

# Min-max version: each listed column is rescaled to the 0-1 range
min_max_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numerical_features] = min_max_scaler.fit_transform(numeric_block)

# Preview the first five standardized columns
preview_cols = ['datetime', *numerical_features[:5]]
df_scaled[preview_cols].head()

Unnamed: 0,datetime,temp,tempmax,tempmin,feelslike,humidity
0,2023-01-01,-1.746681,-1.368436,-1.743049,-1.688067,0.199742
1,2023-01-02,-2.04023,-1.79306,-2.136967,-1.983847,0.141983
2,2023-01-03,-2.198294,-2.069066,-1.940008,-2.131737,0.162986
3,2023-01-04,-2.582165,-2.429996,-2.333926,-2.55428,0.220745
4,2023-01-05,-2.469262,-2.429996,-2.136967,-2.512026,0.514788


## 7. Categorical Encoding

In [9]:
# Text columns to expand into one indicator column per category
categorical_features = ['conditions', 'preciptype', 'season']

# One-hot encode; drop_first=False keeps an indicator for every category
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=False)

# Preview: datetime plus up to three generated columns per encoded feature
encoded_cols = ['datetime']
for prefix in categorical_features:
    generated = [name for name in df_encoded.columns if name.startswith(prefix + '_')]
    encoded_cols += generated[:3]

df_encoded[encoded_cols].head()

Unnamed: 0,datetime,conditions_Clear,conditions_Partially cloudy,conditions_Rain,preciptype_rain,season_Fall,season_Spring,season_Summer
0,2023-01-01,True,False,False,False,False,False,False
1,2023-01-02,True,False,False,False,False,False,False
2,2023-01-03,True,False,False,False,False,False,False
3,2023-01-04,True,False,False,False,False,False,False
4,2023-01-05,True,False,False,False,False,False,False


## 8. Feature Selection

In [10]:
# Three nested feature sets of increasing size; each extends the previous one

# Basic: raw measurements plus simple calendar columns
basic_features = [
    'datetime', 'temp', 'precip', 'humidity', 'windspeed', 'cloudcover',
    'month', 'day_of_week', 'is_weekend', 'season',
]

# Intermediate: basic + condition flags and month / weekday cyclic encodings
intermediate_features = [
    *basic_features,
    'temp_range', 'day_length', 'is_clear', 'is_rainy', 'is_cloudy', 'is_snowy',
    'has_precipitation', 'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos',
]

# Advanced: intermediate + 7-day rolling stats, extreme flags, day-of-year encoding
advanced_features = [
    *intermediate_features,
    'temp_rolling_7d', 'temp_max_7d', 'temp_min_7d',
    'precip_sum_7d', 'humidity_rolling_7d',
    'hot', 'cold', 'heavy_precipitation',
    'day_of_year_sin', 'day_of_year_cos',
]

# Column subsets of the engineered frame, one per feature set
df_basic, df_intermediate, df_advanced = (
    df[selected] for selected in (basic_features, intermediate_features, advanced_features)
)

for label, selected in (
    ("Basic", basic_features),
    ("Intermediate", intermediate_features),
    ("Advanced", advanced_features),
):
    print(f"{label} feature set size: {len(selected)}")

Basic feature set size: 10
Intermediate feature set size: 21
Advanced feature set size: 31


## 9. Final Datasets Preparation

In [11]:
# Create complete transformed dataset (with all features).
# Starts from the one-hot encoded frame and appends, for every numerical
# feature, its standardized (`_scaled`) and min-max (`_minmax`) version.

# Create a master dataset with all engineered features
df_transformed = df_encoded.copy()

# Add scaled versions of numerical features with suffix
for col in numerical_features:
    df_transformed[f"{col}_scaled"] = df_scaled[col]
    df_transformed[f"{col}_minmax"] = df_minmax[col]

# `df` already holds the engineered columns at this point, so it is reported as
# the engineered frame rather than the originally loaded data.
print(f"Engineered data shape: {df.shape}")
print(f"Transformed data shape: {df_transformed.shape}")
print(f"Net columns added by encoding and scaling: {df_transformed.shape[1] - df.shape[1]}")

Original data shape: (793, 77)
Transformed data shape: (793, 112)
Number of features added: 35


In [21]:
# Save the datasets and fitted scalers for modeling.
# Every output file shares one name prefix; only the suffix differs.
processed_dir = Path("../../data/processed/weather")
models_dir = Path("../../models")
file_prefix = "NewYork_Weather 2023-01-01 to 2025-03-03"

# Neither to_csv nor joblib.dump creates missing directories, so make sure
# both targets exist before writing anything.
processed_dir.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)

# Complete transformed dataset plus the three feature sets of different complexity
datasets_to_save = {
    "transformed_all": df_transformed,
    "features_basic": df_basic,
    "features_intermediate": df_intermediate,
    "features_advanced": df_advanced,
}
for suffix, frame in datasets_to_save.items():
    frame.to_csv(processed_dir / f"{file_prefix}_{suffix}.csv", index=False)

# Also save the StandardScaler and MinMaxScaler objects for later use
joblib.dump(scaler, models_dir / f"{file_prefix}_standard_scaler.pkl")
joblib.dump(min_max_scaler, models_dir / f"{file_prefix}_minmax_scaler.pkl")

print("All datasets saved successfully!")

All datasets saved successfully!


## 10. Summary

In this notebook, we've performed comprehensive data transformation on the weather dataset:

1. Engineered temporal features including month, day of week, and season
2. Created weather condition indicators for different weather types
3. Developed cyclic features to handle periodicity in time variables
4. Calculated rolling statistics to capture trends and patterns
5. Applied scaling and normalization to numerical features
6. Performed one-hot encoding for categorical variables
7. Created different feature sets for various modeling needs
8. Generated a complete transformed dataset with all features

The resulting datasets are now ready for use in traffic prediction modeling. The different feature sets (basic, intermediate, advanced) provide flexibility for model complexity trade-offs.