RANDOM FOREST

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Sub-problem B (Random Forest): predict how far an hourly traffic count
# deviates from its per-station/hour/weekday mean using daily weather readings,
# then report the RMSE on a 20% hold-out split.

# Load data. `nrows` stops parsing after the first 1000 traffic rows instead of
# reading the entire file and discarding the rest (sample size for faster processing).
traffic = pd.read_csv("trafficData.csv", nrows=1000)
stations = pd.read_csv("trafficStations.csv")
weather = pd.read_csv("weatherStations.csv")

# Melt traffic data to long format: the 24 hourly columns become one
# (hour, traffic_count) pair per row.
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'year', 'month', 'day', 'day_of_week', 'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Recover the integer hour from the 'hour_XX' label. expand=False makes
# str.extract return a Series rather than a one-column DataFrame.
traffic_long['hour'] = traffic_long['hour'].str.extract(r'hour_(\d+)', expand=False).astype(int)
traffic_long['date'] = pd.to_datetime(traffic_long[['year', 'month', 'day']])
traffic_long['datetime'] = traffic_long['date'] + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge traffic with station and weather data. Weather is joined per
# (station_key, date), so every hour of a day receives the same weather values.
traffic_weather = traffic_long.merge(stations, on='station_key', how='left')
weather['date'] = pd.to_datetime(weather[['year', 'month', 'day']])
weather_filtered = weather[['date', 'station_key', 'rainfall', 'min_temp', 'max_temp', 'wind_speed']]
merged = traffic_weather.merge(weather_filtered, on=['station_key', 'date'], how='left')

# Drop rows with missing values.
# NOTE(review): this checks every merged column, including station metadata
# that is never used as a feature -- confirm that is intended.
merged = merged.dropna()

# Create a baseline prediction (average traffic per station, hour and weekday)
merged['baseline'] = merged.groupby(['station_key', 'hour', 'day_of_week'])['traffic_count'].transform('mean')

# Calculate the deviation from that baseline; this is the regression target
merged['delta_traffic'] = merged['traffic_count'] - merged['baseline']

# Train a model to predict delta_traffic from weather
features = merged[['rainfall', 'min_temp', 'max_temp', 'wind_speed']]
target = merged['delta_traffic']

# NOTE(review): the split is row-level, so the 24 hourly rows of one
# station-day (identical weather features) can land on both sides of the
# split -- consider a split grouped by date.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is in the same units as traffic_count
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Subproblem B – RMSE (Random Forest): {rmse:.2f}")


SVR

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Sub-problem B (SVR): predict how far an hourly traffic count deviates from
# its per-station/hour/weekday mean using standardized daily weather readings,
# then report the RMSE on a 20% hold-out split.

# Load datasets (sample for speed). `nrows` stops parsing after the first 1000
# traffic rows instead of reading the whole file and discarding the rest.
traffic = pd.read_csv("trafficData.csv", nrows=1000)
stations = pd.read_csv("trafficStations.csv")
weather = pd.read_csv("weatherStations.csv")

# Melt wide traffic format into long: the 24 hourly columns become one
# (hour, traffic_count) pair per row.
hour_cols = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'year', 'month', 'day', 'day_of_week', 'public_holiday', 'school_holiday'],
    value_vars=hour_cols,
    var_name='hour',
    value_name='traffic_count'
)
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
traffic_long['hour'] = traffic_long['hour'].str.extract(r'hour_(\d+)', expand=False).astype(int)
traffic_long['date'] = pd.to_datetime(traffic_long[['year', 'month', 'day']])
traffic_long['datetime'] = traffic_long['date'] + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station + weather data. Weather is joined per (station_key, date),
# so every hour of a day receives the same weather values.
merged = traffic_long.merge(stations, on='station_key', how='left')
weather['date'] = pd.to_datetime(weather[['year', 'month', 'day']])
weather_filtered = weather[['date', 'station_key', 'rainfall', 'min_temp', 'max_temp', 'wind_speed']]
merged = merged.merge(weather_filtered, on=['station_key', 'date'], how='left')

# Drop missing.
# NOTE(review): this checks every merged column, including station metadata
# that is never used as a feature -- confirm that is intended.
merged = merged.dropna()

# Compute baseline (mean per station, hour and weekday) and deviation from it
merged['baseline'] = merged.groupby(['station_key', 'hour', 'day_of_week'])['traffic_count'].transform('mean')
merged['delta_traffic'] = merged['traffic_count'] - merged['baseline']

# Define features and target
X = merged[['rainfall', 'min_temp', 'max_temp', 'wind_speed']]
y = merged['delta_traffic']

# Train/test split happens BEFORE scaling so that the scaler never sees the
# hold-out rows (fitting it on the full matrix leaks test-set statistics).
# NOTE(review): the split is row-level, so the 24 hourly rows of one
# station-day (identical weather features) can land on both sides of the
# split -- consider a split grouped by date.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize input features using training-set mean/std only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full matrix on the training-fitted scale; kept so the name stays available.
X_scaled = scaler.transform(X)

# Fit SVR model (library defaults)
svr_model = SVR()
svr_model.fit(X_train, y_train)

# Predict and evaluate; RMSE is in the same units as traffic_count
y_pred = svr_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"SVR RMSE (Sub-problem B): {rmse:.2f}")


XGBoost

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Sub-problem B (XGBoost): predict how far an hourly traffic count deviates
# from its per-station/hour/weekday mean using daily weather readings, then
# report the RMSE on a 20% hold-out split.

# Load datasets (using a manageable sample for speed). `nrows` stops parsing
# after the first 1000 traffic rows instead of reading the whole file and
# discarding the rest.
traffic = pd.read_csv("trafficData.csv", nrows=1000)
stations = pd.read_csv("trafficStations.csv")
weather = pd.read_csv("weatherStations.csv")

# Melt wide traffic format to long format: the 24 hourly columns become one
# (hour, traffic_count) pair per row.
hour_cols = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'year', 'month', 'day', 'day_of_week', 'public_holiday', 'school_holiday'],
    value_vars=hour_cols,
    var_name='hour',
    value_name='traffic_count'
)
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
traffic_long['hour'] = traffic_long['hour'].str.extract(r'hour_(\d+)', expand=False).astype(int)
traffic_long['date'] = pd.to_datetime(traffic_long[['year', 'month', 'day']])
traffic_long['datetime'] = traffic_long['date'] + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station and weather. Weather is joined per (station_key, date),
# so every hour of a day receives the same weather values.
merged = traffic_long.merge(stations, on='station_key', how='left')
weather['date'] = pd.to_datetime(weather[['year', 'month', 'day']])
weather_filtered = weather[['date', 'station_key', 'rainfall', 'min_temp', 'max_temp', 'wind_speed']]
merged = merged.merge(weather_filtered, on=['station_key', 'date'], how='left')

# Remove missing data.
# NOTE(review): this checks every merged column, including station metadata
# that is never used as a feature -- confirm that is intended.
merged = merged.dropna()

# Calculate baseline (mean per station, hour and weekday) and deviation from it
merged['baseline'] = merged.groupby(['station_key', 'hour', 'day_of_week'])['traffic_count'].transform('mean')
merged['delta_traffic'] = merged['traffic_count'] - merged['baseline']

# Feature matrix and target
X = merged[['rainfall', 'min_temp', 'max_temp', 'wind_speed']]
y = merged['delta_traffic']

# Train/test split happens BEFORE scaling so that the scaler never sees the
# hold-out rows (fitting it on the full matrix leaks test-set statistics).
# NOTE(review): the split is row-level, so the 24 hourly rows of one
# station-day (identical weather features) can land on both sides of the
# split -- consider a split grouped by date.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling is not required for tree models; it is retained here but fitted on
# the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full matrix on the training-fitted scale; kept so the name stays available.
X_scaled = scaler.transform(X)

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate; RMSE is in the same units as traffic_count
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"XGBoost RMSE (Sub-problem B): {rmse:.2f}")


MLP

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Sub-problem B (MLP): predict how far an hourly traffic count deviates from
# its per-station/hour/weekday mean using standardized daily weather readings,
# then report the RMSE on a 20% hold-out split.

# Load datasets (using sample size for faster processing). `nrows` stops
# parsing after the first 1000 traffic rows instead of reading the whole file
# and discarding the rest.
traffic = pd.read_csv("trafficData.csv", nrows=1000)
stations = pd.read_csv("trafficStations.csv")
weather = pd.read_csv("weatherStations.csv")

# Convert traffic data from wide to long format: the 24 hourly columns become
# one (hour, traffic_count) pair per row.
hour_cols = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'year', 'month', 'day', 'day_of_week', 'public_holiday', 'school_holiday'],
    value_vars=hour_cols,
    var_name='hour',
    value_name='traffic_count'
)
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
traffic_long['hour'] = traffic_long['hour'].str.extract(r'hour_(\d+)', expand=False).astype(int)
traffic_long['date'] = pd.to_datetime(traffic_long[['year', 'month', 'day']])
traffic_long['datetime'] = traffic_long['date'] + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with stations and weather. Weather is joined per (station_key, date),
# so every hour of a day receives the same weather values.
merged = traffic_long.merge(stations, on='station_key', how='left')
weather['date'] = pd.to_datetime(weather[['year', 'month', 'day']])
weather_filtered = weather[['date', 'station_key', 'rainfall', 'min_temp', 'max_temp', 'wind_speed']]
merged = merged.merge(weather_filtered, on=['station_key', 'date'], how='left')

# Drop rows with missing values.
# NOTE(review): this checks every merged column, including station metadata
# that is never used as a feature -- confirm that is intended.
merged = merged.dropna()

# Calculate baseline (mean per station, hour and weekday) and deviation from it
merged['baseline'] = merged.groupby(['station_key', 'hour', 'day_of_week'])['traffic_count'].transform('mean')
merged['delta_traffic'] = merged['traffic_count'] - merged['baseline']

# Select features and target
X = merged[['rainfall', 'min_temp', 'max_temp', 'wind_speed']]
y = merged['delta_traffic']

# Train-test split happens BEFORE scaling so that the scaler never sees the
# hold-out rows (fitting it on the full matrix leaks test-set statistics).
# NOTE(review): the split is row-level, so the 24 hourly rows of one
# station-day (identical weather features) can land on both sides of the
# split -- consider a split grouped by date.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features using training-set mean/std only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full matrix on the training-fitted scale; kept so the name stays available.
X_scaled = scaler.transform(X)

# MLP Regressor configuration: two hidden layers (64 and 32 units)
mlp_model = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu',
                         solver='adam', max_iter=1000, random_state=42)

# Train and predict
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)

# Evaluate using RMSE (same units as traffic_count)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"MLP RMSE (Sub-problem B): {rmse:.2f}")
