In [15]:
import json

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [16]:
# Load the scaled training data that already contains the interaction columns.
# NOTE(review): the file name indicates 2012–2020, while this notebook previously
# described the data as 2012–2022 — confirm the actual date range.
train_path = "datasets/final_datasets/pollution_train_2012_2020_scaled_interactions.csv"
df_train = pd.read_csv(train_path, parse_dates=["Date"])

# Columns used as regression targets (one model search per pollutant).
pollutant_targets = ["Ozone", "NO2", "PM2.5", "CO"]

# Calendar / regime indicator columns. Not referenced by the tuning code in this
# section; kept available for other cells.
confounder_features = ["season_spring", "season_summer", "season_fall", "is_weekend", "is_covid"]

# Main predictors: traffic volume plus the headline weather variables.
core_features = [
    "traffic_daily_total",
    "temperature_2m_max",
    "wind_speed_10m_max_(km/h)",
    "precipitation_sum_(mm)",
    "relative_humidity_2m_max_(%)",
]

# Additional weather predictors.
extra_features = [
    "daylight_duration_(s)",
    "sunshine_duration_(s)",
    "rain_sum_(mm)",
    "snowfall_sum_(cm)",
    "wind_direction_10m_dominant_(°)",
    "et0_fao_evapotranspiration_(mm)",
    "shortwave_radiation_sum_(mj/m²)",
    "dew_point_2m_max",
    "cloud_cover_max_(%)",
    "cloud_cover_mean_(%)",
    "pressure_msl_max_(hpa)",
    "surface_pressure_max_(hpa)",
    "vapour_pressure_deficit_max_(kpa)",
]

# Feature set shared by every experiment.
base_features = core_features + extra_features

# Candidate interaction columns; experiments add at most one of these at a time.
interaction_features = [
    "int_traffic_temp",
    "int_traffic_wind",
    "int_traffic_precip",
    "int_traffic_humidity",
]

# Fail fast: a misnamed column would otherwise only raise a KeyError part-way
# through the (long-running) hyperparameter search further down.
required_columns = [*pollutant_targets, *base_features, *interaction_features]
missing_columns = [col for col in required_columns if col not in df_train.columns]
if missing_columns:
    raise KeyError(f"{train_path} is missing expected columns: {missing_columns}")



In [17]:
def tune_rf_for_target_interaction(
    df_train,
    target_col: str,
    interaction_choice: str | None,
    n_splits: int = 5,
    random_state: int = 0,
):
    """
    Grid-search RandomForestRegressor hyperparameters for one pollutant target.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns and ``target_col``. If it
        has a "Date" column, rows are sorted by it before cross-validation.
    target_col : name of the column to predict.
    interaction_choice :
        None -> use only ``base_features``
        a column name (e.g. one of ``interaction_features``) -> add only that
        column to ``base_features``
    n_splits : number of TimeSeriesSplit folds.
    random_state : seed passed to RandomForestRegressor.

    Returns
    -------
    dict with keys "target", "interaction" ("none" or the chosen column),
    "features_used" (a fresh list), "best_params" and "cv_rmse" (mean
    cross-validated RMSE of the best parameter combination).

    Raises
    ------
    KeyError if the target or any requested feature column is absent.
    """
    # Build a new list so the returned "features_used" never aliases the
    # module-level base_features list.
    feat_list = list(base_features)
    if interaction_choice is None:
        interaction_label = "none"
    else:
        feat_list.append(interaction_choice)
        interaction_label = interaction_choice

    missing = [col for col in [*feat_list, target_col] if col not in df_train.columns]
    if missing:
        raise KeyError(f"Columns missing from df_train: {missing}")

    # TimeSeriesSplit splits purely by row position, so the rows must be in
    # chronological order for each validation fold to come after its training
    # fold. A stable sort keeps the original order of rows sharing a date.
    if "Date" in df_train.columns:
        ordered = df_train.sort_values("Date", kind="stable")
    else:
        ordered = df_train

    X_train = ordered[feat_list].copy()
    y_train = ordered[target_col].copy()

    # Time-series cross-validation
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Hyperparameter grid (adjust as needed)
    param_grid = {
        "n_estimators": [100, 300],
        "max_depth": [5, 10, None],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    }

    # Parallelise across grid candidates/folds only: running the forest with
    # n_jobs=-1 inside an n_jobs=-1 search oversubscribes the CPUs. The fitted
    # forest does not depend on n_jobs for a fixed random_state.
    rf = RandomForestRegressor(random_state=random_state, n_jobs=1)

    gsearch = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=tscv,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        refit=False,  # only best_params_/best_score_ are used; skip the final refit
        verbose=0,
    )

    gsearch.fit(X_train, y_train)

    best_params = gsearch.best_params_
    best_score = -gsearch.best_score_  # convert back from negative RMSE

    return {
        "target": target_col,
        "interaction": interaction_label,
        "features_used": feat_list,
        "best_params": best_params,
        "cv_rmse": best_score,
    }



In [18]:
# For every pollutant, tune once without an interaction term and then once per
# candidate interaction column. Results are appended one at a time so that
# whatever has finished stays in `all_results` if the run is interrupted.
all_results = []

for pollutant in pollutant_targets:
    for choice in [None, *interaction_features]:
        all_results.append(
            tune_rf_for_target_interaction(
                df_train,
                target_col=pollutant,
                interaction_choice=choice,
            )
        )

# One flat record per experiment: identifying fields, the CV score, and each
# tuned hyperparameter under a "param_<name>" column.
summary_records = [
    {
        "target": outcome["target"],
        "interaction": outcome["interaction"],
        "cv_rmse": outcome["cv_rmse"],
        **{f"param_{name}": value for name, value in outcome["best_params"].items()},
    }
    for outcome in all_results
]

rf_tuning_df = pd.DataFrame(summary_records)
rf_tuning_df



KeyboardInterrupt: 

In [None]:
# CSV summary (one row per experiment)
rf_tuning_df.to_csv("rf_hyperparams_summary.csv", index=False)

# Raw result list (including features_used) as JSON. `default=str` stringifies
# any value the json module cannot encode natively. `json` is imported in the
# notebook's imports cell.
with open("rf_hyperparams_full.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, default=str)