# Imports

In [13]:
import pandas as pd


## Load Dataset

In [14]:
# Read the raw merged dataset, parse the Date column as datetimes and put
# the rows in chronological order with a fresh 0..n-1 index.
df = (
    pd.read_csv("pollution_traffic_weather_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values("Date")
    .reset_index(drop=True)
)

# Per-column NaN counts; rows containing any NaN are dropped, and the total
# number of missing cells is reported first.
missing_counts = df.isna().sum()
total_missing = missing_counts.sum()
if total_missing > 0:
    print("Missing Values", total_missing)
    df = df.dropna().reset_index(drop=True)
else:
    print("No missing values")

No missing values


In [15]:
# --- Prediction targets (one column per pollutant) ---
pollutant_targets = ["Ozone", "NO2", "PM2.5", "CO"]

# --- Traffic predictors ---
# Main traffic column, plus the additional traffic columns kept separately.
traffic_main = "traffic_daily_total"
traffic_extra = [
    "traffic_segments_observed",
    "traffic_daily_max",
    "traffic_daily_mean_segment",
]

# --- Weather columns that get paired with traffic in interaction terms ---
# Short alias -> column name in the dataset.
meteo_for_interactions = dict(
    temp="temperature_2m_max",
    wind="wind_speed_10m_max_(km/h)",
    precip="precipitation_sum_(mm)",
    humidity="relative_humidity_2m_max_(%)",
)

# --- Remaining weather columns, used as controls ---
meteo_controls = [
    # daylight / sunshine
    "daylight_duration_(s)",
    "sunshine_duration_(s)",
    # precipitation breakdown
    "rain_sum_(mm)",
    "snowfall_sum_(cm)",
    # wind direction
    "wind_direction_10m_dominant_(°)",
    # evapotranspiration / radiation / dew point
    "et0_fao_evapotranspiration_(mm)",
    "shortwave_radiation_sum_(mj/m²)",
    "dew_point_2m_max",
    # cloud cover
    "cloud_cover_max_(%)",
    "cloud_cover_mean_(%)",
    # pressure
    "pressure_msl_max_(hpa)",
    "surface_pressure_max_(hpa)",
    "vapour_pressure_deficit_max_(kpa)",
]


**Add Confounders**

In [16]:
# Weekend indicator.
# Series.dt.weekday numbers the days Monday=0 ... Sunday=6, so only
# Saturday (5) and Sunday (6) are weekend days. A threshold of 4 would also
# flag every Friday as a weekend day.
df["is_weekend"] = df["Date"].dt.weekday >= 5

# Season indicators. Winter (Dec–Feb) is the reference category, so it gets
# no column of its own.
month = df["Date"].dt.month
df["season_spring"] = month.isin([3, 4, 5])
df["season_summer"] = month.isin([6, 7, 8])
df["season_fall"] = month.isin([9, 10, 11])

# COVID period indicator: 2020-03-22 through 2021-06-21. Series.between is
# inclusive on both ends by default, so both boundary dates are flagged.
start_covid = pd.Timestamp("2020-03-22")
end_covid = pd.Timestamp("2021-06-21")
df["is_covid"] = df["Date"].between(start_covid, end_covid)

# Store all indicators as 0/1 integers rather than booleans; this single
# cast covers every indicator column, including is_covid.
bool_cols = ["is_weekend", "season_spring", "season_summer", "season_fall", "is_covid"]
df[bool_cols] = df[bool_cols].astype(int)



In [17]:
# Binary indicator columns (seasons, weekend, COVID period).
confounder_features = ["season_spring", "season_summer", "season_fall", "is_weekend", "is_covid"]

# Core continuous predictors: main traffic column followed by the weather
# columns used for interactions. Extra predictors: the remaining traffic
# columns followed by the weather controls.
cont_features_core = [traffic_main, *meteo_for_interactions.values()]
cont_features_extra = [*traffic_extra, *meteo_controls]

# Every continuous predictor that will be scaled later (trim if too many).
cont_features_all = [*cont_features_core, *cont_features_extra]

# Echo each feature group for a quick visual check.
for label, names in (
    ("Targets:", pollutant_targets),
    ("Core continuous features:", cont_features_core),
    ("Extra continuous features:", cont_features_extra),
    ("Confounders:", confounder_features),
):
    print(label, names)


Targets: ['Ozone', 'NO2', 'PM2.5', 'CO']
Core continuous features: ['traffic_daily_total', 'temperature_2m_max', 'wind_speed_10m_max_(km/h)', 'precipitation_sum_(mm)', 'relative_humidity_2m_max_(%)']
Extra continuous features: ['traffic_segments_observed', 'traffic_daily_max', 'traffic_daily_mean_segment', 'daylight_duration_(s)', 'sunshine_duration_(s)', 'rain_sum_(mm)', 'snowfall_sum_(cm)', 'wind_direction_10m_dominant_(°)', 'et0_fao_evapotranspiration_(mm)', 'shortwave_radiation_sum_(mj/m²)', 'dew_point_2m_max', 'cloud_cover_max_(%)', 'cloud_cover_mean_(%)', 'pressure_msl_max_(hpa)', 'surface_pressure_max_(hpa)', 'vapour_pressure_deficit_max_(kpa)']
Confounders: ['season_spring', 'season_summer', 'season_fall', 'is_weekend', 'is_covid']


## Create Final Dataset

In [18]:
# Columns wanted in the final CSV, in output order:
# date, pollutant targets, main traffic column, interaction weather columns,
# weather controls, then the binary confounders.
requested_cols = [
    "Date",
    *pollutant_targets,                  # Ozone, NO2, PM2.5, CO
    traffic_main,                        # main traffic
    *meteo_for_interactions.values(),    # temp, wind, precip, humidity
    *meteo_controls,                     # other meteo controls
    *confounder_features,                # seasons, weekend, covid
]

# Defensive filter: any requested column that is absent from df is skipped
# silently rather than raising a KeyError.
final_cols = [col for col in requested_cols if col in df.columns]

# Independent copy so later edits to df_final do not touch df.
df_final = df.loc[:, final_cols].copy()

# Write the result to disk without the index column.
out_name = "pollution_traffic_weather_final.csv"
df_final.to_csv(out_name, index=False)




pollution_traffic_weather_final.csv (2493, 28)
        Date  Ozone   NO2  PM2.5   CO  traffic_daily_total  \
0 2012-01-22  0.025  30.9   13.7  0.5                12548   
1 2012-01-23  0.022  40.8   12.9  0.7                42888   
2 2012-01-24  0.015  30.6   20.6  0.7                38133   
3 2012-01-25  0.020  30.6   14.8  0.5                38739   
4 2012-01-26  0.007  29.3   15.8  0.6                38134   

   temperature_2m_max  wind_speed_10m_max_(km/h)  precipitation_sum_(mm)  \
0                -1.5                       15.8                     0.0   
1                 8.5                       15.6                     1.3   
2                10.4                       15.5                     2.3   
3                 6.5                       13.5                     0.0   
4                 3.2                        9.7                     3.4   

   relative_humidity_2m_max_(%)  ...  cloud_cover_max_(%)  \
0                          87.0  ...                100.0   
1

# Split Data

In [26]:
import pandas as pd

# Reload the final dataset, parse the dates and sort chronologically.
df = (
    pd.read_csv("pollution_traffic_weather_final.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values("Date")
    .reset_index(drop=True)
)

# Chronological cut-offs: everything up to and including train_end is
# training data; the test set runs from the day after train_end up to and
# including test_end. Rows dated after test_end land in neither split.
train_end = pd.Timestamp("2020-12-31")
test_end = pd.Timestamp("2023-12-31")

dates = df["Date"]
train_mask = dates.le(train_end)
test_mask = dates.gt(train_end) & dates.le(test_end)

df_train = df[train_mask].reset_index(drop=True)
df_test = df[test_mask].reset_index(drop=True)

# Report the date range actually covered by each split, and the row counts.
print("Train period:", df_train["Date"].min(), "->", df_train["Date"].max())
print("Test period:",  df_test["Date"].min(),  "->", df_test["Date"].max())
print("Train size:", len(df_train), "Test size:", len(df_test))

# Save (overwriting any previous version) the unscaled splits.
for split_frame, split_path in (
    (df_train, "pollution_train_2012_2020.csv"),
    (df_test, "pollution_test_2021_2023.csv"),
):
    split_frame.to_csv(split_path, index=False)


Train period: 2012-01-22 00:00:00 -> 2020-11-13 00:00:00
Test period: 2021-01-04 00:00:00 -> 2023-11-19 00:00:00
Train size: 1897 Test size: 452


# Scaling

In [29]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Load the unscaled train/test splits from disk.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("pollution_train_2012_2020.csv", "pollution_test_2021_2023.csv")
)

# --- Column groups (redefined here so this cell runs on its own) ---

# Prediction targets.
pollutant_targets = ["Ozone", "NO2", "PM2.5", "CO"]

# Traffic columns.
traffic_main = "traffic_daily_total"
traffic_extra = [
    "traffic_segments_observed",
    "traffic_daily_max",
    "traffic_daily_mean_segment",
]

# Weather columns paired with traffic in interaction terms (alias -> column).
meteo_for_interactions = dict(
    temp="temperature_2m_max",
    wind="wind_speed_10m_max_(km/h)",
    precip="precipitation_sum_(mm)",
    humidity="relative_humidity_2m_max_(%)",
)

# Remaining weather columns used as controls.
meteo_controls = [
    "daylight_duration_(s)",
    "sunshine_duration_(s)",
    "rain_sum_(mm)",
    "snowfall_sum_(cm)",
    "wind_direction_10m_dominant_(°)",
    "et0_fao_evapotranspiration_(mm)",
    "shortwave_radiation_sum_(mj/m²)",
    "dew_point_2m_max",
    "cloud_cover_max_(%)",
    "cloud_cover_mean_(%)",
    "pressure_msl_max_(hpa)",
    "surface_pressure_max_(hpa)",
    "vapour_pressure_deficit_max_(kpa)",
]

# Binary indicators that are left unscaled.
confounder_features = ["season_spring", "season_summer", "season_fall", "is_weekend", "is_covid"]

# Continuous columns to scale. In this cell the "extra" group is only the
# weather controls (traffic_extra is not included), and it is the same list
# object as meteo_controls, not a copy.
cont_features_core = [traffic_main, *meteo_for_interactions.values()]
cont_features_extra = meteo_controls
cont_features_all = [*cont_features_core, *cont_features_extra]

# --- Fit RobustScaler on the TRAIN continuous features only ---
# The test split is transformed with the train-fitted scaler, never refit.
scaler = RobustScaler().fit(df_train[cont_features_all])


def _scaled_continuous(frame):
    """Return frame's continuous columns transformed by the fitted scaler.

    The result keeps the column names in cont_features_all and the row index
    of the input frame.
    """
    return pd.DataFrame(
        scaler.transform(frame[cont_features_all]),
        columns=cont_features_all,
        index=frame.index,
    )


train_scaled = _scaled_continuous(df_train)
test_scaled = _scaled_continuous(df_test)

# --- Recombine: date, targets and confounders unchanged, then scaled columns ---
passthrough_cols = ["Date", *pollutant_targets, *confounder_features]

df_train_scaled = pd.concat(
    [
        df_train[passthrough_cols].reset_index(drop=True),
        train_scaled.reset_index(drop=True),
    ],
    axis=1,
)

df_test_scaled = pd.concat(
    [
        df_test[passthrough_cols].reset_index(drop=True),
        test_scaled.reset_index(drop=True),
    ],
    axis=1,
)

# --- Save the scaled splits so every model can reuse them ---
df_train_scaled.to_csv("pollution_train_2012_2020_scaled.csv", index=False)
df_test_scaled.to_csv("pollution_test_2021_2023_scaled.csv", index=False)

print(df_train_scaled.head())
print(df_test_scaled.head())


         Date  Ozone   NO2  PM2.5   CO  season_spring  season_summer  \
0  2012-01-22  0.025  30.9   13.7  0.5              0              0   
1  2012-01-23  0.022  40.8   12.9  0.7              0              0   
2  2012-01-24  0.015  30.6   20.6  0.7              0              0   
3  2012-01-25  0.020  30.6   14.8  0.5              0              0   
4  2012-01-26  0.007  29.3   15.8  0.6              0              0   

   season_fall  is_weekend  is_covid  ...  snowfall_sum_(cm)  \
0            0           1         0  ...               0.00   
1            0           0         0  ...               0.00   
2            0           0         0  ...               0.00   
3            0           0         0  ...               0.00   
4            0           0         0  ...               0.35   

   wind_direction_10m_dominant_(°)  et0_fao_evapotranspiration_(mm)  \
0                         -1.26875                        -0.680608   
1                         -0.81875      

## Create Interaction Terms

In [30]:
import pandas as pd

# Load the scaled train/test splits.
df_train = pd.read_csv("pollution_train_2012_2020_scaled.csv")
df_test  = pd.read_csv("pollution_test_2021_2023_scaled.csv")

# Columns that take part in the interaction terms.
traffic_col = "traffic_daily_total"
temp_col    = "temperature_2m_max"
wind_col    = "wind_speed_10m_max_(km/h)"
precip_col  = "precipitation_sum_(mm)"
hum_col     = "relative_humidity_2m_max_(%)"

# New interaction column -> weather column multiplied with traffic.
# Dict order fixes the order in which the new columns are appended.
interaction_spec = {
    "int_traffic_temp": temp_col,
    "int_traffic_wind": wind_col,
    "int_traffic_precip": precip_col,
    "int_traffic_humidity": hum_col,
}

# Build identical interaction columns on TRAIN first, then on TEST; both
# use the scaled feature values loaded above.
for frame in (df_train, df_test):
    for interaction_col, weather_col in interaction_spec.items():
        frame[interaction_col] = frame[traffic_col] * frame[weather_col]

# Save the datasets with the interaction terms appended.
df_train.to_csv("pollution_train_2012_2020_scaled_interactions.csv", index=False)
df_test.to_csv("pollution_test_2021_2023_scaled_interactions.csv", index=False)

# Preview: traffic, then each weather column followed by its interaction.
preview_cols = ["traffic_daily_total"]
for interaction_col, weather_col in interaction_spec.items():
    preview_cols += [weather_col, interaction_col]

print(df_train[preview_cols].head())


   traffic_daily_total  temperature_2m_max  int_traffic_temp  \
0            -0.253576            -1.17500          0.297952   
1             0.810238            -0.55000         -0.445631   
2             0.643513            -0.43125         -0.277515   
3             0.664762            -0.67500         -0.448714   
4             0.643548            -0.88125         -0.567127   

   wind_speed_10m_max_(km/h)  int_traffic_wind  precipitation_sum_(mm)  \
0                  -0.204545          0.051868                0.000000   
1                  -0.227273         -0.184145                0.565217   
2                  -0.238636         -0.153566                1.000000   
3                  -0.465909         -0.309718                0.000000   
4                  -0.897727         -0.577731                1.478261   

   int_traffic_precip  relative_humidity_2m_max_(%)  int_traffic_humidity  
0           -0.000000                     -0.200000              0.050715  
1            0.457