In [None]:
import sys
from pathlib import Path
import warnings

# Suppress warnings raised from IPython modules only; other warnings are still shown at this point.
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Return True when the active IPython shell's string form mentions `google.colab`."""
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

def clone_repository() -> None:
    """Clone the project repository and change the working directory into the checkout.

    Relies on the IPython `!` shell escape and the `%cd` magic, so it only works when
    executed by an IPython/Jupyter kernel.
    """
    !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
    %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    """Upgrade `uv` via pip, then install the requirements declared in `pyproject.toml`.

    The install uses `--all-extras` and `--system`, i.e. every optional extra is installed
    into the system interpreter rather than a virtual environment. Uses IPython `!` shell
    escapes, so it only works inside an IPython/Jupyter kernel.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root, depending on where the notebook is running.
if is_google_colab():
    # On Colab the repository is cloned first and the kernel is moved into the checkout.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    # Locally the notebook may be started from a nested folder; peel those off, in this order.
    project_root = Path().absolute()
    for nested_dir in ("src", "airquality", "notebooks"):
        if project_root.parts[-1:] == (nested_dir,):
            project_root = Path(*project_root.parts[:-1])
    root_dir = str(project_root)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Make the repository importable so that `utils` can be resolved below.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

from utils import config

# Settings are read from the `.env` file at the repository root.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn
Added the following directory to the PYTHONPATH: /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


In [2]:
import datetime
import json
import os

import hopsworks
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from xgboost import XGBRegressor

from utils import airquality

# Silence every warning category from here on (broader than the IPython-only filter in the setup cell).
warnings.filterwarnings("ignore")

In [3]:
# Log in to Hopsworks with the Python engine and get a handle to the project's feature store.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()

# Secrets (API keys, sensor locations) are read from the Hopsworks secrets API.
secrets = hopsworks.get_secrets_api()
# NOTE(review): AQICN_API_KEY is not referenced by any later cell visible in this notebook — confirm it is still needed.
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

# Retrieve feature groups
air_quality_fg = fs.get_feature_group(
    name="air_quality_all",
    version=1,
)
weather_fg = fs.get_feature_group(
    name="weather_all",
    version=1,
)

2025-11-18 12:06:42,036 INFO: Initializing external client
2025-11-18 12:06:42,036 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-18 12:06:43,582 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279179


Set `SENSOR_CSV_FILE` in `.env` to the relative path of a single sensor's CSV file to process only that sensor, or leave it unset to process all sensors in the `data` folder.

In [4]:
sensor_csv_file = getattr(settings, 'SENSOR_CSV_FILE', None)

if not sensor_csv_file:
    # Batch mode: collect the location of every sensor that has its own secret.
    all_secrets = secrets.get_secrets()
    locations = {}
    for secret in all_secrets:
        if not secret.name.startswith("SENSOR_LOCATION_JSON_"):
            continue
        sensor_id = secret.name.replace("SENSOR_LOCATION_JSON_", "")
        location_str = secrets.get_secret(secret.name).value
        # Secrets with an empty value are skipped.
        if location_str:
            locations[sensor_id] = json.loads(location_str)
else:
    # Single sensor mode: only the sensor id (last of the six returned values) is needed.
    _, _, _, _, _, sensor_id = airquality.read_sensor_data(sensor_csv_file)
    secret_name = f"SENSOR_LOCATION_JSON_{sensor_id}"
    location_str = secrets.get_secret(secret_name).value
    locations = {sensor_id: json.loads(location_str)}


## Load Batch Data

In [5]:
# `air_quality_fg` and `weather_fg` were already retrieved in the login cell, so they are
# reused here instead of being fetched from the feature store a second time.

# Reference timestamp for the whole run; `datetime.now()` is already timezone-naive.
today = datetime.datetime.now()
# Start of the history window that is read back from the feature store (4 days before now).
past_date = today - datetime.timedelta(days=4)

In [6]:
# Weather rows from `past_date` onwards.
batch_weather = weather_fg.filter(weather_fg.date >= past_date).read()
# Normalise to timezone-naive timestamps so they can be compared with `today`.
batch_weather["date"] = pd.to_datetime(batch_weather["date"]).dt.tz_localize(None)
# `DataFrame.info()` prints its report itself and returns None; wrapping it in print() adds a stray "None".
batch_weather.info()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.65s) 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         160 non-null    datetime64[us]
 1   temperature_2m_mean          160 non-null    float32       
 2   precipitation_sum            160 non-null    float32       
 3   wind_speed_10m_max           160 non-null    float32       
 4   wind_direction_10m_dominant  160 non-null    float32       
 5   city                         160 non-null    object        
 6   sensor_id                    160 non-null    object        
 7   latitude                     160 non-null    float64       
 8   longitude                    160 non-null    float64       
dtypes: datetime64[us](1), float32(4), float64(2), object(2)
memory usage: 8.9+ KB
None


In [7]:
try:
    batch_airquality = air_quality_fg.filter(air_quality_fg.date >= past_date).read()
    batch_airquality["date"] = pd.to_datetime(batch_airquality["date"]).dt.tz_localize(None)
except Exception as read_error:
    # Best effort: continue without recent measurements, but say so instead of failing silently.
    print(f"Could not read recent air quality data, continuing without it: {read_error}")
    # Keep the expected columns and dtypes so the later merge on ["date", "sensor_id"] still works.
    float_columns = [
        "pm25",
        "pm25_rolling_3d",
        "pm25_lag_1d",
        "pm25_lag_2d",
        "pm25_lag_3d",
        "pm25_nearby_avg",
    ]
    text_columns = ["sensor_id", "street", "city", "country", "feed_url"]
    batch_airquality = pd.DataFrame(
        {
            "date": pd.Series(dtype="datetime64[ns]"),
            **{name: pd.Series(dtype="float64") for name in float_columns},
            **{name: pd.Series(dtype="object") for name in text_columns},
        }
    )
# `DataFrame.info()` prints its report itself and returns None, so it is not wrapped in print().
batch_airquality.info()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.79s) 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             62 non-null     datetime64[us]
 1   pm25             62 non-null     float64       
 2   sensor_id        62 non-null     object        
 3   street           62 non-null     object        
 4   city             62 non-null     object        
 5   country          62 non-null     object        
 6   feed_url         62 non-null     object        
 7   pm25_rolling_3d  62 non-null     float64       
 8   pm25_lag_1d      62 non-null     float64       
 9   pm25_lag_2d      61 non-null     float64       
 10  pm25_lag_3d      61 non-null     float64       
 11  pm25_nearby_avg  62 non-null     float64       
dtypes: datetime64[us](1), float64(6), object(5)
memory usage: 5.9+ K

## Load Models

In [9]:
mr = project.get_model_registry()

MODEL_NAME_TEMPLATE = "air_quality_xgboost_model_{sensor_id}"

# sensor_id -> (registry model, loaded XGBRegressor, feature names stored in the booster)
retrieved_models = {}

for sensor_id in locations.keys():
    model_name = MODEL_NAME_TEMPLATE.format(sensor_id=sensor_id)

    available_models = mr.get_models(name=model_name)
    if not available_models:
        # Fail with an actionable message instead of an AttributeError on `None.download()`.
        raise RuntimeError(
            f"No model named '{model_name}' found in the model registry for sensor {sensor_id}"
        )
    # Use the highest registered version of the sensor's model.
    retrieved_model = max(available_models, key=lambda model: model.version)

    saved_model_dir = retrieved_model.download()
    xgb_model = XGBRegressor()
    xgb_model.load_model(os.path.join(saved_model_dir, "model.json"))
    booster = xgb_model.get_booster()

    retrieved_models[sensor_id] = retrieved_model, xgb_model, booster.feature_names

Downloading: 0.000%|          | 0/552706 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/24254 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/130100 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/50600 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/50373 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/567150 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/22229 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/122744 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/48144 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/49312 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/552835 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/21757 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/89717 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/44046 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/45292 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/516859 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/21920 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/114113 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/51229 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/45121 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/583212 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/31925 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/121868 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/49062 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/50807 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/590195 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/26237 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/123070 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/50920 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/48289 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/568599 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/31106 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/112921 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/44260 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/44429 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/583709 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/30789 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/121774 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/48507 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/47427 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/466146 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/20922 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/124645 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/45332 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/54405 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/550454 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/21916 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/116686 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/44302 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/46407 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/560662 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/30702 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/119179 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/46618 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/46634 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/570739 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/26538 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/57954 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/51543 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/43047 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/267692 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/31549 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/128642 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/51080 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/47945 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/519235 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/23889 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/47888 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/48189 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/42247 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/557209 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/22094 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/114317 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/50461 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/48063 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/555802 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/26136 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/101037 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 3 files)... DONE

## Prediction

In [10]:
# Merge historical data with weather data
batch_data = pd.merge(batch_weather, batch_airquality, on=["date", "sensor_id"], how="left")
batch_data = batch_data.sort_values(["sensor_id", "date"])

feature_cols = [
    "pm25_rolling_3d",
    "pm25_lag_1d",
    "pm25_lag_2d",
    "pm25_lag_3d",
    "pm25_nearby_avg",
]

batch_data["predicted_pm25"] = np.nan
batch_data["days_before_forecast_day"] = np.nan
for col in feature_cols:
    batch_data[f"predicted_{col}"] = np.nan

forecast_days = (
    batch_data.loc[batch_data["pm25"].isna() & (batch_data["date"] >= today.strftime("%Y-%m-%d")), "date"]
    .dropna()
    .sort_values()
    .unique()
)
for target_day in forecast_days:
    # context with all sensors up to current day
    window = batch_data.loc[batch_data["date"] <= target_day].copy()
    day_rows = window[(window["date"] == target_day) & window["pm25"].isna()]

    for _, row in day_rows.iterrows():
        sensor_id = row["sensor_id"]

        _, xgb_model, model_features = retrieved_models[sensor_id]
        features = (row.reindex(model_features).to_frame().T.apply(pd.to_numeric, errors="coerce"))
        y_hat = xgb_model.predict(features)[0]

        idx = batch_data.index[(batch_data["sensor_id"] == sensor_id) & (batch_data["date"] == target_day)][0]
        batch_data.at[idx, "pm25"] = y_hat
        batch_data.at[idx, "predicted_pm25"] = y_hat
        batch_data.at[idx, "days_before_forecast_day"] = (target_day - today).days + 1

    # recompute features for all sensors now that this days values exist
    temp_df = batch_data.loc[batch_data["date"] <= target_day].copy()
    temp_df = airquality.add_rolling_window_feature(
        temp_df, window_days=3, column="pm25", new_column="pm25_rolling_3d"
    )
    temp_df = airquality.add_lagged_features(temp_df, column="pm25", lags=[1, 2, 3])
    temp_df = airquality.add_nearby_sensor_feature(
        temp_df,
        locations,
        column="pm25",
        n_closest=3,
        new_column="pm25_nearby_avg",
    )

    current_rows = temp_df[temp_df["date"] == target_day]
    for _, row in current_rows.iterrows():
        sensor_id = row["sensor_id"]
        mask = (batch_data["sensor_id"] == sensor_id) & (batch_data["date"] == target_day)
        if mask.any():
            for col in feature_cols:
                batch_data.loc[mask, f"predicted_{col}"] = row[col]

predictions = batch_data.loc[
    batch_data["predicted_pm25"].notna(),
    ["date", "sensor_id", "predicted_pm25", "days_before_forecast_day"]
    + [f"predicted_{col}" for col in feature_cols],
].reset_index(drop=True)
batch_data.loc[batch_data["date"] > today, "pm25"] = np.nan

In [11]:
# Save the full batch frame (weather, measurements and predictions) to CSV.
predictions_csv = Path(f"{root_dir}/models/predictions.csv")
# The target folder may not exist yet on a fresh checkout.
predictions_csv.parent.mkdir(parents=True, exist_ok=True)
# All columns are written by default, so no explicit `columns=` argument is needed.
batch_data.to_csv(predictions_csv, index=False)

In [12]:
# (sensor_id, local png path) for every forecast plot that was written.
forecast_paths = []

for sensor_id, location in locations.items():
    sensor_forecast = predictions[predictions["sensor_id"] == sensor_id].copy()

    forecast_path = f"{root_dir}/models/{sensor_id}/images/forecast.png"
    Path(forecast_path).parent.mkdir(parents=True, exist_ok=True)

    # Keep the helper's return value under its own name so the global `plt` is not rebound.
    forecast_plot = airquality.plot_air_quality_forecast(
        location["city"],
        location["street"],
        sensor_forecast,
        forecast_path,
        hindcast=False,
    )
    forecast_plot.close()
    forecast_paths.append((sensor_id, forecast_path))

dataset_api = project.get_dataset_api()
today_short = today.strftime("%Y-%m-%d")
if not dataset_api.exists("Resources/airquality"):
    dataset_api.mkdir("Resources/airquality")

# Upload one dated forecast image per sensor, replacing any earlier upload from the same day.
for sensor_id, forecast_path in forecast_paths:
    dataset_api.upload(
        forecast_path,
        f"Resources/airquality/{sensor_id}_{today_short}_forecast.png",
        overwrite=True,
    )
print(f"Forecast plots available in Hopsworks under {project.get_url()}/settings/fb/path/Resources/airquality")

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/154549/images/forecast.png: 0.000%|       …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60541/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/79750/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/404209/images/forecast.png: 0.000%|       …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/59095/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60535/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60853/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/88372/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/194215/images/forecast.png: 0.000%|       …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/69628/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/61714/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/65146/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/112672/images/forecast.png: 0.000%|       …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/59893/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/69724/images/forecast.png: 0.000%|        …

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/81505/images/forecast.png: 0.000%|        …

Forecast plots available in Hopsworks under https://c.app.hopsworks.ai:443/p/1279179/settings/fb/path/Resources/airquality


In [13]:
# Insert predictions into monitoring feature group
monitor_fg = fs.get_or_create_feature_group(
    name="aq_predictions",
    description="Air Quality prediction monitoring",
    version=1,
    primary_key=["sensor_id", "date", "days_before_forecast_day"],
    event_time="date",
)
monitor_fg.insert(predictions, wait=True)


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279179/fs/1265797/fg/1721948


Uploading Dataframe: 100.00% |██████████| Rows 96/96 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: aq_predictions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279179/jobs/named/aq_predictions_1_offline_fg_materialization/executions
2025-11-18 12:09:15,261 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-18 12:09:18,454 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-18 12:10:47,485 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-18 12:10:47,649 INFO: Waiting for log aggregation to finish.
2025-11-18 12:10:56,233 INFO: Execution finished successfully.


(Job('aq_predictions_1_offline_fg_materialization', 'SPARK'), None)

## Prediction Hindcast: Comparing predicted values (1-day prior forecast) with observed values


In [14]:
# Predictions that were made one day ahead of the day they refer to.
monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()
monitoring_df["date"] = pd.to_datetime(monitoring_df["date"]).dt.tz_localize(None)

# Measured PM2.5 values to compare the predictions against.
air_quality_df = air_quality_fg.read()[["date", "sensor_id", "pm25"]]
air_quality_df["date"] = pd.to_datetime(air_quality_df["date"]).dt.tz_localize(None)

for sensor_id, location in locations.items():
    sensor_preds = monitoring_df[monitoring_df["sensor_id"] == sensor_id][["date", "predicted_pm25"]]
    # Inner join: keep only the dates that have both a prediction and a measurement.
    merged = sensor_preds.merge(
        air_quality_df[air_quality_df["sensor_id"] == sensor_id][["date", "pm25"]],
        on="date",
        how="inner",
    ).sort_values("date")

    city, street = location["city"], location["street"]
    hindcast_path = f"{root_dir}/models/{sensor_id}/images/hindcast_prediction.png"
    Path(hindcast_path).parent.mkdir(parents=True, exist_ok=True)

    # Keep the helper's return value under its own name so the global `plt` is not rebound.
    hindcast_plot = airquality.plot_air_quality_forecast(
        city,
        street,
        # Without any overlapping date, plot the predictions alone with an all-NaN `pm25` column.
        merged if not merged.empty else sensor_preds.assign(pm25=np.nan),
        hindcast_path,
        hindcast=True,
    )
    hindcast_plot.close()

    dataset_api.upload(
        hindcast_path,
        f"Resources/airquality/{sensor_id}_{today:%Y-%m-%d}_hindcast.png",
        overwrite=True,
    )

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.17s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.73s) 


Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/154549/images/hindcast_prediction.png: 0.0…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60541/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/79750/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/404209/images/hindcast_prediction.png: 0.0…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/59095/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60535/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/60853/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/88372/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/194215/images/hindcast_prediction.png: 0.0…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/69628/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/61714/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/65146/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/112672/images/hindcast_prediction.png: 0.0…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/59893/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/69724/images/hindcast_prediction.png: 0.00…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/81505/images/hindcast_prediction.png: 0.00…

## IDW Heatmap

In [15]:
def idw_interpolation(points, values, grid_points, lon_mesh, power=2):
    """Interpolate scattered values onto a grid with inverse distance weighting (IDW).

    Args:
        points: Array-like of shape (n_points, 2) with the coordinates of the known values.
        values: Array-like of shape (n_points,) with the value at each point. Entries that
            are NaN or infinite are ignored together with their point.
        grid_points: Array of shape (n_grid, 2) with the coordinates to interpolate at.
        lon_mesh: Mesh whose shape the flat result is reshaped to; only `.shape` is used.
        power: Exponent applied to the distance; larger values favour nearby points.

    Returns:
        Array of shape `lon_mesh.shape` with the interpolated values, or an all-NaN array
        of that shape when there is no finite value to interpolate from.
    """
    values = np.asarray(values, dtype=float)
    usable = np.isfinite(values)
    if not usable.any():
        # Nothing to interpolate from (empty input or only missing values).
        return np.full(lon_mesh.shape, np.nan)

    # Drop points without a finite value; a single NaN would otherwise turn the whole grid into NaN.
    points = np.asarray(points, dtype=float)[usable]
    values = values[usable]

    distances = cdist(grid_points, points)
    # A grid point that coincides with a known point would divide by zero; use a tiny distance instead.
    distances = np.where(distances == 0, 1e-10, distances)
    weights = 1.0 / (distances ** power)
    weights_sum = np.sum(weights, axis=1)
    interpolated = np.sum(weights * values, axis=1) / weights_sum
    return interpolated.reshape(lon_mesh.shape)

In [None]:
def plot_pm25_idw_heatmap(
    predictions: pd.DataFrame,
    locations: dict,
    forecast_date: datetime.datetime,
    path: str,
    grid_bounds=(11.4, 57.15, 12.5, 58.25),
    grid_resolution=800,
    power=2,
):
    """Render an IDW-interpolated PM2.5 heatmap for one day and save it as a transparent PNG.

    Args:
        predictions: Frame with `date`, `sensor_id`, `predicted_pm25` and `pm25` columns.
        locations: Mapping of sensor id to a dict holding `longitude` and `latitude`.
        forecast_date: Only rows whose `date` equals this value are plotted.
        path: File path the image is written to.
        grid_bounds: `(min_lon, min_lat, max_lon, max_lat)` of the interpolation grid.
        grid_resolution: Number of grid steps along each axis.
        power: Distance exponent passed on to `idw_interpolation`.

    Raises:
        ValueError: If no sensor with a known location has a finite value on `forecast_date`.
    """
    df_day = predictions[predictions["date"] == forecast_date]

    # Use the predictions unless any of them is missing for this day; then use `pm25` instead.
    pm25_column = "predicted_pm25"
    if df_day["predicted_pm25"].isna().any():
        pm25_column = "pm25"

    # Resolve the sensors once so that coordinates and values are guaranteed to line up.
    sensor_ids = [sid for sid in df_day["sensor_id"].unique() if sid in locations]
    # The first row per sensor is used when a sensor appears more than once.
    first_rows = df_day.drop_duplicates("sensor_id").set_index("sensor_id")
    pm25_values = first_rows.loc[sensor_ids, pm25_column].to_numpy(dtype=float)
    sensor_coords = np.array(
        [[locations[sid]["longitude"], locations[sid]["latitude"]] for sid in sensor_ids],
        dtype=float,
    ).reshape(-1, 2)

    # Sensors without a finite value would turn the whole interpolated grid into NaN.
    has_value = np.isfinite(pm25_values)
    if not has_value.any():
        raise ValueError(f"No sensor with a known location has a PM2.5 value on {forecast_date}")
    sensor_coords, pm25_values = sensor_coords[has_value], pm25_values[has_value]

    min_lon, min_lat, max_lon, max_lat = grid_bounds

    lon_grid = np.linspace(min_lon, max_lon, grid_resolution)
    lat_grid = np.linspace(min_lat, max_lat, grid_resolution)
    lon_mesh, lat_mesh = np.meshgrid(lon_grid, lat_grid)
    grid_points = np.column_stack([lon_mesh.ravel(), lat_mesh.ravel()])

    idw_result = idw_interpolation(sensor_coords, pm25_values, grid_points, lon_mesh, power=power)

    category_colors = ["#00e400", "#7de400", "#ffff00", "#ffb000", "#ff7e00", "#ff4000", "#ff0000", "#c0007f", "#8f3f97", "#7e0023"]
    # The colour scale spans 0-150; values outside that range are clipped to its ends.
    vmin, vmax = 0, 150

    clipped = np.clip(idw_result, vmin, vmax)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(
        clipped,
        extent=(min_lon, max_lon, min_lat, max_lat),
        origin="lower",
        cmap=mcolors.LinearSegmentedColormap.from_list("aqi", category_colors, N=512),
        vmin=vmin,
        vmax=vmax,
        alpha=0.5,
    )
    ax.set_xlim(min_lon, max_lon)
    ax.set_ylim(min_lat, max_lat)
    # No axes, padding or background: the saved image contains only the heatmap itself.
    ax.axis("off")

    fig.savefig(path, dpi=300, bbox_inches="tight", pad_inches=0, transparent=True)
    # Close the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [17]:
interpolation_dir = f"{root_dir}/models/interpolation"
# Also creates the parent `models` folder when it is missing, and is a no-op when the folder exists.
os.makedirs(interpolation_dir, exist_ok=True)

today_short = today.strftime("%Y-%m-%d")

# Rows dated today or later; one heatmap is produced per distinct date.
interpolation_df = batch_data[batch_data["date"] >= today_short]
for i, forecast_date in enumerate(sorted(interpolation_df["date"].unique())):
    # `unique()` can yield Timestamp or numpy datetime64 values depending on the pandas version.
    forecast_date_short = pd.Timestamp(forecast_date).strftime("%Y-%m-%d")
    # The local file is numbered by its position in the sorted dates.
    output_png = f"{interpolation_dir}/forecast_interpolation_{i}d.png"

    plot_pm25_idw_heatmap(
        interpolation_df,
        locations,
        forecast_date,
        output_png,
    )
    dataset_api.upload(
        output_png,
        f"Resources/airquality/interpolation_{today_short}_{forecast_date_short}.png",
        overwrite=True,
    )

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_0d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_1d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_2d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_3d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_4d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_5d.pn…

Uploading /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn/models/interpolation/forecast_interpolation_6d.pn…