# 4. Batch Inference Pipeline

## 4.1. Setup

In [7]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, datetime, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Locate the nearest directory that contains a ``pyproject.toml``.

    ``start`` is checked first, then each of its ancestors from closest to
    farthest. The first directory holding a ``pyproject.toml`` is returned;
    when none does, ``start`` itself is returned unchanged.
    """
    candidates = (start, *start.parents)
    marker_hits = (folder for folder in candidates if (folder / "pyproject.toml").exists())
    return next(marker_hits, start)

# Anchor the notebook on the project root found from the current working directory.
root_dir = find_project_root(Path.cwd())
print("Project root dir:", root_dir)

# Put the project root on the import path exactly once.
if not any(entry == str(root_dir) for entry in sys.path):
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks
import matplotlib.pyplot as plt
from urllib3.exceptions import ProtocolError  
from requests.exceptions import ConnectionError, Timeout
from confluent_kafka import KafkaException
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from scipy.spatial.distance import cdist
import matplotlib.colors as mcolors

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

# Load settings through the project's config helper and unwrap the secret values
# (both fields are read via get_secret_value()).
settings = config.HopsworksSettings()
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks with the API key and keep a handle to the project's feature store.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25\notebooks\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-16 16:54:44,276 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-16 16:54:44,290 INFO: Initializing external client
2026-01-16 16:54:44,290 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-16 16:54:45,688 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184
2026-01-16 16:54:47,016 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-16 16:54:47,024 INFO: Initializing external client
2026-01-16 16:54:47,024 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-16 16:55:04,511 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


Repo management

In [8]:
# Clone or update the project repository through the project helper, then make
# the returned location the working directory for the rest of the notebook.
# NOTE(review): assumes clone_or_update_repo returns a local directory path — confirm in utils.hopsworks_admin.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

Already in repo at c:\Users\krist\Documents\GitHub\pm25\notebooks\pm25-forecast-openmeteo-aqicn


In [9]:
# Reference date for every "past N days" / "next N days" window in this notebook.
today = date.today()

# Without an AQICN key there is nothing to store; stop here.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Refresh the copy of the key held in Hopsworks: remove any existing secret,
# then create it again with the current value.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Removal stays best-effort, but say why it did not happen instead of
    # swallowing the error silently.
    print(f"No existing AQICN_API_KEY secret removed ({exc!r}); creating a fresh one.")

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

## 4.3. Get Feature Groups

In [10]:
# Obtain handles to the air-quality and weather feature groups from the project helper.
# NOTE(review): the helper is named create_feature_groups — presumably it reuses existing groups; confirm in utils.hopsworks_admin.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 4.4. Sensor Location Loading 
Load sensor location metadata from Hopsworks secrets for all sensors.

In [11]:
# Sensor locations are stored one per secret, named SENSOR_LOCATION_JSON_<sensor_id>,
# with a JSON document as the secret value.
all_secrets = secrets.get_secrets()
locations = {}
for secret in all_secrets:
    # Ignore secrets that are not sensor locations.
    if not secret.name.startswith("SENSOR_LOCATION_JSON_"):
        continue
    sensor_id = secret.name.replace("SENSOR_LOCATION_JSON_", "")
    location_str = secrets.get_secret(secret.name).value
    # Skip secrets whose value is empty.
    if not location_str:
        continue
    locations[sensor_id] = json.loads(location_str)
print(f"Retrieved locations for {len(locations)} sensors from Hopsworks Secrets Manager.")

Retrieved locations for 104 sensors from Hopsworks Secrets Manager.


## 4.5. Weather Data Loading
Fetch recent weather data from feature store and convert date formats

In [17]:
# Diagnostic: read the whole weather feature group, report what it contains,
# then cut it down to the batch window (7 days back, 7 days ahead).
print("Checking weather feature group contents...")
all_weather = weather_fg.read()
print(f"Total weather records in feature group: {len(all_weather)}")

# Define the window up front so both branches leave these names set.
past_date = today - timedelta(days=7)
future_date = today + timedelta(days=7)

if len(all_weather) > 0:
    all_weather["date"] = pd.to_datetime(all_weather["date"]).dt.tz_localize(None)
    print(f"Date range in weather FG: {all_weather['date'].min()} to {all_weather['date'].max()}")
    print(f"Sample dates: {sorted(all_weather['date'].unique())[:10]}")
    print(f"Sensors: {all_weather['sensor_id'].nunique()}")

    # Compare the datetime64 column against Timestamps: ordering comparisons
    # between datetime64 data and plain datetime.date objects are rejected by
    # current pandas versions.
    batch_weather = all_weather[
        (all_weather["date"] >= pd.Timestamp(past_date))
        & (all_weather["date"] <= pd.Timestamp(future_date))
    ].copy()

    print(f"\nFiltered to {len(batch_weather)} records from {past_date} to {future_date}")
else:
    print("⚠️ Weather feature group is completely empty! Run notebook 2_feature_pipeline.ipynb")
    batch_weather = all_weather

Checking weather feature group contents...


ConnectionError: cannot receive data before headers

In [12]:
# Fetch weather data: historical (for context) + future (for forecasts)
past_date = today - timedelta(days=7)  # Get 7 days of historical data for feature engineering
future_date = today + timedelta(days=7)  # Get 7 days of future weather forecasts

# Prefer filtering inside the feature store; fall back to a full read plus a
# local filter if the filtered query fails.
# NOTE(review): the saved run of this cell returned 0 rows from the filtered query —
# confirm that the feature group accepts datetime.date bounds for `date`.
try:
    batch_weather = weather_fg.filter(
        (weather_fg.date >= past_date) & (weather_fg.date <= future_date)
    ).read()
    filtered_on_server = True
except Exception as exc:
    print(f"Filtered read failed ({exc!r}); reading the full feature group and filtering locally.")
    batch_weather = weather_fg.read()
    filtered_on_server = False

# Normalise to timezone-naive datetimes before any local comparison.
batch_weather["date"] = pd.to_datetime(batch_weather["date"]).dt.tz_localize(None)

if not filtered_on_server:
    # Timestamp bounds (inclusive on both ends) instead of datetime.date objects,
    # and an explicit copy so later column assignments do not touch a slice.
    in_window = batch_weather["date"].between(pd.Timestamp(past_date), pd.Timestamp(future_date))
    batch_weather = batch_weather.loc[in_window].copy()

print(f"Retrieved {len(batch_weather)} weather records from {past_date} to {future_date}")
print(f"Date range: {batch_weather['date'].min()} to {batch_weather['date'].max()}")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.63s) 
Retrieved 0 weather records from 2026-01-09 to 2026-01-23
Date range: NaT to NaT


In [13]:
# Sanity check: distinct dates present in the weather batch.
print(batch_weather['date'].unique())

<DatetimeArray>
[]
Length: 0, dtype: datetime64[us]


## 4.6. Air Quality Data Loading
Fetch recent air quality with error handling for missing data.

In [14]:
# Load air-quality rows from `past_date` onwards. Loading is best-effort: if it
# fails, continue with an empty frame so forecasting can still run on weather alone.
try:
    batch_airquality = air_quality_fg.filter(air_quality_fg.date >= past_date).read()
    batch_airquality["date"] = pd.to_datetime(batch_airquality["date"]).dt.tz_localize(None)
except Exception as exc:
    print(f"⚠️ Could not load air quality data ({exc!r}); continuing without observations.")
    # Keep the join keys and the target column so the later merge on
    # ["date", "sensor_id"] and lookups of "pm25" do not fail with KeyError.
    # NOTE(review): any other air-quality columns are absent in this fallback.
    batch_airquality = pd.DataFrame(
        {
            "date": pd.Series(dtype="datetime64[ns]"),
            "sensor_id": pd.Series(dtype="object"),
            "pm25": pd.Series(dtype="float64"),
        }
    )
print(f"Retrieved {len(batch_airquality)} air quality records from Hopsworks Feature Store.")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.44s) 
Retrieved 4 air quality records from Hopsworks Feature Store.


In [15]:
# Sanity check: distinct dates present in the air-quality batch.
print(batch_airquality['date'].unique())

<DatetimeArray>
['2026-01-14 00:00:00', '2026-01-11 00:00:00', '2026-01-12 00:00:00',
 '2026-01-13 00:00:00']
Length: 4, dtype: datetime64[us]


## 4.7. Model Retrieval
Download trained XGBoost models from Hopsworks model registry for each sensor and extract feature names.

In [16]:
mr = project.get_model_registry()

MODEL_NAME_TEMPLATE = "air_quality_xgboost_model_{sensor_id}"

# sensor_id -> (registry model entry, XGBRegressor wrapper, booster feature names)
retrieved_models = {}

for sensor_id in locations:
    model_name = MODEL_NAME_TEMPLATE.format(sensor_id=sensor_id)

    # Pick the highest registered version, if the registry lists this model at all.
    available_models = mr.get_models(name=model_name)
    retrieved_model = (
        max(available_models, key=lambda model: model.version) if available_models else None
    )
    if retrieved_model is None:
        print(f"No model found for sensor {sensor_id}, skipping...")
        continue

    saved_model_dir = retrieved_model.download()

    # Load the raw booster from model.json and attach it to a fresh
    # scikit-learn style wrapper so `.predict` can be called on DataFrames.
    booster = xgb.Booster()
    booster.load_model(saved_model_dir + "/model.json")
    xgb_model = XGBRegressor()
    xgb_model._Booster = booster

    retrieved_models[sensor_id] = (retrieved_model, xgb_model, booster.feature_names)

Downloading: 0.000%|          | 0/511002 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/176555 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/52520 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/51288 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/18885 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/480452 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/248025 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/53953 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/24401 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 4 files)... DONE

Downloading: 0.000%|          | 0/398228 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/138989 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/47811 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/45161 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/18378 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/411719 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/162619 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/52630 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/50124 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/20135 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/441144 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/161596 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/48212 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/46211 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/24382 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/377173 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/129945 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/53647 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/39271 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/21237 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/420338 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/248434 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/61477 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/51423 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/22356 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/454654 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/205583 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/51465 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/48384 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/18612 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/507248 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/247574 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/47602 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/45191 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/18977 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/424645 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/246813 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/53814 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/21711 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 4 files)... DONE

Downloading: 0.000%|          | 0/467773 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/130447 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/53762 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

Downloading: 0.000%|          | 0/47174 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 4 files)... 

Downloading: 0.000%|          | 0/20559 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 5 files)... DONE

Downloading: 0.000%|          | 0/487706 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/181727 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/55673 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... 

KeyboardInterrupt: 

In [None]:
# Report how many sensors ended up with a usable model.
print(f"Retrieved {len(retrieved_models)} models.")

## 4.8. Batch Prediction Loop
Merge weather and air quality data, iteratively predict PM2.5 values for forecast days, update engineered features after each prediction, and store results

In [None]:
# Plausibility bounds for PM2.5 forecasts.
# NOTE(review): presumably meant for clamping model output — confirm where they are applied.
PREDICTION_CAP_MAX = 150.0  # Maximum reasonable PM2.5 value
PREDICTION_CAP_MIN = 0.0    # Minimum reasonable PM2.5 value

In [None]:
# Left-join the air-quality rows onto the weather rows (every weather row is
# kept), then order the result chronologically within each sensor.
batch_data = batch_weather.merge(
    batch_airquality, how="left", on=["date", "sensor_id"]
).sort_values(["sensor_id", "date"])

In [None]:
# PM2.5-derived features that are recomputed after each forecast step; each one
# also gets a `predicted_<name>` output column in `batch_data`.
feature_cols = [
    "pm25_rolling_3d",
    "pm25_lag_1d",
    "pm25_lag_2d",
    "pm25_lag_3d",
    "pm25_nearby_avg",
]

In [None]:
# Output columns, all NaN until the prediction loop fills them in:
#   - predicted_pm25: the predicted PM2.5 value
#   - days_before_forecast_day: forecast horizon of the prediction
#   - predicted_<feature>: one column per entry in feature_cols
placeholder_cols = ["predicted_pm25", "days_before_forecast_day"] + [
    f"predicted_{col}" for col in feature_cols
]
for placeholder in placeholder_cols:
    batch_data[placeholder] = np.nan

# Days that need a forecast: rows dated today or later that have no PM2.5 value.
needs_forecast = batch_data["pm25"].isna() & (batch_data["date"] >= today.strftime("%Y-%m-%d"))

# Unique dates in ascending order, with missing dates dropped.
forecast_days = batch_data.loc[needs_forecast, "date"].dropna().sort_values().unique()

In [None]:
# Track sensors processed to prevent duplicates and count progress
# sensors_processed = set()
# warning_count = 0
# MAX_WARNINGS = 3  # Reduced to minimize output noise
# predictions_made = 0

In [None]:
# Sanity check: the dates that will be forecast by the loop below.
print(forecast_days)

In [None]:
# Iterative forecast: predict one day at a time for every sensor, write the
# prediction back into `pm25`, then recompute the PM2.5-derived features so the
# results for that day are recorded alongside the prediction.

# Sensors that had rows to forecast but no model in `retrieved_models`.
sensors_without_model = set()

for target_day in forecast_days:
    # Rows for this day that still lack a PM2.5 value, i.e. need a forecast.
    day_rows = batch_data.loc[
        (batch_data["date"] == target_day) & batch_data["pm25"].isna()
    ].copy()

    for _, row in day_rows.iterrows():
        sensor_id = row["sensor_id"]

        # A sensor may have no model (none registered, or retrieval did not
        # get that far); skip it instead of failing the whole run with KeyError.
        model_entry = retrieved_models.get(sensor_id)
        if model_entry is None:
            sensors_without_model.add(sensor_id)
            continue
        _, xgb_model, model_features = model_entry

        # Single-row frame with exactly the model's feature columns; anything
        # missing or non-numeric becomes NaN.
        features = row.reindex(model_features).to_frame().T.apply(pd.to_numeric, errors="coerce")
        raw_prediction = float(xgb_model.predict(features)[0])
        # Keep forecasts inside the plausibility bounds configured for this notebook.
        y_hat = float(np.clip(raw_prediction, PREDICTION_CAP_MIN, PREDICTION_CAP_MAX))

        idx = batch_data.index[(batch_data["sensor_id"] == sensor_id) & (batch_data["date"] == target_day)][0]
        batch_data.at[idx, "pm25"] = y_hat
        batch_data.at[idx, "predicted_pm25"] = y_hat
        # Horizon in days, counted so that a forecast for today is 1.
        batch_data.at[idx, "days_before_forecast_day"] = (target_day - pd.Timestamp(today)).days + 1

    # Recompute features for all sensors now that this day's values exist.
    # NOTE(review): the refreshed values are stored under predicted_<name> only. If the
    # models read the un-prefixed <name> columns, later days never see them — confirm.
    temp_df = batch_data.loc[batch_data["date"] <= target_day].copy()
    temp_df = feature_engineering.add_rolling_window_feature(
        temp_df, window_days=3, column="pm25", new_column="pm25_rolling_3d"
    )
    temp_df = feature_engineering.add_lagged_features(temp_df, column="pm25", lags=[1, 2, 3])
    temp_df = feature_engineering.add_nearby_sensor_feature(
        temp_df,
        locations,
        column="pm25_lag_1d",
        n_closest=3,
        new_column="pm25_nearby_avg",
    )

    current_rows = temp_df[temp_df["date"] == target_day]
    for _, row in current_rows.iterrows():
        sensor_id = row["sensor_id"]
        mask = (batch_data["sensor_id"] == sensor_id) & (batch_data["date"] == target_day)
        if mask.any():
            for col in feature_cols:
                batch_data.loc[mask, f"predicted_{col}"] = row[col]

if sensors_without_model:
    print(
        f"Skipped {len(sensors_without_model)} sensor(s) without a retrieved model: "
        f"{sorted(sensors_without_model, key=str)}"
    )

# Collect every row that received a prediction, with its horizon and features.
predictions = batch_data.loc[
    batch_data["predicted_pm25"].notna(),
    ["date", "sensor_id", "predicted_pm25", "days_before_forecast_day"]
    + [f"predicted_{col}" for col in feature_cols],
].reset_index(drop=True)
# Predictions were written into `pm25` only to drive the loop; clear them again
# for dates after today.
batch_data.loc[batch_data["date"] > pd.Timestamp(today), "pm25"] = np.nan

## 4.9. Save Predictions
Export prediction results to CSV file in models directory.

In [None]:
# batch_data.to_csv(f"{root_dir}/models/predictions.csv", columns=batch_data.columns, index=False)

Save the predictions to the feature store.

In [None]:
# Feature group holding one prediction row per sensor and date.
predictions_fg = fs.get_or_create_feature_group(
    name="air_quality_predictions",
    version=1,
    description="Daily PM2.5 predictions per sensor",
    primary_key=["sensor_id", "date"],
    event_time="date",
)

# Insert the predictions built by the prediction loop, unless there are none.
print(f"Inserting {len(predictions)} prediction rows into feature group...")
if predictions.empty:
    print("⚠️ No predictions to insert. Check if forecast_days is empty or prediction loop ran correctly.")
else:
    predictions_fg.insert(predictions, write_options={"wait_for_job": False})

## 4.10. Generate Forecast Plots
Create forecast visualization plots for each sensor and upload them to Hopsworks dataset storage.

In [None]:
# Render one forecast plot per sensor into models/<sensor_id>/images/, then
# upload all of them to the Hopsworks dataset folder Resources/airquality.
forecast_paths = []

for sensor_id, location in locations.items():
    sensor_forecast = predictions[predictions["sensor_id"] == sensor_id].copy()

    city, street = location["city"], location["street"]
    forecast_path = f"{root_dir}/models/{sensor_id}/images/forecast.png"
    Path(forecast_path).parent.mkdir(parents=True, exist_ok=True)

    visualization.plot_air_quality_forecast(
        city,
        street,
        sensor_forecast,
        forecast_path,
        hindcast=False,
    )
    # Close every open figure without depending on what the plotting helper
    # returns, so figures do not pile up across sensors.
    plt.close("all")
    forecast_paths.append((sensor_id, forecast_path))

dataset_api = project.get_dataset_api()
today_short = today.strftime("%Y-%m-%d")
if not dataset_api.exists("Resources/airquality"):
    dataset_api.mkdir("Resources/airquality")

for sensor_id, forecast_path in forecast_paths:
    dataset_api.upload(
        forecast_path,
        f"Resources/airquality/{sensor_id}_{today_short}_forecast.png",
        overwrite=True,
    )
print(f"Forecast plots available in Hopsworks under {project.get_url()}/settings/fb/path/Resources/airquality")

## 4.11. Insert Monitoring Data
Save predictions to monitoring feature group in Hopsworks for tracking.

In [None]:
# Monitoring feature group: one row per sensor, date and forecast horizon.
monitor_fg = fs.get_or_create_feature_group(
    name="aq_predictions",
    description="Air Quality prediction monitoring",
    version=1,
    primary_key=["sensor_id", "date", "days_before_forecast_day"],
    event_time="date",
)

# Only insert when there is something to insert.
if len(predictions) > 0:
    monitor_fg.insert(predictions, wait=True)
else:
    print("⚠️ No predictions to insert into the monitoring feature group.")

## 4.12. Hindcast Analysis
Compare the 1-day-ahead predictions with the PM2.5 values that were actually observed.

In [None]:
# One-day-ahead predictions logged in the monitoring feature group.
monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()
monitoring_df["date"] = pd.to_datetime(monitoring_df["date"]).dt.tz_localize(None)

# Observed PM2.5 values; copy the column subset so the date assignment below
# operates on its own frame rather than on a slice.
air_quality_df = air_quality_fg.read()[["date", "sensor_id", "pm25"]].copy()
air_quality_df["date"] = pd.to_datetime(air_quality_df["date"]).dt.tz_localize(None)

for sensor_id, location in locations.items():
    # Best-effort per sensor: a failure is reported and the loop moves on.
    try:
        sensor_preds = monitoring_df[monitoring_df["sensor_id"] == sensor_id][["date", "predicted_pm25"]]
        merged = sensor_preds.merge(
            air_quality_df[air_quality_df["sensor_id"] == sensor_id][["date", "pm25"]],
            on="date",
            how="inner",
        ).sort_values("date")

        city, street = location["city"], location["street"]
        hindcast_path = f"{root_dir}/models/{sensor_id}/images/hindcast_prediction.png"
        Path(hindcast_path).parent.mkdir(parents=True, exist_ok=True)

        # When no observation matches, plot the predictions alone with an
        # all-NaN `pm25` column.
        visualization.plot_air_quality_forecast(
            city,
            street,
            merged if not merged.empty else sensor_preds.assign(pm25=np.nan),
            hindcast_path,
            hindcast=True,
        )
        # Do not bind the helper's return value to `plt`: that would replace the
        # matplotlib.pyplot alias for the rest of the notebook. Closing all
        # figures works regardless of what the helper returns.
        plt.close("all")

        dataset_api.upload(
            hindcast_path,
            f"Resources/airquality/{sensor_id}_{today:%Y-%m-%d}_hindcast.png",
            overwrite=True,
        )

    except Exception as e:
        print(f"⚠️  Error processing hindcast for sensor {sensor_id}: {e}")

## 4.13 IDW Heatmap
IDW - Inverse Distance Weighting

### 4.13.1 IDW interpolation function

In [None]:
def idw_interpolation(points, values, grid_points, lon_mesh, power=2):
    """Interpolate scattered values onto a grid with inverse distance weighting.

    Parameters
    ----------
    points : array-like, shape (n_points, n_dims)
        Coordinates of the known data points.
    values : array-like, shape (n_points,)
        Value at each known point. Non-finite entries (NaN/inf) are ignored
        together with their points, so one missing reading does not turn the
        whole grid into NaN.
    grid_points : array-like, shape (n_grid, n_dims)
        Coordinates at which to interpolate.
    lon_mesh : array-like
        Only its shape is used: the result is reshaped to match it, so it must
        hold ``n_grid`` elements.
    power : float, default 2
        Exponent applied to the distance; larger values give nearby points
        more influence.

    Returns
    -------
    numpy.ndarray
        Interpolated values with the shape of ``lon_mesh``. All NaN when no
        finite value is available.
    """
    points = np.asarray(points, dtype=float)
    values = np.asarray(values, dtype=float)
    grid_shape = np.shape(lon_mesh)

    # Drop known points that carry no usable value.
    usable = np.isfinite(values)
    if not usable.any():
        return np.full(grid_shape, np.nan)
    points, values = points[usable], values[usable]

    # Distances between every grid point and every known point.
    distances = cdist(grid_points, points)
    # A grid point that coincides with a known point would divide by zero;
    # a tiny distance makes that point dominate the weighted average instead.
    distances = np.where(distances == 0, 1e-10, distances)
    weights = 1.0 / (distances ** power)
    # Weighted average of the known values for each grid point.
    interpolated = (weights @ values) / weights.sum(axis=1)
    return interpolated.reshape(grid_shape)

In [None]:
# Grid bounds are the first four values of frontend/coordinates.json, in file order.
# The context manager closes the file handle once the JSON is parsed.
with open(f"{root_dir}/frontend/coordinates.json") as coordinates_file:
    grid_bounds = tuple(list(json.load(coordinates_file).values())[:4])
# grid_bounds = map_bounds[1], map_bounds[0], map_bounds[3], map_bounds[2]  # lat_min, lat_max, lon_min, lon_max
print(grid_bounds)

In [None]:
import sys
from pathlib import Path
import hopsworks
import warnings

warnings.filterwarnings("ignore", module="IPython")

def clone_repository(username: str) -> None:
    """Ensure the project repository is checked out and change into it.

    If a ``pm25-forecast-openmeteo-aqicn`` directory exists relative to the
    current working directory it is reused; otherwise the repository is cloned
    from the GitHub account ``username``. In both cases the working directory
    is then changed to that directory.

    Relies on IPython syntax (``%cd``, ``!git``), so it only works inside an
    IPython/Jupyter kernel. ``username`` is interpolated into the clone URL as
    text, so it must be a plain string.
    """
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/{username}/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    """Upgrade ``uv`` with pip, then install the requirements from ``pyproject.toml``.

    The install runs ``uv pip install`` with ``--all-extras`` and ``--system``.
    Commands are issued through IPython's ``!`` syntax, so this only works
    inside an IPython/Jupyter kernel.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


# Derive the project root from the working directory, stepping up one level
# whenever the current last path component is one of the known sub-folders.
root_dir = Path().absolute()
for folder in ("src", "airquality", "notebooks"):
    if root_dir.parts[-1:] == (folder,):
        root_dir = Path(*root_dir.parts[:-1])
root_dir = str(root_dir)

if root_dir not in sys.path:
    sys.path.append(root_dir)

from utils import config

settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
# Unwrap the secret, as the setup cell at the top of the notebook does: the
# username is interpolated into the clone URL and must be the plain string,
# not the secret wrapper object.
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()
clone_repository(GITHUB_USERNAME)

In [None]:
# interpolation_dir = f"{root_dir}/models/interpolation"
# if not os.path.exists(interpolation_dir):
#     os.mkdir(interpolation_dir)

# # Use predictions DataFrame which contains all forecast days with PM2.5 values
# interpolation_df = predictions.copy()

In [None]:

# import os

# Resolve the output directory from the project root. `__file__` is not defined
# inside a notebook kernel, so it cannot be used to locate the project here.
interpolation_dir = os.path.join(str(root_dir), "models", "interpolation")

# Ensure the directory exists
os.makedirs(interpolation_dir, exist_ok=True)

# Use predictions DataFrame which contains all forecast days with PM2.5 values
interpolation_df = predictions.copy()

In [None]:

# Add today's rows from batch_data (with their `pm25` values) in front of the predictions.
today_actual = batch_data[batch_data["date"] == today_short].copy()

if not today_actual.empty:
    # Keep only the columns the plotting function needs, where present.
    wanted_cols = ["date", "sensor_id", "pm25", "predicted_pm25"]
    today_actual = today_actual[[col for col in wanted_cols if col in today_actual.columns]]
    interpolation_df = pd.concat([today_actual, interpolation_df], ignore_index=True)

# Summarise the frame once. DataFrame.info() prints by itself and returns None,
# so wrapping it in print() only added a stray "None" on every iteration.
interpolation_df.info()

# One heatmap per distinct date, numbered by its position in date order.
for i, forecast_date in enumerate(sorted(interpolation_df["date"].unique())):
    forecast_date_short = forecast_date.strftime("%Y-%m-%d")
    output_png = f"{interpolation_dir}/forecast_interpolation_{i}d.png"

    # NOTE(review): plot_pm25_idw_heatmap is neither defined nor imported in the
    # visible part of this notebook — confirm where it comes from.
    plot_pm25_idw_heatmap(
        interpolation_df,
        locations,
        forecast_date,
        output_png,
    )

    dataset_api.upload(
        output_png,
        f"Resources/airquality/interpolation_{today_short}_{forecast_date_short}.png",
        overwrite=True,
    )