In [33]:
import sys
from pathlib import Path
import warnings

# Ignore warnings triggered from modules whose name starts with "IPython".
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the active IPython shell is a Google Colab shell.

    The check looks for the substring ``google.colab`` in the string form of
    the object returned by ``get_ipython()``.
    """
    return "google.colab" in str(get_ipython())

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into the cloned directory.

    Uses IPython shell (``!``) and line-magic (``%``) syntax, so it only
    works inside an IPython/Jupyter session.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's dependencies with ``uv``.

    Upgrades ``uv`` through pip, then installs everything declared in
    ``pyproject.toml`` (all extras) into the system Python environment.
    ``pyproject.toml`` is given as a relative path, so it must be in the
    current working directory.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the project root: clone and install on Colab, otherwise walk up
# from a nested working directory inside a local checkout.
if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip a trailing "src", "airquality" and "notebooks" component, checked
    # in that order, when the working directory is nested inside the project.
    for nested_dir in ("src", "airquality", "notebooks"):
        if root_dir.name == nested_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Put the project root on the import path so `utils` can be imported below.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

from utils import config

# Load the settings from the .env file located at the project root.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/max/Repos/KTH/pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


## Imports

In [34]:
import datetime
import pandas as pd
import hopsworks
import json
import warnings
import pandas as pd
from utils import airquality

# Suppress every warning, from any module, for the rest of the session.
warnings.filterwarnings("ignore")

## Setup

In [35]:
# Reference date used for today's air quality reading.
today = datetime.date.today()

# Log in to Hopsworks and get a handle on the project's feature store.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()

# The AQICN API key is read from the Hopsworks secrets store.
secrets = hopsworks.get_secrets_api()
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

# Handles to version 1 of the air quality and weather feature groups.
air_quality_fg = fs.get_feature_group(name="air_quality_all", version=1)
weather_fg = fs.get_feature_group(name="weather_all", version=1)

2025-11-17 14:39:23,096 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-11-17 14:39:23,099 INFO: Initializing external client
2025-11-17 14:39:23,099 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-17 14:39:24,519 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279179


Set `SENSOR_CSV_FILE` in `.env` to the relative path of a single sensor's CSV file to process only that sensor, or leave it unset to process all sensors in the `data` folder.

In [36]:
sensor_csv_file = getattr(settings, "SENSOR_CSV_FILE", None)

if not sensor_csv_file:
    # Batch mode: build one entry per SENSOR_LOCATION_JSON_<sensor_id> secret.
    locations = {}
    for secret in secrets.get_secrets():
        if not secret.name.startswith("SENSOR_LOCATION_JSON_"):
            continue
        sensor_id = secret.name.replace("SENSOR_LOCATION_JSON_", "")
        location_str = secrets.get_secret(secret.name).value
        # Secrets with an empty value are skipped.
        if location_str:
            locations[sensor_id] = json.loads(location_str)
else:
    # Single sensor mode: the sensor id is the last of the six values returned
    # for the CSV file, and it selects the one secret to read.
    _, _, _, _, _, sensor_id = airquality.read_sensor_data(sensor_csv_file)
    location_str = secrets.get_secret(f"SENSOR_LOCATION_JSON_{sensor_id}").value
    locations = {sensor_id: json.loads(location_str)}


## Helper Methods

In [37]:
def get_daily_weather_forecast(city, latitude, longitude):
    """Return one forecast row per day, taken from the rows around noon.

    Fetches the hourly forecast via ``airquality.get_hourly_weather_forecast``,
    keeps the rows whose time of day lies between 11:59 and 12:01, reduces the
    ``date`` column to midnight of the same day, and adds a ``city`` column.

    Args:
        city: City name, forwarded to the forecast helper and stored in the
            ``city`` column of the result.
        latitude: Latitude forwarded to the forecast helper.
        longitude: Longitude forwarded to the forecast helper.

    Returns:
        A DataFrame with the selected rows and a fresh default index.
    """
    noon_df = (
        airquality.get_hourly_weather_forecast(city, latitude, longitude)
        .set_index("date")
        .between_time("11:59", "12:01")
        .reset_index()
    )
    # Go through plain `date` objects to drop the time of day, then convert back
    # to datetimes.
    noon_df["date"] = pd.to_datetime(pd.to_datetime(noon_df["date"]).dt.date)
    noon_df["city"] = city
    return noon_df


def fetch_data_for_location(location):
    """Fetch today's PM2.5 reading and the daily weather forecast for a location.

    Args:
        location: Mapping with the keys ``country``, ``city``, ``street``,
            ``aqicn_url``, ``latitude`` and ``longitude``.

    Returns:
        A tuple ``(aq_today_df, daily_df)``: the result of
        ``airquality.get_pm25`` followed by the result of
        ``get_daily_weather_forecast``.

    Raises:
        KeyError: If ``location`` lacks one of the keys above. All keys are
            read before either fetch is started.
    """
    country, city, street, aqicn_url, latitude, longitude = (
        location[key]
        for key in ("country", "city", "street", "aqicn_url", "latitude", "longitude")
    )

    # Uses the notebook-level `today` and `AQICN_API_KEY`.
    return (
        airquality.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY),
        get_daily_weather_forecast(city, latitude, longitude),
    )

## Script

In [38]:
# Collect one air quality frame and one weather forecast frame per sensor.
aqs, weathers = [], []
for sensor, location in locations.items():
    aq_today_df, weather_daily_forecast_df = fetch_data_for_location(location)

    # Air quality FG shape: sensor and location metadata plus a datetime `date`.
    aq_today_df = aq_today_df.assign(
        sensor_id=str(sensor),
        street=location["street"],
        city=location["city"],
        country=location["country"],
        feed_url=location["aqicn_url"],
        date=lambda frame: pd.to_datetime(frame["date"]),
    )

    # Weather FG shape: sensor id and coordinates plus a datetime `date`.
    weather_daily_forecast_df = weather_daily_forecast_df.assign(
        sensor_id=str(sensor),
        city=location["city"],
        latitude=location["latitude"],
        longitude=location["longitude"],
        date=lambda frame: pd.to_datetime(frame["date"]),
    )

    aqs.append(aq_today_df)
    weathers.append(weather_daily_forecast_df)

In [45]:
# Stack today's readings from every sensor and normalise the column types:
# pm25 becomes float64 (unparseable values turn into NaN), date becomes a
# timezone-naive datetime, and the `url` column is dropped when present.
aq_df = pd.concat(aqs)
aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce").astype("float64")
aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
aq_df = aq_df.drop(columns=["url"], errors="ignore")

# Get historical data for rolling window and lagged features
historical_start = today - datetime.timedelta(days=4)
historical_df = pd.DataFrame()

# Read historical data from the feature group and keep the 4 days before today.
# The read stays best-effort, but a failure is now reported instead of being
# silently swallowed, and the frame is reset so that a read that failed halfway
# through filtering is never used downstream.
try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        historical_df = historical_df[
            (historical_df["date"].dt.date >= historical_start) & (historical_df["date"].dt.date < today)
        ][["date", "sensor_id", "pm25"]]
except Exception as exc:
    historical_df = pd.DataFrame()
    # print() rather than warnings.warn(): this notebook installs a blanket
    # "ignore" warning filter, so a warning would never be shown.
    print(
        f"WARNING: could not load historical air quality data: {exc!r}. "
        "Continuing without history."
    )


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.69s) 


In [46]:
# Put the recent history (when there is any) in front of today's readings so
# the window and lag features have earlier rows to work from.
if historical_df.empty:
    combined_df = aq_df
else:
    combined_df = pd.concat([historical_df, aq_df], ignore_index=True)

combined_df = airquality.add_rolling_window_feature(
    combined_df, window_days=3, column="pm25", new_column="pm25_rolling_3d"
)
combined_df = airquality.add_lagged_features(combined_df, column="pm25", lags=[1, 2, 3])
combined_df = airquality.add_nearby_sensor_feature(
    combined_df, locations, column="pm25_lag_1d", n_closest=3
)

# Keep only today's rows.
is_today = combined_df["date"].dt.date == today
aq_df = combined_df[is_today].copy()
aq_df

Unnamed: 0,date,sensor_id,pm25,country,city,street,feed_url,pm25_rolling_3d,pm25_lag_1d,pm25_lag_2d,pm25_lag_3d,pm25_nearby_avg
68,2025-11-17,112672,5.0,Sweden,Gothenburg,Bågskyttegatan,https://api.waqi.info/feed/A112672/,1.726667,1.4,1.65,2.13,0.7
56,2025-11-17,154549,2.0,Sweden,Västra Göteborg,Järnbrottsgatan,https://api.waqi.info/feed/A154549/,1.2,1.0,1.2,1.4,0.7
66,2025-11-17,194215,1.0,Sweden,Torslanda,Norra Sävviksvägen,https://api.waqi.info/feed/A194215/,0.803333,0.73,0.7,0.98,0.8
59,2025-11-17,404209,3.0,Sweden,Lindome,Högkullevägen,https://api.waqi.info/feed/A404209/,1.016667,0.95,1.3,0.8,0.8
60,2025-11-17,59095,2.0,Sweden,Mölndal,Eklanda Slätt,https://api.waqi.info/feed/A59095/,1.09,1.1,1.0,1.17,0.8
69,2025-11-17,59893,4.0,Sweden,Lundby,Londongatan,https://api.waqi.info/feed/A59893/,0.9,0.8,0.8,1.1,0.8
61,2025-11-17,60535,2.0,Sweden,Majorna-Linné,Annedal,https://api.waqi.info/feed/A60535/,1.433333,0.93,1.57,1.8,0.65
57,2025-11-17,60541,12.0,Sweden,Majorna-Linné,Prinsgatan,https://api.waqi.info/feed/A60541/,,,,,0.743333
62,2025-11-17,60853,1.0,Sweden,Majorna-Linné,Masthugget,https://api.waqi.info/feed/A60853/,0.81,0.6,0.83,1.0,0.815
64,2025-11-17,61714,2.0,Sweden,Norra Hisingen,Nyhemsgatan,https://api.waqi.info/feed/A61714/,0.963333,0.79,1.0,1.1,0.9


In [47]:
# Stack the per-sensor forecasts into one frame with a datetime `date` column.
weather_df = pd.concat(weathers).assign(date=lambda frame: pd.to_datetime(frame["date"]))
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city,sensor_id,latitude,longitude
0,2025-11-17,1.60,0.0,4.072935,315.000092,Västra Göteborg,154549,57.67800,11.910000
1,2025-11-18,1.40,0.1,1.527351,135.000107,Västra Göteborg,154549,57.67800,11.910000
2,2025-11-19,-0.45,0.0,9.673221,44.999897,Västra Göteborg,154549,57.67800,11.910000
3,2025-11-20,-0.50,0.0,18.678415,27.552727,Västra Göteborg,154549,57.67800,11.910000
4,2025-11-21,-3.05,0.0,0.804985,333.435028,Västra Göteborg,154549,57.67800,11.910000
...,...,...,...,...,...,...,...,...,...
2,2025-11-19,-0.60,0.0,9.673221,44.999897,Centrum,69724,57.67894,11.976321
3,2025-11-20,-0.65,0.0,18.678415,27.552727,Centrum,69724,57.67894,11.976321
4,2025-11-21,-3.20,0.0,0.804985,333.435028,Centrum,69724,57.67894,11.976321
5,2025-11-22,4.65,0.0,15.277749,214.439041,Centrum,69724,57.67894,11.976321


In [48]:
# Write today's air quality rows and the weather forecast rows to their
# feature groups. The second call is the cell's last expression, so its return
# value is what the cell displays.
air_quality_fg.insert(aq_df)
weather_fg.insert(weather_df)

2025-11-17 14:42:33,980 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279179/fs/1265797/fg/1721857


Uploading Dataframe: 100.00% |██████████| Rows 15/15 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: air_quality_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279179/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-11-17 14:42:47,605 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279179/fs/1265797/fg/1718803


Uploading Dataframe: 100.00% |██████████| Rows 105/105 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279179/jobs/named/weather_all_1_offline_fg_materialization/executions


(Job('weather_all_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "precipitation_sum",
           "min_value": -0.1,
           "max_value": 1000.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 758940
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 105,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-17T01:42:47.000605Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_c