# 2. Feature Pipeline

In [None]:
# Fix numpy compatibility issue in Colab
import subprocess
import sys
from pathlib import Path
import warnings

warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the code is running inside a Google Colab kernel.

    Returns:
        True when the string form of the active IPython shell mentions
        "google.colab"; False otherwise, including when `get_ipython` is not
        defined at all (for example in a plain Python interpreter).
    """
    try:
        shell = get_ipython()
    except NameError:
        # `get_ipython` only exists when IPython has injected it. Catching just
        # NameError (instead of a bare `except`) keeps KeyboardInterrupt,
        # SystemExit and genuine errors visible.
        return False
    return "google.colab" in str(shell)

def clone_repository() -> None:
    # Check if repository already exists
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    # Install the project's dependencies by shelling out from the notebook.
    # Both commands run relative to the current working directory, so
    # pyproject.toml must be reachable from there.

    # Step 1: install (or upgrade) the `uv` installer itself via pip.
    !pip install --upgrade uv
    # Step 2: let uv install what pyproject.toml declares, with every optional
    # extra (--all-extras), targeting the system environment (--system).
    !uv pip install --all-extras --system --requirement pyproject.toml

def fix_numpy_compatibility():
    """Force-reinstall numpy 1.24.3 with pip for the running interpreter.

    Prints a progress line before and after the install. The new numpy is
    only picked up after the runtime is restarted, as the final message says.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    print("Fixing numpy compatibility...")
    pip_command = [
        sys.executable,  # use the pip that belongs to this interpreter
        "-m",
        "pip",
        "install",
        "--force-reinstall",
        "numpy==1.24.3",
    ]
    subprocess.check_call(pip_command)
    print("Numpy fixed. Please restart runtime and run again.")


# Work out the project root (`root_dir`, as a string) and put it on sys.path so
# that project-local packages can be imported from the notebook.
if is_google_colab():
    try:
        import numpy
        # Smoke-test numpy before doing anything else; the handler below deals
        # with the "numpy.dtype size changed" ValueError.
        numpy.array([1, 2, 3])
        print("Numpy working correctly")

        clone_repository()
        install_dependencies()
        # clone_repository() leaves the working directory inside the checkout.
        root_dir = str(Path().absolute())
        print("Google Colab environment")
    except ValueError as e:
        if "numpy.dtype size changed" not in str(e):
            # Any other ValueError is not ours to handle. Swallowing it would
            # leave `root_dir` undefined and surface later as a confusing
            # NameError, so let the real error propagate.
            raise
        fix_numpy_compatibility()
        # The reinstalled numpy only takes effect after a runtime restart.
        raise SystemExit("Please restart runtime (Runtime > Restart runtime) and run the notebook again.")

else:
    root_dir = Path().absolute()
    # Walk up out of a trailing src/, then airquality/, then notebooks/
    # directory (each at most once, in that order) to reach the project root.
    for trailing_dir in ("src", "airquality", "notebooks"):
        if root_dir.name == trailing_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")

print(f"Root dir: {root_dir}")

if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

from utils import config
import os

if not is_google_colab():
    # Local development: read the settings from the .env file in the project root.
    settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
else:
    # Colab: credentials are read through google.colab.userdata.
    # NOTE(review): `settings` is never assigned on this path, although later
    # cells read it — confirm how the Colab flow is meant to continue.
    from google.colab import userdata
    import hopsworks

    # Log in to Hopsworks with the stored API key.
    project = hopsworks.login(
        api_key_value=userdata.get('HOPSWORKS_API_KEY'),
        engine="python",
    )

    # Key used for the AQICN air quality API.
    AQICN_API_KEY = userdata.get('AQICN_API_KEY')

In [None]:
# import sys
# from pathlib import Path
# import warnings

# warnings.filterwarnings("ignore", module="IPython")

# def is_google_colab() -> bool:
#     if "google.colab" in str(get_ipython()):
#         return True
#     return False

# def clone_repository() -> None:
#     !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
#     %cd pm25-forecast-openmeteo-aqicn

# def install_dependencies() -> None:
#     !pip install --upgrade uv
#     !uv pip install --all-extras --system --requirement pyproject.toml

# if is_google_colab():
#     clone_repository()
#     install_dependencies()
#     root_dir = str(Path().absolute())
#     print("Google Colab environment")
# else:
#     root_dir = Path().absolute()
#     if root_dir.parts[-1:] == ("src",):
#         root_dir = Path(*root_dir.parts[:-1])
#     if root_dir.parts[-1:] == ("airquality",):
#         root_dir = Path(*root_dir.parts[:-1])
#     if root_dir.parts[-1:] == ("notebooks",):
#         root_dir = Path(*root_dir.parts[:-1])
#     root_dir = str(root_dir)
#     print("Local environment")

# print(f"Root dir: {root_dir}")

# if root_dir not in sys.path:
#     sys.path.append(root_dir)
#     print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# from utils import config

# settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
Added the following directory to the PYTHONPATH: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


## Imports

In [2]:
import datetime
import pandas as pd
import hopsworks
import json
import warnings
import pandas as pd
from utils import airquality

from dotenv import load_dotenv

# Silence every warning for the rest of the session (no category or module
# filter is given, so this is global).
warnings.filterwarnings("ignore")
# Load variables from a .env file into the process environment. As the last
# expression of the cell, the call's return value is what the notebook displays.
load_dotenv()

True

## Setup

In [3]:
# Connect to Hopsworks and fetch the handles the rest of the notebook uses:
# `project`, `fs` (feature store), `secrets`, `AQICN_API_KEY`, and the two
# feature groups. Every remote call is bracketed by timestamped [DEBUG] prints
# so a slow step can be identified from the cell output.

# Read the API key from `settings` (None when the attribute is absent). If the
# value exposes get_secret_value(), unwrap it to get the underlying value.
HOPSWORKS_API_KEY = getattr(settings, 'HOPSWORKS_API_KEY', None)
if HOPSWORKS_API_KEY is not None and hasattr(HOPSWORKS_API_KEY, 'get_secret_value'):
    HOPSWORKS_API_KEY = HOPSWORKS_API_KEY.get_secret_value()

# Run date; later cells use it to fetch and filter "today's" rows.
today = datetime.date.today()
print(f"[DEBUG] Starting hopsworks login at {datetime.datetime.now()}")
project = hopsworks.login(engine="python", api_key_value=HOPSWORKS_API_KEY)
print(f"[DEBUG] Hopsworks login successful at {datetime.datetime.now()}")

print(f"[DEBUG] Getting feature store at {datetime.datetime.now()}")
fs = project.get_feature_store()
print(f"[DEBUG] Feature store retrieved at {datetime.datetime.now()}")

# The AQICN key is stored as a Hopsworks secret rather than in local settings.
print(f"[DEBUG] Getting secrets API at {datetime.datetime.now()}")
secrets = hopsworks.get_secrets_api()
print(f"[DEBUG] Getting AQICN_API_KEY secret at {datetime.datetime.now()}")
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
print(f"[DEBUG] AQICN_API_KEY retrieved at {datetime.datetime.now()}")

# Retrieve feature groups (version 1 of each) that this pipeline inserts into.
print(f"[DEBUG] Getting air_quality_all feature group at {datetime.datetime.now()}")
air_quality_fg = fs.get_feature_group(
    name="air_quality_all",
    version=1,
)
print(f"[DEBUG] air_quality_all feature group retrieved at {datetime.datetime.now()}")

print(f"[DEBUG] Getting weather_all feature group at {datetime.datetime.now()}")
weather_fg = fs.get_feature_group(
    name="weather_all",
    version=1,
)
print(f"[DEBUG] weather_all feature group retrieved at {datetime.datetime.now()}")

[DEBUG] Starting hopsworks login at 2025-11-26 10:40:22.717085
2025-11-26 10:40:22,717 INFO: Initializing external client
2025-11-26 10:40:22,726 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-26 10:40:24,301 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184
[DEBUG] Hopsworks login successful at 2025-11-26 10:40:25.118627
[DEBUG] Getting feature store at 2025-11-26 10:40:25.118627
[DEBUG] Feature store retrieved at 2025-11-26 10:40:25.284604
[DEBUG] Getting secrets API at 2025-11-26 10:40:25.284604
[DEBUG] Getting AQICN_API_KEY secret at 2025-11-26 10:40:25.284604
[DEBUG] AQICN_API_KEY retrieved at 2025-11-26 10:40:25.428312
[DEBUG] Getting air_quality_all feature group at 2025-11-26 10:40:25.433032
[DEBUG] air_quality_all feature group retrieved at 2025-11-26 10:40:25.968850
[DEBUG] Getting weather_all feature group at 2025-11-26 10:40:25.968850
[DEBUG] weather_all feature group retrieved at 2025-11-26 10:40:26.501489


Set `SENSOR_CSV_FILE` in `.env` to the relative path of a single sensor's CSV file to process only that sensor, or leave it unset to process all sensors in the `data` folder.

In [4]:
# Build `locations`: a dict mapping sensor id -> location dict parsed from the
# JSON stored in that sensor's Hopsworks secret.

# Location secrets are named <prefix><sensor_id>.
SENSOR_LOCATION_SECRET_PREFIX = "SENSOR_LOCATION_JSON_"

sensor_csv_file = getattr(settings, 'SENSOR_CSV_FILE', None)

if sensor_csv_file:
    # Single-sensor mode: only the sensor id (last element of the returned
    # tuple) is needed from the CSV reader; it selects the one secret to read.
    _, _, _, _, _, sensor_id = airquality.read_sensor_data(sensor_csv_file)
    secret_name = f"{SENSOR_LOCATION_SECRET_PREFIX}{sensor_id}"
    location_str = secrets.get_secret(secret_name).value
    locations = {sensor_id: json.loads(location_str)}
else:
    # Batch mode: pick up every secret whose name carries the location prefix.
    all_secrets = secrets.get_secrets()
    locations = {}
    for secret in all_secrets:
        if not secret.name.startswith(SENSOR_LOCATION_SECRET_PREFIX):
            continue
        # Slice the prefix off instead of using str.replace(), which would
        # also delete any later occurrence of the prefix text inside the name.
        sensor_id = secret.name[len(SENSOR_LOCATION_SECRET_PREFIX):]
        location_str = secrets.get_secret(secret.name).value
        # Secrets with an empty value are skipped.
        if location_str:
            locations[sensor_id] = json.loads(location_str)


## Helper Methods

In [5]:
def get_daily_weather_forecast(city, latitude, longitude):
    """Reduce the hourly weather forecast for a location to one row per day.

    Fetches the hourly forecast via `airquality.get_hourly_weather_forecast`,
    keeps only the rows whose time of day lies between 11:59 and 12:01, and
    truncates their `date` values to midnight of the same day.

    Args:
        city: Passed to the hourly forecast call and written to the `city` column.
        latitude: Passed through to the hourly forecast call.
        longitude: Passed through to the hourly forecast call.

    Returns:
        A DataFrame with the selected rows, a day-resolution `date` column and
        a `city` column.
    """
    forecast_hourly = airquality.get_hourly_weather_forecast(city, latitude, longitude)
    # between_time() filters on the index, hence the set_index/reset_index pair.
    noon_rows = (
        forecast_hourly
        .set_index("date")
        .between_time("11:59", "12:01")
        .reset_index()
    )
    # Drop the time-of-day component, then convert back to datetime values.
    noon_rows["date"] = pd.to_datetime(pd.to_datetime(noon_rows["date"]).dt.date)
    noon_rows["city"] = city
    return noon_rows


def fetch_data_for_location(location):
    """Fetch today's PM2.5 reading and the daily weather forecast for one location.

    Args:
        location: Mapping with the keys "country", "city", "street",
            "aqicn_url", "latitude" and "longitude".

    Returns:
        A tuple `(aq_today_df, daily_df)`: the result of `airquality.get_pm25`
        for the notebook-level `today`, and the result of
        `get_daily_weather_forecast` for the location's city and coordinates.

    Raises:
        KeyError: if `location` lacks one of the required keys (checked before
            any data is fetched).
    """
    # All keys are looked up first, in this order, before any fetching starts.
    country, city, street, aqicn_url, latitude, longitude = (
        location[key]
        for key in ("country", "city", "street", "aqicn_url", "latitude", "longitude")
    )
    return (
        airquality.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY),
        get_daily_weather_forecast(city, latitude, longitude),
    )

## Script

In [6]:
# Collect one air quality frame and one weather forecast frame per sensor,
# each tagged with the sensor id and location metadata.
aqs, weathers = [], []
for sensor, location in locations.items():
    aq_today_df, weather_daily_forecast_df = fetch_data_for_location(location)
    sensor_key = str(sensor)

    # Air quality frame: add identifying columns, then make `date` datetime-typed.
    aq_today_df = aq_today_df.assign(
        **{
            "sensor_id": sensor_key,
            "street": location["street"],
            "city": location["city"],
            "country": location["country"],
            "feed_url": location["aqicn_url"],
        }
    ).assign(date=lambda frame: pd.to_datetime(frame["date"]))
    aqs.append(aq_today_df)

    # Weather frame: add identifying columns, then make `date` datetime-typed.
    weather_daily_forecast_df = weather_daily_forecast_df.assign(
        **{
            "sensor_id": sensor_key,
            "city": location["city"],
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }
    ).assign(date=lambda frame: pd.to_datetime(frame["date"]))
    weathers.append(weather_daily_forecast_df)

In [7]:
# Stack today's per-sensor readings and normalise their types.
aq_df = pd.concat(aqs)
# Non-numeric pm25 values become NaN instead of raising.
aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce").astype("float64")
# Drop any timezone so the dates compare cleanly with the historical rows below.
aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
aq_df = aq_df.drop(columns=["url"], errors="ignore")

# Get historical data for rolling window and lagged features
historical_start = today - datetime.timedelta(days=4)
historical_df = pd.DataFrame()

# Read historical data from the feature group and keep only the last 4 days
# (today excluded). This stays best-effort: on any failure the notebook goes on
# without history, but the failure is reported and `historical_df` is reset so
# a half-processed or unfiltered frame can never leak into later cells.
try:
    history_all = air_quality_fg.read()
    if not history_all.empty:
        history_all["date"] = pd.to_datetime(history_all["date"]).dt.tz_localize(None)
        history_days = history_all["date"].dt.date
        in_window = (history_days >= historical_start) & (history_days < today)
        history_all = history_all.loc[in_window, ["date", "sensor_id", "pm25"]]
    historical_df = history_all
except Exception as exc:
    historical_df = pd.DataFrame()
    # print() rather than warnings.warn(): warnings are globally silenced in this notebook.
    print(f"Warning: could not load historical air quality data ({exc!r}); continuing without history.")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.53s) 


In [8]:
# Combine historical + new data and calculate rolling window
combined_df = pd.concat([historical_df, aq_df], ignore_index=True) if not historical_df.empty else aq_df
combined_df = airquality.add_rolling_window_feature(combined_df, window_days=3, column="pm25", new_column="pm25_rolling_3d")
combined_df = airquality.add_lagged_features(combined_df, column="pm25", lags=[1, 2, 3])
combined_df = airquality.add_nearby_sensor_feature(combined_df, locations, column="pm25_lag_1d", n_closest=3)
aq_df = combined_df[combined_df["date"].dt.date == today].copy()
aq_df

Unnamed: 0,date,sensor_id,pm25,country,city,street,feed_url,pm25_rolling_3d,pm25_lag_1d,pm25_lag_2d,pm25_lag_3d,pm25_nearby_avg
42,2025-11-26,112672,39.0,Sweden,Gothenburg,Bågskyttegatan,https://api.waqi.info/feed/A112672/,,,,,
50,2025-11-26,129124,0.0,Sweden,Södermalms stadsdelsområde,Rosenlundsgatan,https://api.waqi.info/feed/A129124/,0.333333,0.4,0.1,0.5,3.016667
36,2025-11-26,154549,37.0,Sweden,Västra Göteborg,Järnbrottsgatan,https://api.waqi.info/feed/A154549/,,,,,
43,2025-11-26,180187,0.0,Sweden,Kungsholmens stadsdelsområde,Kronobergsgatan,https://api.waqi.info/feed/A180187/,0.166667,0.2,0.1,0.2,2.296667
44,2025-11-26,194215,54.0,Sweden,Torslanda,Norra Sävviksvägen,https://api.waqi.info/feed/A194215/,,,,,
51,2025-11-26,252352,26.0,Sweden,Hägersten-Älvsjö stadsdelsområde,Rödhakevägen,https://api.waqi.info/feed/A252352/,5.986667,10.43,4.53,3.0,3.786667
33,2025-11-26,404209,10.0,Sweden,Lindome,Högkullevägen,https://api.waqi.info/feed/A404209/,,,,,
37,2025-11-26,474841,0.0,Sweden,Hägersten-Älvsjö stadsdelsområde,ybohovsbacken-48a,https://api.waqi.info/feed/A474841/,1.053333,1.21,1.95,0.0,1.96
46,2025-11-26,59095,19.0,Sweden,Mölndal,Eklanda Slätt,https://api.waqi.info/feed/A59095/,,,,,
38,2025-11-26,59893,5.0,Sweden,Lundby,Londongatan,https://api.waqi.info/feed/A59893/,,,,,


In [9]:
# Stack the per-sensor forecasts, make `date` datetime-typed, and cast the
# numeric weather columns to float32 to match the Hopsworks feature group schema.
weather_df = (
    pd.concat(weathers)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .astype(
        {
            "temperature_2m_mean": "float32",
            "precipitation_sum": "float32",
            "wind_speed_10m_max": "float32",
            "wind_direction_10m_dominant": "float32",
        }
    )
)
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city,sensor_id,latitude,longitude
0,2025-11-26,-0.70,0.0,1.484318,75.963730,Norra Hisingen,61714,57.750000,11.970000
1,2025-11-27,4.25,0.9,27.908709,186.666565,Norra Hisingen,61714,57.750000,11.970000
2,2025-11-28,8.85,0.0,23.166218,212.949219,Norra Hisingen,61714,57.750000,11.970000
3,2025-11-29,7.40,0.1,21.578989,207.847488,Norra Hisingen,61714,57.750000,11.970000
4,2025-11-30,4.85,0.0,13.708390,119.931427,Norra Hisingen,61714,57.750000,11.970000
...,...,...,...,...,...,...,...,...,...
2,2025-11-28,8.70,0.0,23.166218,212.949219,Centrum,81505,57.686704,11.974203
3,2025-11-29,7.25,0.1,21.578989,207.847488,Centrum,81505,57.686704,11.974203
4,2025-11-30,4.70,0.0,13.708390,119.931427,Centrum,81505,57.686704,11.974203
5,2025-12-01,6.05,0.2,21.734581,206.564987,Centrum,81505,57.686704,11.974203


In [10]:
# Write today's air quality rows (including the derived pm25 features) to the
# air_quality_all feature group.
air_quality_fg.insert(aq_df)
# Write the daily weather forecast rows to the weather_all feature group. As
# the cell's last expression, this call's return value is what gets displayed.
weather_fg.insert(weather_df)

2025-11-26 10:41:15,506 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972


Uploading Dataframe: 100.00% |██████████| Rows 24/24 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-11-26 10:41:29,952 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130


Uploading Dataframe: 100.00% |██████████| Rows 168/168 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: weather_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions


(Job('weather_all_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "precipitation_sum",
           "min_value": -0.1,
           "max_value": 1000.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 781318
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 168,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-26T09:41:29.000952Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_c