# 2. Feature Pipeline

## 2.1. Setup

In [None]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Return the closest directory at or above *start* holding ``pyproject.toml``.

    The search begins at *start* itself and then walks outward through its
    parents. When no directory on that chain contains the marker file,
    *start* is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Locate the project root, starting the search from the current working directory.
root_dir = find_project_root(Path.cwd())
print("Project root dir:", root_dir)

# Put the project root on the import path (once, even if this cell is re-run)
# so the project-local ``utils`` package imported below can be resolved.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata

# Load settings from the `.env` file located in the project root.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
# Both values are stored as secret wrappers; unwrap them to plain values here.
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks with the API key and grab the project's feature store handle,
# which later cells use to obtain the feature groups.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-09 16:01:22,969 INFO: Initializing external client
2026-01-09 16:01:22,969 INFO: Base URL: https://c.app.hopsworks.ai:443


Repository management

In [None]:
# Clone or update the repository for the configured GitHub user.
# NOTE(review): presumably returns the local checkout path — confirm in utils.hopsworks_admin.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
# Switch the process working directory to that location; relative paths used
# after this cell resolve against it.
os.chdir(repo_dir)

In [None]:
# Reference date for this run (local calendar date of the machine running the notebook).
today = date.today()

# Every air-quality fetch below needs the AQICN key, so stop early without it.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Refresh the copy of the key stored as a Hopsworks secret: remove any existing
# secret of that name, then create it again with the current value.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Removal stays best-effort (the secret may simply not exist yet), but the
    # reason is reported so that a later create_secret failure can be diagnosed.
    print(
        "Could not remove an existing AQICN_API_KEY secret "
        f"({type(exc).__name__}: {exc}); proceeding to create it."
    )

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

## 2.3. Get Feature Groups

In [None]:
# Obtain the three feature group handles used by the rest of the pipeline:
# air quality readings, sensor metadata, and weather.
# NOTE(review): presumably get-or-create semantics — confirm in utils.hopsworks_admin.
air_quality_fg, sensor_metadata_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.4. Load Metadata from Feature Group

In [None]:
# Pull the full sensor metadata table from its feature group.
metadata_df = sensor_metadata_fg.read()

# With zero metadata rows there is nothing to iterate over, so stop the run.
if metadata_df.shape[0] == 0:
    print("‚ö†Ô∏è No sensor metadata found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

print(f"üìç Loaded metadata for {len(metadata_df)} sensors")

# Key the table by sensor id so each row can be addressed by its sensor.
metadata_df = metadata_df.set_index("sensor_id")

## 2.5. Data Collection
Loop through all sensors to fetch today's air quality data and weather forecasts, and format the data to match the feature group schemas.

In [None]:
# Work on a copy of the metadata whose index (the sensor id) is cast to int.
# This copy is what the historical-data filter and the nearby-sensor feature
# helper receive further down.
metadata_indexed = metadata_df.copy()
metadata_indexed.index = metadata_indexed.index.astype(int)

# Outcome counters for the collection loop, all starting at zero.
successful = failed = skipped = 0

print(f"üîç Processing {len(metadata_df)} sensor locations...")

Load historical Air Quality data for all sensors

In [None]:
# History window: the four days before today (today itself is excluded below).
historical_start = today - timedelta(days=4)
try:
    historical_df = air_quality_fg.read()
    if historical_df.empty:
        historical_df = pd.DataFrame()
    else:
        # Drop timezone information so dates compare cleanly with naive timestamps.
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        # Keep rows inside [historical_start, today) and only the three columns
        # needed to rebuild the features.
        historical_df = historical_df.loc[
            lambda frame: (frame["date"] >= pd.to_datetime(historical_start))
            & (frame["date"] < today_dt),
            ["date", "sensor_id", "pm25"],
        ]
        # Only keep sensors that exist in metadata
        historical_df = historical_df.loc[
            lambda frame: frame["sensor_id"].isin(metadata_indexed.index)
        ]
except Exception as exc:
    # Best effort: without history the run continues on an empty frame.
    print(f"‚ö†Ô∏è Error reading historical data: {exc}")
    historical_df = pd.DataFrame()

Initialize containers for results

In [None]:
# Result containers filled by the collection loop:
#   aq_list      - one air-quality frame per successfully processed sensor
#   weather_dict - location_id -> weather_df (at most one entry per location)
aq_list, weather_dict = [], {}

Data collection

In [None]:
# Per-sensor collection: fetch today's PM2.5 reading, rebuild the lag / rolling /
# nearby-sensor features from recent history plus today, keep only today's rows,
# and fetch the weather forecast once per location.
#
# NOTE(review): the status glyphs in the messages below look mis-encoded
# (emoji bytes decoded with the wrong charset) — confirm the file's encoding.

# The forecast window end is identical for every location, so compute it once.
end_date = today + timedelta(days=7)

for sensor_id, meta in metadata_df.iterrows():
    try:
        # Fetch current air quality
        aq_today_df = fetchers.get_pm25(meta["aqicn_url"], meta["country"], meta["city"],
                                       meta["street"], today, AQICN_API_KEY)

        # No usable reading for today: count the sensor as skipped, not failed.
        if aq_today_df.empty or aq_today_df['pm25'].isna().all():
            skipped += 1
            continue

        # Normalise both ids to int once and reuse them everywhere below, so the
        # history lookup compares like with like even if the metadata index is
        # not an integer dtype.
        sensor_key = int(sensor_id)
        location_id = int(meta["location_id"])

        # Format air quality data
        aq_today_df["sensor_id"] = sensor_key
        aq_today_df["location_id"] = location_id
        aq_today_df["pm25"] = pd.to_numeric(aq_today_df["pm25"], errors="coerce")
        aq_today_df["date"] = pd.to_datetime(aq_today_df["date"]).dt.tz_localize(None)
        aq_today_df = aq_today_df.drop(columns=["url", "country", "city", "street"], errors="ignore")

        # Combine with historical data (last 4 days)
        if historical_df.empty:
            sensor_historical = pd.DataFrame()
        else:
            sensor_historical = historical_df[historical_df["sensor_id"] == sensor_key]
        if sensor_historical.empty:
            combined = aq_today_df
        else:
            combined = pd.concat([sensor_historical, aq_today_df], ignore_index=True)
        combined = combined.sort_values("date").reset_index(drop=True)

        # Add features using ALL data (historical + today)
        combined = feature_engineering.add_rolling_window_feature(combined, window_days=3, column="pm25", new_column="pm25_rolling_3d")
        combined = feature_engineering.add_lagged_features(combined, column="pm25", lags=[1, 2, 3])
        combined = feature_engineering.add_nearby_sensor_feature(combined, metadata_indexed, n_closest=3)

        # Only today's rows are inserted; the features above were computed
        # while the historical rows were still present.
        aq_final = combined[combined["date"].dt.date == today].copy()

        # Check if we actually have today's data with valid pm25
        if aq_final.empty or aq_final['pm25'].isna().all():
            skipped += 1
            continue

        aq_list.append(aq_final)

        # Fetch weather for location (once per location)
        if location_id not in weather_dict:
            weather_df = fetchers.get_weather_forecast(location_id, today, end_date,
                                                      meta["latitude"], meta["longitude"])
            if not weather_df.empty:
                weather_df["location_id"] = location_id
                weather_df["date"] = pd.to_datetime(weather_df["date"])
                # Drop forecast rows missing any of these three measurements.
                weather_df = weather_df.dropna(subset=['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max'])
                weather_dict[location_id] = weather_df

        successful += 1
        if successful % 10 == 0:
            print(f"‚úÖ Processed {successful}/{len(metadata_df)} sensors")

    except Exception as e:
        # One failing sensor must not abort the whole run. Only the exception
        # type is printed.
        # NOTE(review): presumably so request URLs (which may carry the API
        # token) are not echoed into the notebook output — confirm.
        failed += 1
        print(f"‚ùå Sensor {sensor_id}: {type(e).__name__}")

Batch insert Air Quality

In [None]:
# Insert all collected air-quality rows in a single batch (skipped when empty).
if aq_list:
    all_aq = pd.concat(aq_list, ignore_index=True)

    # Cast the id columns to int32 and every pm25-derived column to float64.
    all_aq = all_aq.astype({
        "sensor_id": "int32",
        "location_id": "int32",
        **dict.fromkeys(
            (
                "pm25",
                "pm25_lag_1d",
                "pm25_lag_2d",
                "pm25_lag_3d",
                "pm25_rolling_3d",
                "pm25_nearby_avg",
            ),
            "float64",
        ),
    })

    # Select the columns in the order the feature group declares its features;
    # any column not declared there is dropped by this selection.
    fg_columns = [feature.name for feature in air_quality_fg.features]
    all_aq = all_aq[fg_columns]

    air_quality_fg.insert(all_aq)
    print(f"üìä Inserted {len(all_aq)} air quality records")

Batch insert Weather

In [None]:
# Insert the forecasts of all locations in a single batch (skipped when empty).
if weather_dict:
    all_weather = pd.concat(list(weather_dict.values()), ignore_index=True)

    # Cast the location id to int32 and the weather measurements to float64.
    all_weather = all_weather.astype({
        "location_id": "int32",
        **dict.fromkeys(
            (
                "temperature_2m_mean",
                "precipitation_sum",
                "wind_speed_10m_max",
                "wind_direction_10m_dominant",
            ),
            "float64",
        ),
    })

    weather_fg.insert(all_weather)
    print(f"üå§Ô∏è Inserted {len(all_weather)} weather records for {len(weather_dict)} locations")

Print summary

In [None]:
# Report how many sensors ended in each outcome of the collection loop
# (the three counters are incremented there).
print(f"\nüìä Summary: ‚úÖ {successful} successful, ‚è≠Ô∏è {skipped} skipped, ‚ùå {failed} failed")

## 2.6. Inspect Inserted Data

In [None]:
# Show what was inserted. Each frame only exists if its insert cell actually
# ran its body, hence the name lookup before touching the variable.
if 'all_aq' in locals() and not all_aq.empty:
    print(f"‚úÖ Air quality records inserted: {len(all_aq)}")
    # Each heading is followed by its value on the next line.
    print("\nüìã Sample air quality data:", all_aq.head(), sep="\n")
    print("\nüîß Air quality data types:", all_aq.dtypes, sep="\n")
    print(
        "\nüìÖ Date range:",
        f"From {all_aq['date'].min()} to {all_aq['date'].max()}",
        sep="\n",
    )

if 'all_weather' in locals() and not all_weather.empty:
    print(f"\nüå§Ô∏è Weather records inserted: {len(all_weather)}")
    print("\nüìã Sample weather data:", all_weather.head(), sep="\n")
    print("\nüîß Weather data types:", all_weather.dtypes, sep="\n")
    print("\nüìÖ Unique weather dates:", all_weather['date'].unique(), sep="\n")