# 2. Feature Pipeline

## 2.1. Setup

In [1]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Return the nearest directory, from ``start`` upward, holding a ``pyproject.toml``.

    ``start`` itself is examined first, followed by each of its ancestors in
    order. When no directory on that chain contains the marker file, ``start``
    is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if folder.joinpath("pyproject.toml").exists()),
        start,
    )

# Resolve the project root starting from the current working directory and report it.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

# Add the project root to sys.path once (re-running the cell does not duplicate the entry).
# Presumably this is what lets the project-level `utils` imports below resolve.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata

# Load settings from the .env file located in the project root
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
# Both settings are wrapped secret values; unwrap them for the calls below
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks with the API key and get a handle to the project's feature store
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-09 15:05:51,279 INFO: Initializing external client
2026-01-09 15:05:51,280 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-09 15:05:53,364 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


## 2.2. Repository Management

In [2]:
# Get the repository directory for the configured GitHub user (the clone/update
# logic lives in utils.hopsworks_admin) and make it the working directory.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

Already in repo at c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn


In [3]:
# Reference date for this pipeline run; later cells fetch and filter data for this day.
today = date.today()

# The AQICN key is needed by the air quality fetcher, so stop early when it is not configured.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Mirror the key into the Hopsworks secret store. An existing secret with the
# same name is deleted first so that the create call below stores the current value.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Still best effort (the run continues), but report the reason instead of
    # hiding it so that a subsequent create_secret failure can be diagnosed.
    print(f"Could not remove existing AQICN_API_KEY secret ({type(exc).__name__}: {exc}); creating it anyway.")

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

## 2.3. Get Feature Groups

In [4]:
# Obtain handles to the air quality, sensor metadata and weather feature groups.
# NOTE(review): despite the "create" name this presumably returns the existing
# groups when they are already present — confirm in utils.hopsworks_admin.
air_quality_fg, sensor_metadata_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.4. Load Metadata from Feature Group

In [5]:
# Pull the complete sensor metadata table from its feature group.
metadata_df = sensor_metadata_fg.read()

# Without any metadata rows there is nothing to process, so end the run here.
if metadata_df.shape[0] == 0:
    print("‚ö†Ô∏è No sensor metadata found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

print(f"üìç Loaded metadata for {len(metadata_df)} sensors")

# Key the table by sensor id; later cells iterate over it as (sensor_id, row) pairs.
metadata_df = metadata_df.set_index(keys="sensor_id")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.76s) 
üìç Loaded metadata for 103 sensors


## 2.5. Data Collection
Loop through all sensors to fetch today's air quality data and weather forecasts, then format the data to match the feature group schemas.

In [None]:
# Data collection for today's run:
#   1. read the recent PM2.5 history once,
#   2. per sensor: fetch today's reading, append it to that sensor's history,
#      derive the rolling / lag / nearby features and keep only today's row,
#   3. fetch the weather forecast once per location,
#   4. batch-insert the collected rows into the two feature groups.

# Copy of the metadata with an integer index, used for nearby sensor calculations
metadata_indexed = metadata_df.copy()
metadata_indexed.index = metadata_indexed.index.astype(int)

successful = 0  # sensors whose air quality row was collected
failed = 0      # sensors whose air quality processing raised
skipped = 0     # sensors without a usable PM2.5 value for today

print(f"üîç Processing {len(metadata_df)} sensor locations...")

# Get historical data once for all sensors: the window [today - 4 days, today)
historical_start = today - timedelta(days=4)
try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_df = historical_df[
            (historical_df["date"] >= pd.to_datetime(historical_start)) &
            (historical_df["date"] < today_dt)
        ][["date", "sensor_id", "pm25"]]
        # Only keep sensors that exist in metadata
        historical_df = historical_df[historical_df["sensor_id"].isin(metadata_indexed.index)]
    else:
        historical_df = pd.DataFrame()
except Exception as e:
    # Without history the run still proceeds; only today's rows are then
    # available to the feature engineering helpers.
    print(f"‚ö†Ô∏è Error reading historical data: {e}")
    historical_df = pd.DataFrame()

# Collect all air quality and weather data
aq_list = []
weather_dict = {}  # location_id -> weather_df

for sensor_id, meta in metadata_df.iterrows():
    try:
        # The metadata index is not guaranteed to be int (hence the cast for
        # metadata_indexed above), so normalise the id once and use the same
        # value both for the new row and for the history lookup.
        sensor_key = int(sensor_id)

        # Fetch current air quality
        aq_today_df = fetchers.get_pm25(meta["aqicn_url"], meta["country"], meta["city"],
                                       meta["street"], today, AQICN_API_KEY)

        if aq_today_df.empty or aq_today_df['pm25'].isna().all():
            skipped += 1
            continue

        location_id = int(meta["location_id"])

        # Format air quality data
        aq_today_df["sensor_id"] = sensor_key
        aq_today_df["location_id"] = location_id
        aq_today_df["pm25"] = pd.to_numeric(aq_today_df["pm25"], errors="coerce")
        aq_today_df["date"] = pd.to_datetime(aq_today_df["date"]).dt.tz_localize(None)
        aq_today_df = aq_today_df.drop(columns=["url", "country", "city", "street"], errors="ignore")

        # Combine with this sensor's historical rows (if any)
        if historical_df.empty:
            sensor_historical = pd.DataFrame()
        else:
            sensor_historical = historical_df[historical_df["sensor_id"] == sensor_key]
        combined = pd.concat([sensor_historical, aq_today_df], ignore_index=True) if not sensor_historical.empty else aq_today_df
        combined = combined.sort_values("date").reset_index(drop=True)

        # Add features
        combined = feature_engineering.add_rolling_window_feature(combined, window_days=3, column="pm25", new_column="pm25_rolling_3d")
        combined = feature_engineering.add_lagged_features(combined, column="pm25", lags=[1, 2, 3])
        # NOTE(review): `combined` only holds this sensor's rows at this point;
        # confirm add_nearby_sensor_feature can derive neighbour values from
        # this frame plus the metadata alone.
        combined = feature_engineering.add_nearby_sensor_feature(combined, metadata_indexed, n_closest=3)

        # Keep only today's data
        aq_final = combined[combined["date"].dt.date == today].copy()
        aq_final = aq_final.dropna(subset=['pm25'])

        if aq_final.empty:
            skipped += 1
            continue

        aq_list.append(aq_final)

        # Count the sensor as soon as its row is collected, so the summary
        # matches what is actually inserted below.
        successful += 1
        if successful % 10 == 0:
            print(f"‚úÖ Processed {successful}/{len(metadata_df)} sensors")

    except Exception as e:
        failed += 1
        # Include the message, not just the exception type, so failures can be diagnosed
        print(f"‚ùå Sensor {sensor_id}: {type(e).__name__}: {e}")
        continue

    # Fetch weather for this location (once per location). Handled separately
    # from the air quality processing: a weather failure must not mark a sensor
    # as failed when its air quality row has already been collected.
    if location_id not in weather_dict:
        try:
            end_date = today + timedelta(days=7)
            weather_df = fetchers.get_weather_forecast(location_id, today, end_date,
                                                      meta["latitude"], meta["longitude"])
            if not weather_df.empty:
                weather_df["location_id"] = location_id
                weather_df["date"] = pd.to_datetime(weather_df["date"])
                weather_df = weather_df.dropna(subset=['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max'])
                weather_dict[location_id] = weather_df
        except Exception as e:
            # The location stays out of weather_dict, so another sensor at the
            # same location triggers a new attempt.
            print(f"Weather forecast for location {location_id} failed: {type(e).__name__}: {e}")

# Batch insert all air quality data
if aq_list:
    all_aq = pd.concat(aq_list, ignore_index=True)

    # Convert types
    all_aq = all_aq.astype({
        "sensor_id": "int32",
        "location_id": "int32",
        "pm25": "float64",
        "pm25_lag_1d": "float64",
        "pm25_lag_2d": "float64",
        "pm25_lag_3d": "float64",
        "pm25_rolling_3d": "float64",
        "pm25_nearby_avg": "float64",
    })

    # Ensure correct column order (the order of the feature group's features)
    fg_columns = [f.name for f in air_quality_fg.features]
    all_aq = all_aq[fg_columns]

    air_quality_fg.insert(all_aq)
    print(f"üìä Inserted {len(all_aq)} air quality records")

# Batch insert all weather data
if weather_dict:
    all_weather = pd.concat(weather_dict.values(), ignore_index=True)

    # Convert types
    all_weather = all_weather.astype({
        "location_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })

    weather_fg.insert(all_weather)
    print(f"üå§Ô∏è Inserted {len(all_weather)} weather records for {len(weather_dict)} locations")

print(f"\nüìä Summary: ‚úÖ {successful} successful, ‚è≠Ô∏è {skipped} skipped, ‚ùå {failed} failed")

üîç Processing 103 sensor locations...
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (12.23s) 
‚úÖ Processed 10/103 sensors
‚úÖ Processed 20/103 sensors
‚úÖ Processed 30/103 sensors
‚úÖ Processed 40/103 sensors
‚úÖ Processed 50/103 sensors
‚úÖ Processed 60/103 sensors
‚úÖ Processed 70/103 sensors
‚úÖ Processed 80/103 sensors
‚úÖ Processed 90/103 sensors
‚úÖ Processed 100/103 sensors
2026-01-09 15:10:27,707 INFO: 	7 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1911228


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_1_offline_fg_materialization/executions
üìä Inserted 103 air quality records
2026-01-09 15:11:17,678 INFO: 	4 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1893863


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 816/816 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_1_offline_fg_materialization/executions
üå§Ô∏è Inserted 816 weather records for 102 locations

üìä Summary: ‚úÖ 103 successful, ‚è≠Ô∏è 0 skipped, ‚ùå 0 failed


In [7]:
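# NOTE: Legacy per-sensor implementation (one insert per sensor, with retries), kept for reference only.
# The batch cell in section 2.5 above is the active version.
# # Load metadata from feature group for nearby sensor calculations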
# metadata_indexed = metadata_df.copy()
# # Ensure index is int type to match sensor_id values in data
# metadata_indexed.index = metadata_indexed.index.astype(int)

# successful_sensors = 0
# failed_sensors = 0
# skipped_sensors = 0
# location_weather_uploaded = set()  # Track which location weather we've already uploaded

# print(f"üîç Processing {len(metadata_df)} sensor locations...")

# # Get historical data once for all sensors (for rolling/lag features)
# historical_start = today - timedelta(days=4)
# try:
#     # Read all data (Python env doesn't support column selection)
#     historical_df = air_quality_fg.read()
#     if not historical_df.empty:
#         historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
#         # Convert today to datetime for comparison
#         today_dt = pd.to_datetime(today)
#         historical_start_dt = pd.to_datetime(historical_start)
#         # Filter in pandas instead
#         historical_df = historical_df[
#             (historical_df["date"] >= historical_start_dt) & (historical_df["date"] < today_dt)
#         ][["date", "sensor_id", "pm25"]]
#         # IMPORTANT: Only keep historical data for sensors that exist in metadata
#         existing_sensor_ids = metadata_indexed.index.tolist()
#         historical_df = historical_df[historical_df["sensor_id"].isin(existing_sensor_ids)]
#     else:
#         historical_df = pd.DataFrame()
# except Exception as e:
#     print(f"‚ö†Ô∏è Error reading historical data: {e}")
#     historical_df = pd.DataFrame()

# for sensor_id, meta in metadata_df.iterrows():
#     max_retries = 3
    
#     for attempt in range(max_retries):
#         try:
#             # Fetch current air quality
#             aq_today_df = fetchers.get_pm25(meta["aqicn_url"], meta["country"], meta["city"], 
#                                            meta["street"], today, AQICN_API_KEY)
            
#             # Check if we got data
#             if aq_today_df.empty or aq_today_df['pm25'].isna().all():
#                 print(f"‚è≠Ô∏è Sensor {sensor_id}: No AQ data available")
#                 skipped_sensors += 1
#                 break
            
#             # Format air quality data
#             aq_today_df = aq_today_df.assign(
#                 sensor_id=int(sensor_id),
#                 location_id=int(meta["location_id"])
#             )
#             aq_today_df["pm25"] = pd.to_numeric(aq_today_df["pm25"], errors="coerce").astype("float64")
#             aq_today_df["date"] = pd.to_datetime(aq_today_df["date"]).dt.tz_localize(None)
#             aq_today_df = aq_today_df.drop(columns=["url", "country", "city", "street"], errors="ignore")
            
#             # Combine with historical data for this sensor
#             sensor_historical = historical_df[historical_df["sensor_id"] == sensor_id] if not historical_df.empty else pd.DataFrame()
#             combined = pd.concat([sensor_historical, aq_today_df], ignore_index=True) if not sensor_historical.empty else aq_today_df
            
#             # Sort by date for proper lag/rolling calculations
#             combined = combined.sort_values("date").reset_index(drop=True)
            
#             # Add features
#             combined = feature_engineering.add_rolling_window_feature(combined, window_days=3, column="pm25", new_column="pm25_rolling_3d")
#             combined = feature_engineering.add_lagged_features(combined, column="pm25", lags=[1, 2, 3])
            
#             # Add nearby sensor feature - only for this sensor's data
#             combined = feature_engineering.add_nearby_sensor_feature(combined, metadata_indexed, n_closest=3)
            
#             # Keep only today's data
#             today_dt = pd.to_datetime(today)
#             aq_final = combined[combined["date"].dt.date == today].copy()
#             aq_final = aq_final.dropna(subset=['pm25'])
            
#             if aq_final.empty:
#                 print(f"‚è≠Ô∏è Sensor {sensor_id}: No valid data after processing")
#                 skipped_sensors += 1
#                 break
            
#             # Ensure correct data types and column order
#             aq_final["sensor_id"] = aq_final["sensor_id"].astype("int32")
#             aq_final["location_id"] = aq_final["location_id"].astype("int32")
            
#             # Get expected columns from feature group
#             fg_columns = [f.name for f in air_quality_fg.features]
#             aq_final = aq_final[fg_columns]
            
#             # Insert air quality data immediately
#             air_quality_fg.insert(aq_final)
            
#             # Fetch and upload weather for this location (if not already done)
#             location_id = int(meta["location_id"])
#             if location_id not in location_weather_uploaded:
#                 end_date = today + timedelta(days=7)
#                 weather_df = fetchers.get_weather_forecast(location_id, today, end_date, 
#                                                           meta["latitude"], meta["longitude"])
                
#                 if not weather_df.empty:
#                     weather_df["location_id"] = int(location_id)
#                     weather_df["date"] = pd.to_datetime(weather_df["date"])
#                     weather_df = weather_df.dropna(subset=['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max'])
                    
#                     # Convert to correct types to match schema
#                     weather_df["location_id"] = weather_df["location_id"].astype("int32")
#                     weather_df["temperature_2m_mean"] = weather_df["temperature_2m_mean"].astype("float64")
#                     weather_df["precipitation_sum"] = weather_df["precipitation_sum"].astype("float64")
#                     weather_df["wind_speed_10m_max"] = weather_df["wind_speed_10m_max"].astype("float64")
#                     weather_df["wind_direction_10m_dominant"] = weather_df["wind_direction_10m_dominant"].astype("float64")
                    
#                     weather_fg.insert(weather_df)
#                     location_weather_uploaded.add(location_id)
            
#             successful_sensors += 1
#             print(f"‚úÖ Uploaded sensor {sensor_id} (location {location_id})")
#             break  # Success, exit retry loop
            
#         except requests.exceptions.Timeout as e:
#             if attempt < max_retries - 1:
#                 wait_time = (attempt + 1) * 5
#                 print(f"‚ö†Ô∏è  Sensor {sensor_id}: Timeout, retrying in {wait_time}s... (attempt {attempt + 1}/{max_retries})")
#                 time.sleep(wait_time)
#             else:
#                 failed_sensors += 1
#                 print(f"‚ùå Sensor {sensor_id}: Failed after {max_retries} timeout attempts")
#                 break
                
#         except requests.exceptions.RequestException as e:
#             if attempt < max_retries - 1:
#                 wait_time = (attempt + 1) * 5
#                 print(f"‚ö†Ô∏è  Sensor {sensor_id}: {type(e).__name__}, retrying in {wait_time}s... (attempt {attempt + 1}/{max_retries})")
#                 time.sleep(wait_time)
#             else:
#                 failed_sensors += 1
#                 print(f"‚ùå Sensor {sensor_id}: Failed after {max_retries} attempts - {type(e).__name__}")
#                 break
                
#         except Exception as e:
#             failed_sensors += 1
#             print(f"‚ùå Sensor {sensor_id}: Unexpected error - {type(e).__name__}: {str(e)[:100]}")
#             import traceback
#             print(f"   {traceback.format_exc()[:300]}")
#             break
    
#     # Brief pause between sensors
#     time.sleep(0.5)

# print(f"\nüìä Collection Summary:")
# print(f"   ‚úÖ Successful: {successful_sensors}")
# print(f"   ‚è≠Ô∏è Skipped (no data): {skipped_sensors}")
# print(f"   ‚ùå Failed: {failed_sensors}")
# print(f"   üå§Ô∏è Weather locations uploaded: {len(location_weather_uploaded)}")

In [8]:
# Stack the per-sensor frames collected by the data collection cell into one frame.
# pd.concat raises on an empty list, so fall back to an empty frame when nothing
# was collected. ignore_index gives the result unique row labels (each per-sensor
# frame carries its own, overlapping, index).
aq_df = pd.concat(aq_list, ignore_index=True) if aq_list else pd.DataFrame()
if not aq_df.empty:
    # Normalise dtypes: numeric PM2.5 (unparseable values become NaN) and
    # timezone-naive dates; drop the source URL column when present.
    aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce").astype("float64")
    aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
    aq_df = aq_df.drop(columns=["url"], errors="ignore")

In [9]:
# Stack the per-sensor frames collected earlier. ignore_index gives the result
# unique row labels; the per-sensor frames each carry their own, overlapping, index.
aq_df = pd.concat(aq_list, ignore_index=True) if aq_list else pd.DataFrame()
if not aq_df.empty:
    aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce").astype("float64")
    aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
    aq_df = aq_df.drop(columns=["url"], errors="ignore")

    # Data quality check 1: Remove rows with missing PM2.5 values
    initial_count = len(aq_df)
    aq_df = aq_df.dropna(subset=['pm25'])
    if len(aq_df) < initial_count:
        print(f"üßπ Removed {initial_count - len(aq_df)} rows with missing PM2.5 values")

# Get historical data for rolling window and lagged features
historical_start = today - timedelta(days=4)
historical_df = pd.DataFrame()

# Read historical data from feature group and filter for the last 4 days.
# The whole group is read and then trimmed in pandas.
try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        # Select the needed columns only once rows are known to exist, so an
        # empty read is not reported as an error; .copy() makes the column
        # assignment below operate on an independent frame.
        historical_df = historical_df[["date", "sensor_id", "pm25"]].copy()
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_start_dt = pd.to_datetime(historical_start)
        # Keep the half-open window [historical_start, today)
        historical_df = historical_df[
            (historical_df["date"] >= historical_start_dt) & (historical_df["date"] < today_dt)
        ]
except Exception as e:
    print(f"Error reading historical data: {e}")
    historical_df = pd.DataFrame()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.87s) 


In [None]:
# dtype_map = {
#     "sensor_id": "int32",
#     "location_id": "int32",
#     "date": "datetime64[ns]",
#     "pm25": "float64",
#     "pm25_lag_1d": "float64",
#     "pm25_lag_2d": "float64",
#     "pm25_lag_3d": "float64",
#     "pm25_rolling_3d": "float64",
#     "pm25_nearby_avg": "float64",
# }

# # Cast both dataframes
# for col, dtype in dtype_map.items():
#     if col in historical_df.columns:
#         historical_df[col] = historical_df[col].astype(dtype, errors="ignore")
#     if col in aq_df.columns:
#         aq_df[col] = aq_df[col].astype(dtype, errors="ignore")


# combined_df = pd.concat([historical_df, aq_df], ignore_index=True)
# combined_df = combined_df.reset_index(drop=True)
# if not combined_df.empty:
#     combined_df = feature_engineering.add_rolling_window_feature(combined_df, window_days=3, column="pm25", new_column="pm25_rolling_3d")
#     combined_df = feature_engineering.add_lagged_features(combined_df, column="pm25", lags=[1, 2, 3])
#     combined_df = feature_engineering.add_nearby_sensor_feature(combined_df, metadata_df.to_dict('index'), column="pm25_lag_1d", n_closest=3)
    
#     # Data quality check 2: Clean up NaNs created by feature engineering
#     before_cleaning = len(combined_df[combined_df["date"].dt.date == today])
    
#     # Only keep today's data and remove rows where essential features are NaN
#     aq_df = combined_df[combined_df["date"].dt.date == today].copy()
    
#     # Remove rows where pm25 is still NaN after all processing
#     aq_df = aq_df.dropna(subset=['pm25'])
    
#     after_cleaning = len(aq_df)
#     if before_cleaning > after_cleaning:
#         print(f"üßπ Removed {before_cleaning - after_cleaning} rows with NaN values after feature engineering")
    
#     print(f"üìä Final data quality: {len(aq_df)} clean rows ready for feature store")
# else:
#     aq_df = pd.DataFrame()
#     print("‚ö†Ô∏è  No data available for processing")
# aq_df.head()

ValueError: cannot reindex on an axis with duplicate labels

In [None]:
# Build one weather frame from the per-location forecasts gathered by the data
# collection cell (weather_dict: location_id -> forecast frame). The previous
# name `weathers` is not defined anywhere in this notebook.
weather_df = pd.concat(weather_dict.values(), ignore_index=True) if weather_dict else pd.DataFrame()
if not weather_df.empty:
    weather_df["date"] = pd.to_datetime(weather_df["date"])

    # Data quality check 3: Remove rows with missing weather data
    initial_weather_count = len(weather_df)
    weather_df = weather_df.dropna(subset=['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max'])

    # Use the same dtypes as the batch insert in the data collection cell
    # (int32 location id, float64 measurements).
    # NOTE(review): this cell previously cast to float32 "to match the schema";
    # confirm float64 against the weather feature group definition.
    weather_df = weather_df.astype({
        "location_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })

    if len(weather_df) < initial_weather_count:
        print(f"üßπ Removed {initial_weather_count - len(weather_df)} rows with missing weather data")

    print(f"üå§Ô∏è  Weather data quality: {len(weather_df)} clean weather rows")
else:
    print("‚ö†Ô∏è  No weather data available")
weather_df.head()

In [None]:
# Debug aid: list the distinct dates present in the weather frame.
print(weather_df['date'].unique())

In [None]:
# Write both frames to the feature store, but only when each of them has rows;
# otherwise report which frame is empty and insert nothing.
# NOTE(review): the data collection cell earlier in this notebook already calls
# air_quality_fg.insert and weather_fg.insert — confirm a second insert is intended.
if aq_df.empty or weather_df.empty:
    if aq_df.empty:
        print("‚ö†Ô∏è  No clean air quality data to insert")
    if weather_df.empty:
        print("‚ö†Ô∏è  No clean weather data to insert")
else:
    print(f"‚úÖ Inserting {len(aq_df)} air quality rows and {len(weather_df)} weather rows to feature store")
    air_quality_fg.insert(aq_df)
    weather_fg.insert(weather_df)
    print("üìÅ Data successfully inserted to feature store")

In [None]:
# Debug aid: same distinct-date listing as the earlier cell, repeated after the insert cell.
print(weather_df['date'].unique())

In [None]:
# Debug aid: show the column dtypes of the weather frame.
print(weather_df.dtypes)

In [None]:
# Debug aid: show the column dtypes of the air quality frame.
print(aq_df.dtypes)

In [None]:
# Debug aid: show the dtypes returned by the air quality feature group.
# Note that this reads the whole feature group just to inspect its dtypes.
print(air_quality_fg.read().dtypes)