# 2. Feature Pipeline

## 2.1. Setup

### 2.1.1. Import Libraries

In [1]:
# Standard imports
import os
from pathlib import Path
import sys
import json
import time
from datetime import date, datetime, timedelta
import warnings

warnings.filterwarnings("ignore", module="IPython")
warnings.filterwarnings("ignore", category=DeprecationWarning)

#  Establish project root directory
def find_project_root(start: Path):
    """Return the closest directory at or above ``start`` holding a ``pyproject.toml``.

    ``start`` itself is checked first, then each of its parents from nearest
    to farthest. When no candidate contains the marker file, ``start`` is
    returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the project root starting from the current working directory.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

# Append the root to sys.path (once) so the `utils` package imported further
# down in this cell can be resolved.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import numpy as np
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

today = datetime.today().date()

Project root dir: c:\Users\krist\Documents\GitHub\pm25


### 2.1.2. Load Settings and Initialize Hopsworks Connection

In [2]:
# ---------------------------------------------------------
# 1. Detect environment (local, Jupyter, or Hopsworks Job)
# ---------------------------------------------------------
RUNNING_IN_HOPSWORKS_JOB = "HOPSWORKS_JOB_ID" in os.environ

if not RUNNING_IN_HOPSWORKS_JOB:
    # Local run or Hopsworks Jupyter: variables come from a .env file.
    from dotenv import load_dotenv
    load_dotenv()
else:
    # Hopsworks Job: copy the required secrets into the process environment
    # so the settings object below can read them.
    project = hopsworks.login()
    secrets_api = hopsworks.get_secrets_api()

    for secret_name in ("HOPSWORKS_API_KEY", "AQICN_API_KEY", "GH_PAT", "GH_USERNAME"):
        os.environ[secret_name] = secrets_api.get_secret(secret_name).value

# ---------------------------------------------------------
# 2. Load Pydantic settings (now environment is ready)
# ---------------------------------------------------------
settings = config.HopsworksSettings()

HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# ---------------------------------------------------------
# 3. Login to Hopsworks using the API key
# ---------------------------------------------------------
# This login replaces the `project` handle obtained in the job branch above.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

HopsworksSettings initialized!
2026-01-26 09:47:35,021 INFO: Initializing external client
2026-01-26 09:47:35,022 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-26 09:47:37,158 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


### 2.1.3. Repository management

In [3]:
# Obtain the local path of the project repository (presumably cloned or
# refreshed by the helper - confirm in utils.hopsworks_admin) and switch the
# process working directory to it. Every later relative path in this notebook
# is therefore resolved against repo_dir.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

Repository exists at c:\Users\krist\Documents\GitHub\pm25\notebooks\pm25-forecast-openmeteo-aqicn


### 2.1.4. Configure API Keys and Secrets

In [4]:
secrets = hopsworks.get_secrets_api()

# Make sure the AQICN key exists as a Hopsworks secret.
# A missing secret may surface either as an exception or as a `None` return
# (presumably depending on the client version - confirm against the installed
# hopsworks release), so both outcomes lead to creation.
try:
    aqicn_secret = secrets.get_secret("AQICN_API_KEY")
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed while the lookup is in flight.
    aqicn_secret = None

if aqicn_secret is None:
    secrets.create_secret("AQICN_API_KEY", settings.AQICN_API_KEY.get_secret_value())

## 2.2. Get Feature Groups

In [5]:
# Handles to the two feature groups used throughout the notebook. The helper
# returns them as an (air quality, weather) pair; whether it creates them or
# only fetches existing ones is decided inside utils.hopsworks_admin.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.3. Load Sensor Locations from Feature Group

In [6]:
# Load the air_quality feature group ONCE. The same frame serves both the
# emptiness check and the sensor-location lookup; reading it a second time only
# repeats the round trip to the feature store.
aq_data = air_quality_fg.read()

if len(aq_data) == 0:
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

# Alias retained so any code that refers to `existing_aq_data` keeps working.
existing_aq_data = aq_data
existing_sensors = set(existing_aq_data["sensor_id"].unique())
print(f"üìã Found {len(existing_sensors)} sensors in feature store")

# Build the sensor location dictionary:
#   sensor_id -> {"latitude", "longitude", "city", "street", "country", "aqicn_url"}
# Only the first row seen for each sensor_id contributes its metadata.
location_fields = ["latitude", "longitude", "city", "street", "country", "aqicn_url"]
sensor_locations = {}

for _, row in existing_aq_data[
    ["sensor_id", *location_fields]
].drop_duplicates(subset=["sensor_id"]).iterrows():
    sensor_locations[row["sensor_id"]] = {field: row[field] for field in location_fields}

print(f"üìç Loaded locations for {len(sensor_locations)} existing sensors")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.15s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.14s) 
üìã Found 103 sensors in feature store
üìç Loaded locations for 103 existing sensors


## 2.4. Data Collection
Fetch today's air quality data and weather forecasts, format data to match feature group schemas.

### 2.4.1. Load Historical Air Quality Data (Last 4 Days)

In [7]:
print(f"üîç Processing {len(sensor_locations)} sensor locations.")
historical_start = today - timedelta(days=4)

# Read the feature group and keep only (date, sensor_id, pm25) rows that fall
# inside [historical_start, today] and belong to a known sensor. Any failure
# while reading or filtering degrades to an empty frame.
try:
    historical_df = air_quality_fg.read()
    if historical_df.empty:
        historical_df = pd.DataFrame()
    else:
        # Strip timezone info so the column compares against naive bounds.
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_start_dt = pd.to_datetime(historical_start)

        # `between` is inclusive on both ends by default.
        in_window = historical_df["date"].between(historical_start_dt, today_dt)
        historical_df = historical_df.loc[in_window, ["date", "sensor_id", "pm25"]]

        known_sensor = historical_df["sensor_id"].isin(list(sensor_locations))
        historical_df = historical_df[known_sensor]
except Exception as e:
    print(f"‚ö†Ô∏è Error reading historical data: {e}")
    historical_df = pd.DataFrame()

üîç Processing 103 sensor locations.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.18s) 


### 2.4.2. Identify Missing Dates for Backfill

In [8]:
# Calendar dates that already have at least one row in the feature group.
existing_dates = air_quality_fg.read()["date"].dt.date.unique()
known_dates = set(existing_dates)

today = datetime.today().date()
start_date = today - timedelta(days=7)  # Check last 7 days for missing data

# Every day in [start_date, today] that has no row yet needs to be backfilled.
expected_dates = pd.date_range(start=start_date, end=today, freq="D").date
missing_dates = [day for day in expected_dates if day not in known_dates]

formatted = ", ".join(day.isoformat() for day in missing_dates)
print(f"üìÖ Missing dates to backfill: {formatted}")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.75s) 
üìÖ Missing dates to backfill: 2026-01-19, 2026-01-20, 2026-01-21, 2026-01-22, 2026-01-23, 2026-01-24, 2026-01-25, 2026-01-26


### 2.4.3. Preparations

In [9]:
# Prepare historical data window
historical_cutoff = pd.to_datetime(min(missing_dates)) - pd.Timedelta(days=3)
historical = air_quality_fg.read()
historical["date"] = pd.to_datetime(historical["date"]).dt.tz_localize(None)
historical = historical [historical["date"] >= historical_cutoff]

# Track existing sensor-date pairs
existing = historical[["sensor_id", "date"]].copy()
existing["date_only"] = existing["date"].dt.date
existing_keys = set(zip(existing["sensor_id"], existing["date_only"]))

# Initialize data containers
all_aq_rows = [historical]
all_weather_rows = []

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (8.26s) 


### 2.4.4. Fetch Missing Air Quality Data

In [10]:
total = len(sensor_locations)
metadata_fields = ("city", "street", "country", "aqicn_url", "latitude", "longitude")

# NOTE(review): the recorded run of this notebook shows the same pm25 value for
# a sensor on every backfilled day - confirm that fetchers.get_pm25 returns the
# reading for `day` rather than the latest available reading.
for position, (sensor_id, meta) in enumerate(sensor_locations.items(), start=1):
    print(f"Fetching air quality for sensor {sensor_id}, {position}/{total}")

    for day in missing_dates:
        # Sensor/day pairs already present in the historical window are skipped.
        if (sensor_id, day) in existing_keys:
            continue

        try:
            aq_df = fetchers.get_pm25(
                meta["aqicn_url"],
                meta["country"],
                meta["city"],
                meta["street"],
                day,
                AQICN_API_KEY,
            )

            # Nothing usable came back for this day.
            if aq_df.empty or aq_df["pm25"].isna().all():
                continue

            aq_df["sensor_id"] = int(sensor_id)
            aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce")
            aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.normalize()

            # Attach the sensor metadata to every fetched row.
            for field in metadata_fields:
                aq_df[field] = meta[field]

            aq_df = aq_df.drop(columns=["url"], errors="ignore")
            all_aq_rows.append(aq_df)

        except Exception as e:
            # Best effort: one failing sensor/day must not abort the whole run.
            print(f"‚ùå Air quality for sensor {sensor_id} on {day}: {type(e).__name__}")

# The count includes the historical frame seeded into the list earlier.
print(f"üìä Collected {len(all_aq_rows)} air quality dataframes")


Fetching air quality for sensor 60853, 1/103
Fetching air quality for sensor 59497, 2/103
Fetching air quality for sensor 59650, 3/103
Fetching air quality for sensor 112672, 4/103
Fetching air quality for sensor 60889, 5/103
Fetching air quality for sensor 60076, 6/103
Fetching air quality for sensor 58921, 7/103
Fetching air quality for sensor 84085, 8/103
Fetching air quality for sensor 89584, 9/103
Fetching air quality for sensor 198559, 10/103
Fetching air quality for sensor 149242, 11/103
Fetching air quality for sensor 105325, 12/103
Fetching air quality for sensor 78529, 13/103
Fetching air quality for sensor 88876, 14/103
Fetching air quality for sensor 65272, 15/103
Fetching air quality for sensor 77488, 16/103
Fetching air quality for sensor 351115, 17/103
Fetching air quality for sensor 122302, 18/103
Fetching air quality for sensor 196735, 19/103
Fetching air quality for sensor 69724, 20/103
Fetching air quality for sensor 60859, 21/103
Fetching air quality for sensor 6514

### 2.4.5. Fetch Missing Weather Forecast Data

In [11]:
total = len(sensor_locations)

for position, (sensor_id, meta) in enumerate(sensor_locations.items(), start=1):
    print(f"Fetching weather for sensor {sensor_id}, {position}/{total}")

    for day in missing_dates:
        try:
            # Each request covers the missing day plus the following six days,
            # so consecutive missing days return overlapping date ranges.
            weather_df = fetchers.get_weather_forecast(
                sensor_id=sensor_id,
                latitude=meta["latitude"],
                longitude=meta["longitude"],
                start_date=day,
                end_date=day + timedelta(days=6),
            )

            if not weather_df.empty:
                weather_df["sensor_id"] = int(sensor_id)
                weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.normalize()
                all_weather_rows.append(weather_df)

        except Exception as e:
            # Best effort: log the failure type and move on to the next day.
            print(f"‚ùå Weather for sensor {sensor_id} on {day}: {type(e).__name__}")

print(f"üìä Collected {len(all_weather_rows)} weather dataframes")


Fetching weather for sensor 60853, 1/103
Fetching weather for sensor 59497, 2/103
Fetching weather for sensor 59650, 3/103
Fetching weather for sensor 112672, 4/103
Fetching weather for sensor 60889, 5/103
Fetching weather for sensor 60076, 6/103
Fetching weather for sensor 58921, 7/103
Fetching weather for sensor 84085, 8/103
Fetching weather for sensor 89584, 9/103
Fetching weather for sensor 198559, 10/103
Fetching weather for sensor 149242, 11/103
Fetching weather for sensor 105325, 12/103
Fetching weather for sensor 78529, 13/103
Fetching weather for sensor 88876, 14/103
Fetching weather for sensor 65272, 15/103
Fetching weather for sensor 77488, 16/103
Fetching weather for sensor 351115, 17/103
Fetching weather for sensor 122302, 18/103
Fetching weather for sensor 196735, 19/103
Fetching weather for sensor 69724, 20/103
Fetching weather for sensor 60859, 21/103
Fetching weather for sensor 65146, 22/103
Fetching weather for sensor 57421, 23/103
Fetching weather for sensor 194215, 

### 2.4.6. Clean and Align Data Structure

In [12]:
cleaned_aq_rows = []

# Columns whose name marks them as derived features; everything else is "base".
feature_markers = ("lag", "rolling", "nearby")
engineered_cols = [c for c in historical.columns if any(marker in c for marker in feature_markers)]
base_cols = [c for c in historical.columns if c not in engineered_cols]
base_col_set = set(base_cols)

for i, df in enumerate(all_aq_rows):
    if df.empty or "pm25" not in df.columns or df["pm25"].isna().all():
        print(f"‚ö†Ô∏è Skipping empty or invalid df[{i}]")
        continue

    # Normalise to midnight and drop timezone info (modifies the frame in place).
    df["date"] = pd.to_datetime(df["date"]).dt.normalize().dt.tz_localize(None)

    # Require at least three of the base columns, otherwise treat as malformed.
    shared_cols = base_col_set.intersection(df.columns)
    if len(shared_cols) < 3:
        print(f"‚ö†Ô∏è Skipping malformed df[{i}] with columns: {list(df.columns)}")
        continue

    # Keep only the base columns, in the historical order; absent ones become NaN.
    aligned = df.reindex(columns=base_cols, fill_value=np.nan)

    # Final sanity check on the column count.
    if aligned.shape[1] != len(base_cols):
        print(f"‚ùå Still malformed after alignment: df[{i}] shape={aligned.shape}")
        continue

    # Cast each base column to the dtype used by the historical frame. A failed
    # cast is reported and the column is left as it is; the frame is still kept.
    for col in base_cols:
        try:
            aligned[col] = aligned[col].astype(historical[col].dtype, errors="raise")
        except Exception as e:
            print(f"‚ö†Ô∏è Could not cast column '{col}' in df[{i}]: {e}")

    cleaned_aq_rows.append(aligned)

print(f"üìã Cleaned {len(cleaned_aq_rows)} air quality dataframes")
print(f"üìã Using base columns (excluding engineered features): {len(base_cols)} columns")

‚ö†Ô∏è Skipping empty or invalid df[0]
üìã Cleaned 824 air quality dataframes
üìã Using base columns (excluding engineered features): 9 columns


### 2.4.7. Combine and Clean Weather Data

In [13]:
if not all_weather_rows:
    all_weather = pd.DataFrame()
    print("‚ö†Ô∏è No weather data collected")
else:
    # Stack every fetched frame and order the rows by sensor and date.
    all_weather = (
        pd.concat(all_weather_rows, ignore_index=True)
        .sort_values(["sensor_id", "date"])
        .reset_index(drop=True)
    )
    all_weather["date"] = pd.to_datetime(all_weather["date"]).dt.tz_localize(None)

    # Overlapping forecast windows yield repeated (sensor, date) rows; keep the first.
    all_weather = all_weather.drop_duplicates(subset=["sensor_id", "date"], keep="first")

    print(f"üå§Ô∏è Total weather records: {len(all_weather)}")
    print(f"üìÖ Weather date range: {all_weather['date'].min()} to {all_weather['date'].max()}")

üå§Ô∏è Total weather records: 1442
üìÖ Weather date range: 2026-01-19 00:00:00 to 2026-02-01 00:00:00


## 2.5. Combine Data and Add Engineered Features

In [14]:
# from utils import feature_engineering

# locations = feature_engineering.build_sensor_location_map(df, sensor_locations)
# print("DEBUG LOCATIONS:", locations)

In [15]:
# Strip previously engineered columns from the historical window so they are
# recomputed below over the combined data.
engineered_cols = [c for c in historical.columns if "lag" in c or "rolling" in c or "nearby" in c]
historical_base = historical.drop(columns=engineered_cols, errors="ignore")

# Stack historical and cleaned frames, ordered by sensor and date.
all_aq = (
    pd.concat([historical_base, *cleaned_aq_rows], ignore_index=True)
    .sort_values(["sensor_id", "date"])
    .reset_index(drop=True)
)
all_aq["date"] = pd.to_datetime(all_aq["date"]).dt.normalize().dt.tz_localize(None)

# One row per (sensor_id, date): the first occurrence wins.
all_aq = all_aq.drop_duplicates(subset=["sensor_id", "date"], keep="first").reset_index(drop=True)

print(f"üìä Total records after deduplication: {len(all_aq)}")
print(f"üìä Unique sensors: {all_aq['sensor_id'].nunique()}")
print(f"üìä Date range: {all_aq['date'].min()} to {all_aq['date'].max()}")

# Engineered features: rolling window, then lags, then the nearby-sensor
# feature, which also receives the sensor_locations dict.
all_aq = feature_engineering.add_rolling_window_feature(all_aq, window_days=3)
all_aq = feature_engineering.add_lagged_features(all_aq, lags=[1, 2, 3])
all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, sensor_locations, n_closest=3)

üìä Total records after deduplication: 824
üìä Unique sensors: 103
üìä Date range: 2026-01-19 00:00:00 to 2026-01-26 00:00:00


## 2.6. Insert Data to Feature Groups

### 2.6.1. Batch Insert Air Quality Data by Date

In [16]:
# Dtypes each column is cast to before insertion into the feature group.
aq_insert_dtypes = {
    "sensor_id": "int32",
    "pm25": "float64",
    "pm25_lag_1d": "float64",
    "pm25_lag_2d": "float64",
    "pm25_lag_3d": "float64",
    "pm25_rolling_3d": "float64",
    "pm25_nearby_avg": "float64",
    "city": "string",
    "street": "string",
    "country": "string",
    "aqicn_url": "string",
    "latitude": "float64",
    "longitude": "float64",
}

# Loop-invariant lookups: engineered column names and feature group column order.
engineered_cols = [c for c in all_aq.columns if "lag" in c or "rolling" in c or "nearby" in c]
fg_columns = [f.name for f in air_quality_fg.features]

for day in missing_dates:
    on_day = all_aq["date"].dt.date == day

    # Only rows with a pm25 value AND every engineered feature present are inserted.
    day_rows = (
        all_aq[on_day]
        .copy()
        .dropna(subset=["pm25"])
        .dropna(subset=engineered_cols, how="any")
    )

    if day_rows.empty:
        print(f"‚ö†Ô∏è No valid rows for {day}")
        continue

    # Cast, then reorder columns to the feature group's order.
    day_rows = day_rows.astype(aq_insert_dtypes)
    day_rows = day_rows[fg_columns]

    air_quality_fg.insert(day_rows)
    print(f"‚úÖ Inserted {len(day_rows)} rows for {day}")

‚ö†Ô∏è No valid rows for 2026-01-19
‚ö†Ô∏è No valid rows for 2026-01-20
‚ö†Ô∏è No valid rows for 2026-01-21
2026-01-26 10:29:47,630 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:01 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 103 rows for 2026-01-22
2026-01-26 10:29:56,773 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:01 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 103 rows for 2026-01-23
2026-01-26 10:30:04,992 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:00 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 103 rows for 2026-01-24
2026-01-26 10:30:12,497 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:00 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 103 rows for 2026-01-25
2026-01-26 10:30:20,444 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 103/103 | Elapsed Time: 00:01 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 103 rows for 2026-01-26


### 2.6.2. Verify Air Quality Insertion

In [17]:
# Show today's rows of the in-memory all_aq frame (key pm25 columns only).
# This inspects the local frame; it does not read back from the feature group.
print(all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_rolling_3d", "pm25_nearby_avg"]])

     sensor_id       date  pm25  pm25_lag_1d  pm25_rolling_3d  pm25_nearby_avg
7        57421 2026-01-26  10.0         10.0             10.0        11.333333
15       58666 2026-01-26   7.0          7.0              7.0        10.666667
23       58909 2026-01-26  13.0         13.0             13.0        20.666667
31       58912 2026-01-26   7.0          7.0              7.0        12.666667
39       58921 2026-01-26  13.0         13.0             13.0        11.333333
..         ...        ...   ...          ...              ...              ...
791     494275 2026-01-26   6.0          6.0              6.0         3.666667
799     497266 2026-01-26  12.0         12.0             12.0         5.333333
807     533086 2026-01-26  14.0         14.0             14.0        11.333333
815     556792 2026-01-26  10.0         10.0             10.0        11.000000
823     562600 2026-01-26  13.0         13.0             13.0         7.333333

[103 rows x 6 columns]


In [18]:
# Show yesterday's rows of the in-memory all_aq frame with every column.
print(all_aq[all_aq["date"].dt.date == today - timedelta(days=1)])

     sensor_id       date  pm25                        city  \
6        57421 2026-01-25  10.0                 Johannehill   
14       58666 2026-01-25   7.0                      √Ñngeby   
22       58909 2026-01-25  13.0                       Slaka   
30       58912 2026-01-25   7.0                    H√§gern√§s   
38       58921 2026-01-25  13.0  Skarpn√§cks stadsdelsomr√•de   
..         ...        ...   ...                         ...   
790     494275 2026-01-25   6.0                      Stavre   
798     497266 2026-01-25  12.0                  Skellefte√•   
806     533086 2026-01-25  14.0                        Berg   
814     556792 2026-01-25  10.0                  Norrk√∂ping   
822     562600 2026-01-25  13.0                       Solna   

                 street country                            aqicn_url  \
6                  Ubby  Sweden   https://api.waqi.info/feed/A57421/   
14        Jupitersv√§gen  Sweden   https://api.waqi.info/feed/A58666/   
22        Tr√∂skare

In [19]:
# Debug: compare sensor counts between the combined frame and the location dict
print("üîç Debugging nearby sensor feature:")
print(f"Total sensors in all_aq: {all_aq['sensor_id'].nunique()}")
print(f"Total sensors in sensor_locations: {len(sensor_locations)}")

# Summary statistics of pm25_lag_1d over today's rows
lag_stats = all_aq[all_aq['date'].dt.date == today]['pm25_lag_1d'].describe()
print(f"\npm25_lag_1d stats for today:")
print(lag_stats)

# Inspect the last five rows of one hard-coded sensor
test_sensor = 58666
print(f"\nüîç Checking sensor {test_sensor}:")
sensor_data = all_aq[all_aq['sensor_id'] == test_sensor].tail(5)
print(sensor_data[['sensor_id', 'date', 'pm25', 'pm25_lag_1d', 'pm25_nearby_avg']])

# Check that the sensor has an entry in the locations dict and print its
# coordinates (this does not look up its neighbours)
if test_sensor in sensor_locations:
    print(f"\nSensor {test_sensor} is in sensor_locations")
    print(f"Lat/Lon: {sensor_locations[test_sensor]['latitude']}, {sensor_locations[test_sensor]['longitude']}")
else:
    print(f"\n‚ö†Ô∏è Sensor {test_sensor} NOT in sensor_locations!")


üîç Debugging nearby sensor feature:
Total sensors in all_aq: 103
Total sensors in sensor_locations: 103

pm25_lag_1d stats for today:
count    103.000000
mean      11.883495
std       18.600180
min        0.000000
25%        5.000000
50%        8.000000
75%       13.000000
max      151.000000
Name: pm25_lag_1d, dtype: float64

üîç Checking sensor 58666:
    sensor_id       date  pm25  pm25_lag_1d  pm25_nearby_avg
11      58666 2026-01-22   7.0          7.0        10.666667
12      58666 2026-01-23   7.0          7.0        10.666667
13      58666 2026-01-24   7.0          7.0        10.666667
14      58666 2026-01-25   7.0          7.0        10.666667
15      58666 2026-01-26   7.0          7.0        10.666667

Sensor 58666 is in sensor_locations
Lat/Lon: 59.98333, 17.73333


In [20]:
# # Debug: Manually trace the nearby sensor calculation for one sensor
# from utils.feature_engineering import build_sensor_location_map, compute_closest_sensors

# test_sensor = 58666
# locations = build_sensor_location_map(all_aq, sensor_locations)
# closest_map = compute_closest_sensors(locations, n_closest=3)

# print(f"üîç Closest sensors to {test_sensor}:")
# neighbors = closest_map.get(test_sensor, [])
# print(f"Neighbors: {neighbors}")

# if neighbors:
#     # Get neighbor data
#     neighbor_df = all_aq[all_aq['sensor_id'].isin(neighbors)][['date', 'pm25_lag_1d']]
#     print(f"\nüìä Neighbor data (showing last 10):")
#     print(neighbor_df.tail(10))
    
#     # Group by date
#     neighbor_avg = neighbor_df.groupby('date')['pm25_lag_1d'].mean().reset_index()
#     print(f"\nüìä Neighbor average by date:")
#     print(neighbor_avg.tail(10))
    
#     # Get sensor data and merge
#     sensor_data = all_aq[all_aq['sensor_id'] == test_sensor]
#     print(f"\nüìä Sensor {test_sensor} data:")
#     print(sensor_data[['sensor_id', 'date', 'pm25_lag_1d']].tail(5))
    
#     merged = sensor_data.merge(neighbor_avg, on='date', how='left')
#     print(f"\nüìä After merge:")
#     print(merged[['sensor_id', 'date', 'pm25_lag_1d_x', 'pm25_lag_1d_y']].tail(5))
#     print(f"\n‚úÖ Column 'pm25_lag_1d_y' exists: {'pm25_lag_1d_y' in merged.columns}")


In [21]:
# # Re-import the fixed module
# import importlib
# importlib.reload(feature_engineering)

# # Re-apply the nearby sensor feature with the fixed function
# print("üîÑ Re-calculating nearby sensor averages with fixed function...")
# all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, sensor_locations, n_closest=3)

# print("\n‚úÖ Re-calculated nearby sensor averages")
# print("\nüìä Sample data for today:")
# sample = all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_nearby_avg"]].head(10)
# print(sample)

# print(f"\nüìä Nearby avg stats:")
# print(all_aq["pm25_nearby_avg"].describe())
# print(f"\nüìä Non-null nearby averages: {all_aq['pm25_nearby_avg'].notna().sum()} / {len(all_aq)}")


In [22]:
# # Check today's data more thoroughly
# today_data = all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_nearby_avg"]]
# print(f"üìä Today's data ({today}):")
# print(f"Total records: {len(today_data)}")
# print(f"Records with nearby_avg: {today_data['pm25_nearby_avg'].notna().sum()}")
# print(f"Records without nearby_avg: {today_data['pm25_nearby_avg'].isna().sum()}")

# print("\nüîç Sensors still missing nearby_avg on today:")
# missing_nearby = today_data[today_data['pm25_nearby_avg'].isna()]
# if len(missing_nearby) > 0:
#     print(missing_nearby.head(10))
#     print(f"\n...and {max(0, len(missing_nearby) - 10)} more")
# else:
#     print("None! All sensors have nearby averages ‚úÖ")


### 2.6.3. Batch Insert Weather Forecast Data

In [23]:
def _retryable_insert_errors():
    """Return the exception classes that should trigger a retry of a weather insert.

    ``KafkaException`` is not imported anywhere in this notebook, so naming it
    directly in the ``except`` clause raised ``NameError`` as soon as an insert
    failed, bypassing both the retry and the CSV fallback. It is resolved here
    from ``confluent_kafka`` only when that package has already been loaded
    into the process; otherwise it is left out of the tuple.

    The ``requests`` ``Timeout`` imported at the top of the notebook is included
    next to the builtin ``TimeoutError``, because the former is not a subclass
    of the latter.
    """
    retryable = (ProtocolError, ConnectionError, Timeout, TimeoutError)
    kafka_exception = getattr(sys.modules.get("confluent_kafka"), "KafkaException", None)
    if isinstance(kafka_exception, type) and issubclass(kafka_exception, BaseException):
        retryable += (kafka_exception,)
    return retryable


if not all_weather.empty:
    # Convert types to match feature group schema
    all_weather = all_weather.astype({
        "sensor_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })

    # Ensure correct column order
    weather_fg_columns = [f.name for f in weather_fg.features]
    all_weather = all_weather[weather_fg_columns]

    # Insert in smaller batches to avoid connection issues
    batch_size = 100
    max_retries = 3
    total_inserted = 0

    for i in range(0, len(all_weather), batch_size):
        batch = all_weather.iloc[i:i+batch_size]
        batch_number = i // batch_size + 1

        for attempt in range(max_retries):
            try:
                weather_fg.insert(batch)
                total_inserted += len(batch)
                print(f"‚úÖ Weather batch {batch_number}: {len(batch)} records (total: {total_inserted}/{len(all_weather)})")
                break
            # The expression is evaluated only when an exception is actually
            # propagating, i.e. after the insert has had the chance to load
            # the Kafka client.
            except _retryable_insert_errors():
                if attempt < max_retries - 1:
                    # Exponential backoff: 1s, 2s, ...
                    wait_time = 2 ** attempt
                    print(f"‚ö†Ô∏è Connection error on weather batch {batch_number}, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    # Retries exhausted: keep the batch on disk and move on
                    # to the next batch.
                    print(f"‚ùå Failed weather batch {batch_number}")
                    failed_file = f"{root_dir}/failed_weather_batch_{today}_{i}.csv"
                    batch.to_csv(failed_file, index=False)
                    print(f"üíæ Saved to {failed_file}")

    print(f"üå§Ô∏è Total weather inserted: {total_inserted}/{len(all_weather)} records")
else:
    print("‚ö†Ô∏è No weather data to insert")

2026-01-26 10:30:29,186 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_1_offline_fg_materialization/executions
‚úÖ Weather batch 1: 100 records (total: 100/1442)
2026-01-26 10:30:43,730 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:00 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/weather_1_offline_fg_materialization/config_1768459788862) to trigger the materialization job again.

‚úÖ Weather batch 2: 100 records (total: 200/1442)
2026-01-26 10:30:51,315 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 3: 100 records (total: 300/1442)
2026-01-26 10:30:59,887 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 4: 100 records (total: 400/1442)
2026-01-26 10:31:08,669 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 5: 100 records (total: 500/1442)
2026-01-26 10:31:17,290 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 6: 100 records (total: 600/1442)
2026-01-26 10:31:26,234 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 7: 100 records (total: 700/1442)
2026-01-26 10:31:34,793 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 8: 100 records (total: 800/1442)
2026-01-26 10:31:43,313 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 9: 100 records (total: 900/1442)
2026-01-26 10:31:51,783 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 10: 100 records (total: 1000/1442)
2026-01-26 10:32:00,010 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 11: 100 records (total: 1100/1442)
2026-01-26 10:32:08,620 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 12: 100 records (total: 1200/1442)
2026-01-26 10:32:17,241 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 13: 100 records (total: 1300/1442)
2026-01-26 10:32:26,012 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚úÖ Weather batch 14: 100 records (total: 1400/1442)
2026-01-26 10:32:34,581 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 42/42 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_1_offline_fg_materialization/executions
‚úÖ Weather batch 15: 42 records (total: 1442/1442)
üå§Ô∏è Total weather inserted: 1442/1442 records


### 2.6.4. Print Processing Summary

In [24]:
# NOTE(review): the summary print below is disabled. The counters it reads
# (`successful`, `skipped`, `failed`) are not defined in the cells visible
# here - confirm they exist before re-enabling it.
# print(f"\nüìä Summary: ‚úÖ {successful} successful, ‚è≠Ô∏è {skipped} skipped, ‚ùå {failed} failed")

## 2.7. Inspect Inserted Data

In [25]:
# Air quality: summarise the frame prepared earlier in this run.
# Nothing is printed when `all_aq` is undefined or empty.
# The reported count is len(all_aq), i.e. the row count of the frame itself.
if 'all_aq' in locals():
    if not all_aq.empty:
        print(f"‚úÖ Air quality records inserted: {len(all_aq)}")

        # First rows, as a quick sanity check of the values.
        print("\nüìã Sample air quality data:")
        print(all_aq.head())

        # Column dtypes.
        print("\nüîß Air quality data types:")
        print(all_aq.dtypes)

        # Smallest and largest value found in the `date` column.
        print("\nüìÖ Date range:")
        aq_dates = all_aq['date']
        print(f"From {aq_dates.min()} to {aq_dates.max()}")

# Weather: same summary, except every distinct `date` value is listed
# instead of a min/max range.
if 'all_weather' in locals():
    if not all_weather.empty:
        print(f"\nüå§Ô∏è Weather records inserted: {len(all_weather)}")

        # First rows, as a quick sanity check of the values.
        print("\nüìã Sample weather data:")
        print(all_weather.head())

        # Column dtypes.
        print("\nüîß Weather data types:")
        print(all_weather.dtypes)

        # Distinct values of the `date` column.
        print("\nüìÖ Unique weather dates:")
        weather_dates = all_weather['date'].unique()
        print(weather_dates)

‚úÖ Air quality records inserted: 824

üìã Sample air quality data:
   sensor_id       date  pm25         city street country  \
0      57421 2026-01-19  10.0  Johannehill   Ubby  Sweden   
1      57421 2026-01-20  10.0  Johannehill   Ubby  Sweden   
2      57421 2026-01-21  10.0  Johannehill   Ubby  Sweden   
3      57421 2026-01-22  10.0  Johannehill   Ubby  Sweden   
4      57421 2026-01-23  10.0  Johannehill   Ubby  Sweden   

                            aqicn_url  latitude  longitude  pm25_rolling_3d  \
0  https://api.waqi.info/feed/A57421/      62.0       15.0              NaN   
1  https://api.waqi.info/feed/A57421/      62.0       15.0             10.0   
2  https://api.waqi.info/feed/A57421/      62.0       15.0             10.0   
3  https://api.waqi.info/feed/A57421/      62.0       15.0             10.0   
4  https://api.waqi.info/feed/A57421/      62.0       15.0             10.0   

   pm25_lag_1d  pm25_lag_2d  pm25_lag_3d  pm25_nearby_avg  
0          NaN          NaN  