# 2. Feature Pipeline

## 2.1. Setup

### 2.1.1. Import Libraries

In [5]:
# Standard imports
import os
from pathlib import Path
import sys
import json
import time
from datetime import date, datetime, timedelta
from dotenv import load_dotenv

#  Establish project root directory
def find_project_root(start: Path):
    for parent in [start] + list(start.parents):
        if (parent / "pyproject.toml").exists():
            return parent
    return start

root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import numpy as np
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from confluent_kafka import KafkaException
from hsfs.client.exceptions import RestAPIError
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from scipy.spatial.distance import cdist

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

today = datetime.today().date()

Project root dir: /Users/kristina/Github/pm25-unlinked


### 2.1.2. Load Settings and Initialize Hopsworks Connection

In [None]:
def detect_environment():
    if (
        "HOPSWORKS_JOB_ID" in os.environ
        or "HOPSWORKS_PROJECT_ID" in os.environ
        or "HOPSWORKS_JOB_NAME" in os.environ
    ):
        return "job"

    cwd = os.getcwd()
    if cwd.startswith("/hopsfs/Jupyter"):
        return "jupyter"

    return "local"

env = detect_environment()
print(f"Detected environment: {env}")

if env in ("job", "jupyter"):
    project = hopsworks.login()
    secrets_api = hopsworks.get_secrets_api()

    for key in ["HOPSWORKS_API_KEY", "AQICN_API_KEY", "GH_PAT", "GH_USERNAME"]:
        os.environ[key] = secrets_api.get_secret(key).value

else:
    env_path = os.path.join(root_dir, '.env')
    load_dotenv(env_path)
    print(f"‚úÖ Loaded .env from: {env_path}")

settings = config.HopsworksSettings()

HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

HOPSWORKS_ENDPOINT = os.environ.get('HOPSWORKS_ENDPOINT')
if HOPSWORKS_ENDPOINT:
    HOPSWORKS_HOST = HOPSWORKS_ENDPOINT.replace('https://', '').replace('http://', '')
    print(f"üîó Using Hopsworks host: {HOPSWORKS_HOST}")
else:
    HOPSWORKS_HOST = None
    print("üîó Using default Hopsworks host")

print(f"üîë API key loaded: {HOPSWORKS_API_KEY[:20]}...{HOPSWORKS_API_KEY[-10:]}")

if HOPSWORKS_HOST:
    project = hopsworks.login(host=HOPSWORKS_HOST, api_key_value=HOPSWORKS_API_KEY)
else:
    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

fs = project.get_feature_store()

print("Environment initialized and Hopsworks connected!")
print(project.name)

Detected environment: local
‚úÖ Loaded .env from: /Users/kristina/Github/pm25-unlinked/.env
üîó Using Hopsworks host: eu-west.cloud.hopsworks.ai
üîë API key loaded: vcbtKUr5zSFtHtin.PGB...PIVn8Zeqjj
2026-02-19 09:44:57,931 INFO: Initializing external client
2026-02-19 09:44:57,932 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.7) by running 'pip install hopsworks==4.7.*'







2026-02-19 09:44:59,276 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/19575
Environment initialized and Hopsworks connected!
pm25_sweden


### 2.1.3. Repository management

In [7]:
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

üíª URL: /Users/kristina/Github/pm25-unlinked
   ‚úÖ Configured git remote with authentication for pm25-unlinked


### 2.1.3. Configure API Keys and Secrets

In [8]:
secrets = hopsworks.get_secrets_api()

try:
    secrets.get_secret("AQICN_API_KEY")
except:
    secrets.create_secret("AQICN_API_KEY", settings.AQICN_API_KEY.get_secret_value())

## 2.2. Get Feature Groups

In [9]:
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.3. Load Sensor Locations from Feature Group

In [10]:
# Load data from air_quality feature group
aq_data = air_quality_fg.read()

if len(aq_data) == 0:
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)


sensor_locations = metadata.get_sensor_locations_dict(air_quality_fg)
print(f"üìç Loaded locations for {len(sensor_locations)} existing sensors")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (42.02s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (37.36s) 
üìç Loaded locations for 102 existing sensors


## 2.4. Data Collection
Fetch today's air quality data and weather forecasts, format data to match feature group schemas.

### 2.4.1. Load Historical Air Quality Data (Last 4 Days)

In [11]:
print(f"üîç Processing {len(sensor_locations)} sensor locations.")
historical_start = today - timedelta(days=4)

try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_start_dt = pd.to_datetime(historical_start)
        
        historical_df = historical_df[
            (historical_df["date"] >= historical_start_dt) & 
            (historical_df["date"] <= today_dt) 
        ][["date", "sensor_id", "pm25"]]
        
        historical_df = historical_df[historical_df["sensor_id"].isin(sensor_locations.keys())]
    else:
        historical_df = pd.DataFrame()
except Exception as e:
    print(f"‚ö†Ô∏è Error reading historical data: {e}")
    historical_df = pd.DataFrame()

üîç Processing 102 sensor locations.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (36.60s) 


### 2.4.2. Identify Missing Dates for Backfill

In [12]:
aq_data = air_quality_fg.read()
existing_dates = pd.to_datetime(aq_data["date"]).dt.date.unique()

# Debug: Show what dates exist in the feature store
if len(existing_dates) > 0:
    print(f"üìÖ Feature store date range: {min(existing_dates)} to {max(existing_dates)}")
    print(f"üìÖ Total unique dates in store: {len(existing_dates)}")
else:
    print("‚ö†Ô∏è No dates found in feature store")

today = datetime.today().date()
start_date = today - timedelta(days=7)  # Check last 7 days for missing data

# Generate expected dates and convert to set for faster lookup
expected_dates = set(pd.date_range(start=start_date, end=today, freq="D").date)
existing_dates_set = set(existing_dates)

# Find missing dates
original_missing_dates = sorted(list(expected_dates - existing_dates_set))

print(f"\nüîç Checking for missing dates between {start_date} and {today}")
print(f"   Expected dates: {len(expected_dates)}")
print(f"   Existing dates in that range: {len(expected_dates & existing_dates_set)}")
print(f"   Missing dates: {len(original_missing_dates)}")

# Separate: dates to fetch vs dates to insert
dates_to_insert = original_missing_dates.copy()  # Only insert the actual missing dates
dates_to_fetch = original_missing_dates.copy()   # Fetch missing dates + buffer

# Add 3 buffer days before first missing date to ensure we can calculate lag features
if original_missing_dates:
    earliest_missing = min(original_missing_dates)
    buffer_dates = [earliest_missing - timedelta(days=i) for i in range(1, 4)]
    # Only add buffer dates that aren't already in existing_dates
    buffer_dates = [d for d in buffer_dates if d not in existing_dates_set]
    dates_to_fetch = sorted(buffer_dates + dates_to_fetch)

formatted = ", ".join(d.isoformat() for d in dates_to_fetch) if dates_to_fetch else "None"
insert_formatted = ", ".join(d.isoformat() for d in dates_to_insert) if dates_to_insert else "None"
print(f"\nüìÖ Dates to fetch: {formatted}")
print(f"üìÖ Dates to insert: {insert_formatted}")

# Exit early if no missing dates
if not dates_to_fetch:
    print("\n‚úÖ No missing dates found. Feature store is up to date!")
    print("   The feature pipeline will continue without fetching new data.")
    dates_to_insert = []
    dates_to_fetch = []

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (36.34s) 
üìÖ Feature store date range: 2019-12-09 to 2025-12-19
üìÖ Total unique dates in store: 2201

üîç Checking for missing dates between 2026-02-12 and 2026-02-19
   Expected dates: 8
   Existing dates in that range: 0
   Missing dates: 8

üìÖ Dates to fetch: 2026-02-09, 2026-02-10, 2026-02-11, 2026-02-12, 2026-02-13, 2026-02-14, 2026-02-15, 2026-02-16, 2026-02-17, 2026-02-18, 2026-02-19
üìÖ Dates to insert: 2026-02-12, 2026-02-13, 2026-02-14, 2026-02-15, 2026-02-16, 2026-02-17, 2026-02-18, 2026-02-19


### 2.4.3. Preparations

In [13]:
# Load historical data
historical = air_quality_fg.read()
historical["date"] = pd.to_datetime(historical["date"]).dt.tz_localize(None)

# Skip if no dates to fetch
if not dates_to_fetch:
    print("\n‚è≠Ô∏è  Skipping data preparation - no missing dates")
    all_aq_rows = []
    all_weather_rows = []
else:
    print(f"\nüìã Preparing to fetch data for {len(dates_to_fetch)} dates")
    
    # Filter historical data to relevant window
    historical_cutoff = pd.to_datetime(min(dates_to_fetch)) - pd.Timedelta(days=3)
    historical = historical[historical["date"] >= historical_cutoff]

    # Track existing sensor-date pairs
    existing = historical[["sensor_id", "date"]].copy()
    existing["date_only"] = existing["date"].dt.date
    existing_keys = set(zip(existing["sensor_id"], existing["date_only"]))

    # Initialize data containers
    all_aq_rows = [historical]
    all_weather_rows = []

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (36.97s) 

üìã Preparing to fetch data for 11 dates


### 2.4.4. Fetch Missing Air Quality Data

In [14]:
count = 1
total = len(sensor_locations)

for sensor_id, meta in sensor_locations.items():
    print(f"Fetching air quality for sensor {sensor_id}, {count}/{total}")
    count += 1

    for day in dates_to_fetch:
        if (sensor_id, day) in existing_keys:
            continue

        try:
            aq_df = fetchers.get_pm25(
                meta["aqicn_url"], meta["country"], meta["city"],
                meta["street"], day, AQICN_API_KEY
            )

            if aq_df.empty or aq_df["pm25"].isna().all():
                continue

            aq_df["sensor_id"] = int(sensor_id)
            aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce")
            # aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
            aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.normalize()

            # Add metadata
            aq_df["city"] = meta["city"]
            aq_df["street"] = meta["street"]
            aq_df["country"] = meta["country"]
            aq_df["aqicn_url"] = meta["aqicn_url"]
            aq_df["latitude"] = meta["latitude"]
            aq_df["longitude"] = meta["longitude"]

            aq_df = aq_df.drop(columns=["url"], errors="ignore")

            all_aq_rows.append(aq_df)

        except Exception as e:
            print(f"‚ùå Air quality for sensor {sensor_id} on {day}: {type(e).__name__}")

print(f"üìä Collected {len(all_aq_rows)} air quality dataframes")

Fetching air quality for sensor 59899, 1/102
Fetching air quality for sensor 105325, 2/102
Fetching air quality for sensor 196735, 3/102
Fetching air quality for sensor 462457, 4/102
Fetching air quality for sensor 61045, 5/102
Fetching air quality for sensor 497266, 6/102
Fetching air quality for sensor 59593, 7/102
Fetching air quality for sensor 62566, 8/102
Fetching air quality for sensor 59887, 9/102
Fetching air quality for sensor 163156, 10/102
Fetching air quality for sensor 107110, 11/102
Fetching air quality for sensor 63637, 12/102
Fetching air quality for sensor 58666, 13/102
Fetching air quality for sensor 88876, 14/102
Fetching air quality for sensor 65146, 15/102
Fetching air quality for sensor 129124, 16/102
Fetching air quality for sensor 59893, 17/102
Fetching air quality for sensor 87319, 18/102
Fetching air quality for sensor 81505, 19/102
Fetching air quality for sensor 113542, 20/102
Fetching air quality for sensor 409513, 21/102
Fetching air quality for sensor 65

### 2.4.5. Fetch Missing Weather Forecast Data

In [15]:
count = 1
total = len(sensor_locations)

for sensor_id, meta in sensor_locations.items():
    print(f"Fetching weather for sensor {sensor_id}, {count}/{total}")
    count += 1

    for day in dates_to_fetch:
        try:
            weather_df = fetchers.get_weather_forecast(
                sensor_id=sensor_id,
                latitude=meta["latitude"],
                longitude=meta["longitude"],
                start_date=day,
                end_date=day + timedelta(days=6)
            )

            if weather_df.empty:
                continue

            weather_df["sensor_id"] = int(sensor_id)
            weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.normalize()

            all_weather_rows.append(weather_df)

        except Exception as e:
            print(f"‚ùå Weather for sensor {sensor_id} on {day}: {type(e).__name__}")

print(f"üìä Collected {len(all_weather_rows)} weather dataframes")

Fetching weather for sensor 59899, 1/102
Fetching weather for sensor 105325, 2/102
Fetching weather for sensor 196735, 3/102
Fetching weather for sensor 462457, 4/102
Fetching weather for sensor 61045, 5/102
Fetching weather for sensor 497266, 6/102
Fetching weather for sensor 59593, 7/102
Fetching weather for sensor 62566, 8/102
Fetching weather for sensor 59887, 9/102
Fetching weather for sensor 163156, 10/102
Fetching weather for sensor 107110, 11/102
Fetching weather for sensor 63637, 12/102
Fetching weather for sensor 58666, 13/102
Fetching weather for sensor 88876, 14/102
Fetching weather for sensor 65146, 15/102
Fetching weather for sensor 129124, 16/102
Fetching weather for sensor 59893, 17/102
Fetching weather for sensor 87319, 18/102
Fetching weather for sensor 81505, 19/102
Fetching weather for sensor 113542, 20/102
Fetching weather for sensor 409513, 21/102
Fetching weather for sensor 65707, 22/102
Fetching weather for sensor 59356, 23/102
Fetching weather for sensor 61420,

### 2.4.9. Clean and Align Data Structure

In [16]:
cleaned_aq_rows = []

engineered_cols = [c for c in historical.columns if "lag" in c or "rolling" in c or "nearby" in c]
base_cols = [c for c in historical.columns if c not in engineered_cols]

for i, df in enumerate(all_aq_rows):
    if df.empty or "pm25" not in df.columns or df["pm25"].isna().all():
        print(f"‚ö†Ô∏è Skipping empty or invalid df[{i}]")
        continue

    df["date"] = pd.to_datetime(df["date"]).dt.normalize().dt.tz_localize(None)

    # Skip if too few expected columns are present
    if len(set(df.columns) & set(base_cols)) < 3:
        print(f"‚ö†Ô∏è Skipping malformed df[{i}] with columns: {list(df.columns)}")
        continue

    # Align to base columns only (no engineered features yet)
    aligned = df.reindex(columns=base_cols, fill_value=np.nan)

    # Final sanity check
    if aligned.shape[1] != len(base_cols):
        print(f"‚ùå Still malformed after alignment: df[{i}] shape={aligned.shape}")
        continue

    # Force dtype alignment to match historical (for base columns only)
    for col in base_cols:
        if col in historical.columns:
            try:
                aligned[col] = aligned[col].astype(historical[col].dtype, errors="raise")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not cast column '{col}' in df[{i}]: {e}")
                continue

    cleaned_aq_rows.append(aligned)

print(f"üìã Cleaned {len(cleaned_aq_rows)} air quality dataframes")
print(f"üìã Using base columns (excluding engineered features): {len(base_cols)} columns")

‚ö†Ô∏è Skipping empty or invalid df[0]
üìã Cleaned 1117 air quality dataframes
üìã Using base columns (excluding engineered features): 9 columns


### 2.4.10. Combine and Clean Weather Data

In [17]:
if all_weather_rows:
    all_weather = pd.concat(all_weather_rows, ignore_index=True)
    all_weather = all_weather.sort_values(["sensor_id", "date"]).reset_index(drop=True)
    all_weather["date"] = pd.to_datetime(all_weather["date"]).dt.tz_localize(None)
    
    # Remove duplicates
    all_weather = all_weather.drop_duplicates(subset=["sensor_id", "date"], keep="first")
    
    print(f"üå§Ô∏è Total weather records: {len(all_weather)}")
    print(f"üìÖ Weather date range: {all_weather['date'].min()} to {all_weather['date'].max()}")
else:
    all_weather = pd.DataFrame()
    print("‚ö†Ô∏è No weather data collected")

üå§Ô∏è Total weather records: 1734
üìÖ Weather date range: 2026-02-09 00:00:00 to 2026-02-25 00:00:00


## 2.5. Combine Data and Add Engineered Features

In [18]:
# Drop engineered columns from historical data before combining
engineered_cols = [c for c in historical.columns if "lag" in c or "rolling" in c or "nearby" in c]
historical_base = historical.drop(columns=engineered_cols, errors="ignore")

# Combine data
all_aq = pd.concat([historical_base, *cleaned_aq_rows], ignore_index=True)
all_aq = all_aq.sort_values(["sensor_id", "date"]).reset_index(drop=True)
all_aq["date"] = pd.to_datetime(all_aq["date"]).dt.normalize().dt.tz_localize(None)

# Remove duplicates: keep the first occurrence of each sensor_id + date combination
all_aq = all_aq.drop_duplicates(subset=["sensor_id", "date"], keep="first").reset_index(drop=True)

print(f"üìä Total records after deduplication: {len(all_aq)}")
print(f"üìä Unique sensors: {all_aq['sensor_id'].nunique()}")
print(f"üìä Date range: {all_aq['date'].min()} to {all_aq['date'].max()}")

# Add engineered features
all_aq = feature_engineering.add_rolling_window_feature(all_aq, window_days=3)
all_aq = feature_engineering.add_lagged_features(all_aq, lags=[1, 2, 3])

# Pass sensor_locations dict to nearby sensor feature
all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, sensor_locations, n_closest=3)

üìä Total records after deduplication: 1117
üìä Unique sensors: 102
üìä Date range: 2026-02-09 00:00:00 to 2026-02-19 00:00:00


## 2.6. Insert Data to Feature Groups

### 2.6.1. Insert Air Quality Data

In [19]:
if dates_to_insert:
    print(f"\nüîç Preparing to insert air quality data for {len(dates_to_insert)} dates")
    total_inserted = 0
    
    for day in dates_to_insert:
        day_rows = all_aq[all_aq["date"].dt.date == day].copy()
        
        # Show what we have before filtering
        print(f"\n   Date {day}: {len(day_rows)} total rows before filtering")
        
        # Filter out rows with missing pm25
        day_rows = day_rows.dropna(subset=["pm25"])
        print(f"   After pm25 filter: {len(day_rows)} rows")

        # Identify engineered feature columns
        engineered_cols = [c for c in day_rows.columns if "lag" in c or "rolling" in c or "nearby" in c]
        
        # Check which engineered features have NaN
        if not day_rows.empty:
            for col in engineered_cols:
                nan_count = day_rows[col].isna().sum()
                if nan_count > 0:
                    print(f"   ‚ö†Ô∏è  {col}: {nan_count}/{len(day_rows)} NaN values")
        
        # Filter out rows with missing engineered features
        day_rows = day_rows.dropna(subset=engineered_cols, how="any")
        print(f"   After engineered features filter: {len(day_rows)} rows")

        if not day_rows.empty:
            # Convert types to match feature group schema
            day_rows = day_rows.astype({
                "sensor_id": "int32",
                "pm25": "float64",
                "pm25_lag_1d": "float64",
                "pm25_lag_2d": "float64",
                "pm25_lag_3d": "float64",
                "pm25_rolling_3d": "float64",
                "pm25_nearby_avg": "float64",
                "city": "string",
                "street": "string",
                "country": "string",
                "aqicn_url": "string",
                "latitude": "float64",
                "longitude": "float64",
            })
            
            # Ensure correct column order
            fg_columns = [f.name for f in air_quality_fg.features]
            day_rows = day_rows[fg_columns]
            
            # Insert data to feature group
            try:
                air_quality_fg.insert(day_rows)
                total_inserted += len(day_rows)
                print(f"   ‚úÖ Inserted {len(day_rows)} rows for {day}")
            except Exception as e:
                print(f"   ‚ùå Error: {e}")
        else:
            print(f"   ‚ö†Ô∏è  No valid rows for {day}")
    
    print(f"\n‚úÖ Total air quality inserted: {total_inserted} records")
else:
    print("\n‚ö†Ô∏è  No air quality data to insert")


üîç Preparing to insert air quality data for 8 dates

   Date 2026-02-12: 100 total rows before filtering
   After pm25 filter: 100 rows
   ‚ö†Ô∏è  pm25_lag_3d: 1/100 NaN values
   After engineered features filter: 99 rows
2026-02-19 14:14:51,955 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://eu-west.cloud.hopsworks.ai:443/p/19575/fs/13380/fg/30733
   ‚ùå Error: Delta Lake (deltalake) and its dependencies are required for non-Spark operations. Install 'hops-deltalake' to enable Delta RS features.

   Date 2026-02-13: 102 total rows before filtering
   After pm25 filter: 102 rows
   After engineered features filter: 102 rows
2026-02-19 14:14:53,689 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://eu-west.cloud.hopsworks.ai:443/p/19575/fs/13380/fg/30733
   ‚ùå Error: Delta Lake (deltalake) and its

### 2.6.2. Insert Weather Forecast Data

In [20]:
if not all_weather.empty:
    print(f"\nüå§Ô∏è  Preparing to insert {len(all_weather)} weather records")
    
    # Convert types to match feature group schema
    all_weather = all_weather.astype({
        "sensor_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })
    
    # Ensure correct column order
    weather_fg_columns = [f.name for f in weather_fg.features]
    all_weather = all_weather[weather_fg_columns]
    
    # Insert in smaller batches to avoid connection issues
    batch_size = 100
    total_inserted = 0
    
    for i in range(0, len(all_weather), batch_size):
        batch = all_weather.iloc[i:i+batch_size]
        max_retries = 3
        
        for attempt in range(max_retries):
            try:
                weather_fg.insert(batch)
                total_inserted += len(batch)
                print(f"   ‚úÖ Weather batch {i//batch_size + 1}: {len(batch)} records (total: {total_inserted}/{len(all_weather)})")
                break
            except (ProtocolError, ConnectionError, TimeoutError, KafkaException) as e:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"   ‚ö†Ô∏è  Connection error on weather batch {i//batch_size + 1}, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"   ‚ùå Failed weather batch {i//batch_size + 1}")
                    failed_file = f"{root_dir}/failed_weather_batch_{today}_{i}.csv"
                    batch.to_csv(failed_file, index=False)
                    print(f"   üíæ Saved to {failed_file}")
    
    print(f"\n‚úÖ Total weather inserted: {total_inserted}/{len(all_weather)} records")
else:
    print("\n‚ö†Ô∏è  No weather data to insert")


üå§Ô∏è  Preparing to insert 1734 weather records
2026-02-19 14:15:05,115 INFO: 	7 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://eu-west.cloud.hopsworks.ai:443/p/19575/fs/13380/fg/28686


ImportError: Delta Lake (deltalake) and its dependencies are required for non-Spark operations. Install 'hops-deltalake' to enable Delta RS features.

## 2.7. Pipeline Completion

In [None]:
print("=" * 80)
print("‚úÖ FEATURE PIPELINE COMPLETED SUCCESSFULLY")
print("=" * 80)
print(f"\nüìä Data Summary:")
print(f"   - Dates processed: {len(dates_to_insert)}")
print(f"   - Air quality records inserted: {total_inserted if dates_to_insert else 0}")
print(f"   - Weather records inserted: {len(all_weather) if not all_weather.empty else 0}")
print(f"\nüíæ Feature Groups Updated:")
print(f"   - {air_quality_fg.name} (v{air_quality_fg.version})")
print(f"   - {weather_fg.name} (v{weather_fg.version})")
print("\n" + "=" * 80)