# 2. Feature Pipeline

## 2.1. Setup

In [None]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, datetime, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    for parent in [start] + list(start.parents):
        if (parent / "pyproject.toml").exists():
            return parent
    return start

root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError  
from requests.exceptions import ConnectionError, Timeout
from confluent_kafka import KafkaException
import numpy as np

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata

#  Load settings 
settings = config.HopsworksSettings()
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Login to Hopsworks
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25
HopsworksSettings initialized!
2026-01-16 09:19:12,384 INFO: Initializing external client
2026-01-16 09:19:12,384 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-16 09:19:25,531 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


Repository management

In [2]:
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

Repository exists at c:\Users\krist\Documents\GitHub\pm25\notebooks\pm25-forecast-openmeteo-aqicn


In [3]:
today = date.today()

if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception:
    pass

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

## 2.3. Get Feature Groups

In [4]:
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.4. Load Metadata from Feature Group

In [5]:
# Load metadata from air_quality feature group
aq_data = air_quality_fg.read()

if len(aq_data) == 0:
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

# Extract unique sensor metadata
metadata_df = aq_data[["sensor_id", "latitude", "longitude", "city", "street", "country", "aqicn_url"]].drop_duplicates(subset=["sensor_id"])
print(f"üìç Loaded metadata for {len(metadata_df)} sensors")
metadata_df = metadata_df.set_index("sensor_id")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.99s) 
üìç Loaded metadata for 103 sensors


## 2.5. Data Collection
Loop through all sensors to fetch today's air quality data and weather forecasts, format data to match feature group schemas.

Create a copy of dataframe and set up counters

In [6]:
# Load metadata from feature group for nearby sensor calculations
metadata_indexed = metadata_df.copy()
metadata_indexed.index = metadata_indexed.index.astype(int)

successful = 0
failed = 0
skipped = 0

print(f"üîç Processing {len(metadata_indexed)} sensor locations.")

üîç Processing 103 sensor locations.


Load historical Air Quality data for all sensors

In [7]:
historical_start = today - timedelta(days=4)
try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_start_dt = pd.to_datetime(historical_start)
        
        # Include TODAY in historical data (we'll filter it out later per sensor)
        historical_df = historical_df[
            (historical_df["date"] >= historical_start_dt) & 
            (historical_df["date"] <= today_dt)  # Changed < to <=
        ][["date", "sensor_id", "pm25"]]
        
        historical_df = historical_df[historical_df["sensor_id"].isin(metadata_indexed.index)]
    else:
        historical_df = pd.DataFrame()
except Exception as e:
    print(f"‚ö†Ô∏è Error reading historical data: {e}")
    historical_df = pd.DataFrame()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (8.49s) 


Initialize containers for results

In [8]:
aq_list = []
weather_dict = {}  # sensor_id -> weather_df

Data collection

In [9]:
existing_dates = air_quality_fg.read()["date"].dt.date.unique()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (15.16s) 


In [10]:
today = datetime.today().date()
start_date = today - timedelta(days=7)  # or however far back you want to check

expected_dates = pd.date_range(start=start_date, end=today, freq="D").date
missing_dates = [d for d in expected_dates if d not in existing_dates]

Load historical data

In [11]:
historical_cutoff = pd.to_datetime(min(missing_dates)) - pd.Timedelta(days=3)
# historical_cutoff = min(missing_dates) - timedelta(days=3)
historical = air_quality_fg.read()
historical["date"] = pd.to_datetime(historical["date"]).dt.tz_localize(None)
historical = historical [historical["date"] >= historical_cutoff]
# historical = historical[historical["date"] >= pd.to_datetime(historical_cutoff)]

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (14.98s) 


Track existing sensor-date pairs

In [12]:
existing = historical[["sensor_id", "date"]].copy()
existing["date_only"] = existing["date"].dt.date
existing_keys = set(zip(existing["sensor_id"], existing["date_only"]))

Add historical data

In [13]:
all_aq_rows = [historical]

Fetch missing sensor-date combinations

In [15]:
count = 1
for sensor_id, meta in metadata_df.iterrows():
    print(f"Processing sensor {sensor_id}, {count}/{len(metadata_df)}")
    count += 1
    for day in missing_dates:
        if (sensor_id, day) in existing_keys:
            continue  # Already exists in Hopsworks, skip API call
        try:
            aq_df = fetchers.get_pm25(
                meta["aqicn_url"], meta["country"], meta["city"],
                meta["street"], day, AQICN_API_KEY
            )
            if aq_df.empty or aq_df["pm25"].isna().all():
                continue

            aq_df["sensor_id"] = int(sensor_id)
            aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce")
            aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
            
            # Add metadata columns
            aq_df["city"] = meta["city"]
            aq_df["street"] = meta["street"]
            aq_df["country"] = meta["country"]
            aq_df["aqicn_url"] = meta["aqicn_url"]
            aq_df["latitude"] = meta["latitude"]
            aq_df["longitude"] = meta["longitude"]
            
            aq_df = aq_df.drop(columns=["url"], errors="ignore")

            all_aq_rows.append(aq_df)

        except Exception as e:
            print(f"‚ùå Sensor {sensor_id} on {day}: {type(e).__name__}")
            continue

Processing sensor 60853, 1/103
Processing sensor 59497, 2/103
Processing sensor 59650, 3/103
Processing sensor 112672, 4/103
Processing sensor 60889, 5/103
Processing sensor 60076, 6/103
Processing sensor 58921, 7/103
Processing sensor 84085, 8/103
Processing sensor 89584, 9/103
Processing sensor 198559, 10/103
Processing sensor 149242, 11/103
Processing sensor 105325, 12/103
Processing sensor 78529, 13/103
Processing sensor 88876, 14/103
Processing sensor 65272, 15/103
Processing sensor 77488, 16/103
Processing sensor 351115, 17/103
Processing sensor 122302, 18/103
Processing sensor 196735, 19/103
Processing sensor 69724, 20/103
Processing sensor 60859, 21/103
Processing sensor 65146, 22/103
Processing sensor 57421, 23/103
Processing sensor 194215, 24/103
Processing sensor 82384, 25/103
Processing sensor 180187, 26/103
Processing sensor 68167, 27/103
Processing sensor 129124, 28/103
Processing sensor 79999, 29/103
Processing sensor 59593, 30/103
Processing sensor 462457, 31/103
Proces

In [17]:
cleaned_aq_rows = []
expected_cols = historical.columns.tolist()

for i, df in enumerate(all_aq_rows):
    if df.empty or "pm25" not in df.columns or df["pm25"].isna().all():
        print(f"‚ö†Ô∏è Skipping empty or invalid df[{i}]")
        continue

    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    # Skip if too few expected columns are present
    if len(set(df.columns) & set(expected_cols)) < 3:
        print(f"‚ö†Ô∏è Skipping malformed df[{i}] with columns: {list(df.columns)}")
        continue

    # Align columns
    aligned = df.reindex(columns=expected_cols, fill_value=np.nan)

    # Final sanity check
    if aligned.shape[1] != len(expected_cols):
        print(f"‚ùå Still malformed after alignment: df[{i}] shape={aligned.shape}")
        continue

    # Force dtype alignment to match historical
    for col in expected_cols:
        if col in historical.columns:
            try:
                aligned[col] = aligned[col].astype(historical[col].dtype, errors="raise")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not cast column '{col}' in df[{i}]: {e}")
                continue

    cleaned_aq_rows.append(aligned)

# print(f"‚úÖ historical shape: {historical.shape}")
# for i, df in enumerate(cleaned_aq_rows):
#     print(f"‚úÖ cleaned_aq_rows[{i}] shape: {df.shape}")

print("üìã Column names match:", all(df.columns.equals(historical.columns) for df in cleaned_aq_rows))

for i, df in enumerate(cleaned_aq_rows):
    mismatched = [(col, df[col].dtype, historical[col].dtype)
                  for col in df.columns if col in historical.columns and df[col].dtype != historical[col].dtype]
    if mismatched:
        print("üìã Dtype mismatch:")
        print(f"  df[{i}] mismatches: {mismatched}")

all_aq = pd.concat([historical, *cleaned_aq_rows], ignore_index=True)
all_aq = all_aq.sort_values(["sensor_id", "date"]).reset_index(drop=True)
all_aq["date"] = pd.to_datetime(all_aq["date"]).dt.tz_localize(None)

# Feature engineering
all_aq = feature_engineering.add_rolling_window_feature(all_aq, window_days=3)
all_aq = feature_engineering.add_lagged_features(all_aq, lags=[1, 2, 3])
metadata_indexed = metadata_indexed.reset_index()
all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, metadata_indexed, n_closest=3)

üìã Column names match: True


In [18]:
for day in missing_dates:
    day_rows = all_aq[all_aq["date"].dt.date == day].copy()
    day_rows = day_rows.dropna(subset=["pm25"])

    engineered_cols = [c for c in day_rows.columns if "lag" in c or "rolling" in c or "nearby" in c]
    day_rows = day_rows.dropna(subset=engineered_cols, how="any")

    if not day_rows.empty:
        # Convert types to match feature group schema
        day_rows = day_rows.astype({
            "sensor_id": "int32",
            "pm25": "float64",
            "pm25_lag_1d": "float64",
            "pm25_lag_2d": "float64",
            "pm25_lag_3d": "float64",
            "pm25_rolling_3d": "float64",
            "pm25_nearby_avg": "float64",
            "city": "string",
            "street": "string",
            "country": "string",
            "aqicn_url": "string",
            "latitude": "float64",
            "longitude": "float64",
        })
        
        # Ensure correct column order
        fg_columns = [f.name for f in air_quality_fg.features]
        day_rows = day_rows[fg_columns]
        
        air_quality_fg.insert(day_rows)
        print(f"‚úÖ Inserted {len(day_rows)} rows for {day}")
    else:
        print(f"‚ö†Ô∏è No valid rows for {day}")

‚ö†Ô∏è No valid rows for 2026-01-09
2026-01-16 09:38:19,173 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1/1 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_1_offline_fg_materialization/executions
‚úÖ Inserted 1 rows for 2026-01-10
2026-01-16 09:39:01,006 INFO: 	8 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 2/2 | Elapsed Time: 00:01 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 2 rows for 2026-01-15
2026-01-16 09:39:25,811 INFO: 	8 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 2/2 | Elapsed Time: 00:01 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kristina_titanic/Resources/jobs/air_quality_1_offline_fg_materialization/config_1768459798660) to trigger the materialization job again.

‚úÖ Inserted 2 rows for 2026-01-16


Build a unified dataframe

In [19]:
# all_aq_rows = []   # raw air quality rows for all sensors
# weather_dict = {}  # weather per sensor

# for sensor_id, meta in metadata_df.iterrows():
#     try:
#         # Fetch today's PM2.5
#         aq_today_df = fetchers.get_pm25(
#             meta["aqicn_url"], meta["country"], meta["city"],
#             meta["street"], today, AQICN_API_KEY
#         )

#         if aq_today_df.empty or aq_today_df["pm25"].isna().all():
#             continue

#         # Format
#         aq_today_df["sensor_id"] = int(sensor_id)
#         aq_today_df["pm25"] = pd.to_numeric(aq_today_df["pm25"], errors="coerce")
#         aq_today_df["date"] = pd.to_datetime(aq_today_df["date"]).dt.tz_localize(None)
#         aq_today_df = aq_today_df.drop(columns=["url", "country", "city", "street"], errors="ignore")

#         # Add historical rows for this sensor
#         if not historical_df.empty:
#             hist = historical_df[
#                 (historical_df["sensor_id"] == sensor_id) &
#                 (historical_df["date"].dt.date < today)
#             ]
#             if not hist.empty:
#                 all_aq_rows.append(hist)

#         # Add today's row
#         all_aq_rows.append(aq_today_df)

#         # Fetch weather once per sensor
#         if sensor_id not in weather_dict:
#             end_date = today + timedelta(days=7)
#             wdf = fetchers.get_weather_forecast(
#                 sensor_id, today, end_date, meta["latitude"], meta["longitude"]
#             )
#             if not wdf.empty:
#                 wdf["sensor_id"] = sensor_id
#                 wdf["date"] = pd.to_datetime(wdf["date"]).dt.tz_localize(None)
#                 weather_dict[sensor_id] = wdf

#     except Exception as e:
#         print(f"‚ùå Sensor {sensor_id}: {type(e).__name__}")
#         continue

Combine all sensors into one datafram and add engineered features

In [20]:
# # Combine all sensors into one dataframe
# all_aq = pd.concat(all_aq_rows, ignore_index=True)
# all_aq = all_aq.sort_values(["sensor_id", "date"]).reset_index(drop=True)

# # Ensure datetime is clean
# all_aq["date"] = pd.to_datetime(all_aq["date"]).dt.tz_localize(None)

# min_date = today - timedelta(days=4)
# all_aq = all_aq[all_aq["date"].dt.date >= min_date]

# # Apply feature engineering across all sensors
# all_aq = feature_engineering.add_rolling_window_feature(all_aq, window_days=3)
# all_aq = feature_engineering.add_lagged_features(all_aq, lags=[1, 2, 3])
# all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, metadata_indexed, n_closest=3)

In [21]:
print(all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_rolling_3d", "pm25_nearby_avg"]])

     sensor_id       date  pm25  pm25_lag_1d  pm25_rolling_3d  pm25_nearby_avg
14       57421 2026-01-16  20.0         17.0        13.666667        40.333333
15       57421 2026-01-16  17.0         20.0        19.000000        40.333333
19       58666 2026-01-16   7.0          7.0         7.000000              NaN
23       58909 2026-01-16  32.0         32.0        32.000000              NaN
30       58912 2026-01-16  17.0         18.0        17.666667              NaN
..         ...        ...   ...          ...              ...              ...
638     494275 2026-01-16   6.0          4.0         4.666667              NaN
642     497266 2026-01-16   7.0          7.0         7.000000              NaN
646     533086 2026-01-16  46.0         46.0        46.000000              NaN
650     556792 2026-01-16  56.0         56.0        56.000000              NaN
654     562600 2026-01-16  29.0         29.0        29.000000              NaN

[161 rows x 6 columns]


In [22]:
print(all_aq[all_aq["date"].dt.date == today - timedelta(days=1)])

     sensor_id       date  pm25  pm25_lag_1d  pm25_lag_2d  pm25_lag_3d  \
12       57421 2026-01-15  20.0          4.0          4.0          4.0   
13       57421 2026-01-15  17.0         20.0          4.0          4.0   
18       58666 2026-01-15   7.0          7.0          7.0          NaN   
22       58909 2026-01-15  32.0         32.0         32.0          NaN   
28       58912 2026-01-15  17.0         18.0         17.0         18.0   
..         ...        ...   ...          ...          ...          ...   
636     494275 2026-01-15   6.0          4.0          6.0          4.0   
641     497266 2026-01-15   7.0          7.0          7.0          NaN   
645     533086 2026-01-15  46.0         46.0         46.0          NaN   
649     556792 2026-01-15  56.0         56.0         56.0          NaN   
653     562600 2026-01-15  29.0         29.0         29.0          NaN   

     pm25_rolling_3d  pm25_nearby_avg         city          street country  \
12          4.000000        40.33

Extract todays engineered rows for insertion

In [23]:
today_rows = all_aq[all_aq["date"].dt.date == today].copy()

# Drop rows with missing target
today_rows = today_rows.dropna(subset=["pm25"])

# Optional: drop rows missing engineered features
engineered_cols = [c for c in today_rows.columns if "lag" in c or "rolling" in c or "nearby" in c]
today_rows = today_rows.dropna(subset=engineered_cols, how="any")

print(f"Engineered rows for today: {len(today_rows)}")

Engineered rows for today: 2


In [24]:
# for sensor_id, meta in metadata_df.iterrows():
#     try:
#         # Fetch current air quality
#         aq_today_df = fetchers.get_pm25(meta["aqicn_url"], meta["country"], meta["city"], 
#                                        meta["street"], today, AQICN_API_KEY)
        
#         if aq_today_df.empty or aq_today_df['pm25'].isna().all():
#             skipped += 1
#             continue
        
#         # Format air quality data
#         aq_today_df["sensor_id"] = int(sensor_id)
#         aq_today_df["pm25"] = pd.to_numeric(aq_today_df["pm25"], errors="coerce")
#         aq_today_df["date"] = pd.to_datetime(aq_today_df["date"]).dt.tz_localize(None)
#         aq_today_df = aq_today_df.drop(columns=["url", "country", "city", "street"], errors="ignore")
        
#         # Combine with historical data (last 4 days)
#         if not historical_df.empty:
#             sensor_historical = historical_df[
#                 (historical_df["sensor_id"] == sensor_id) & 
#                 (historical_df["date"].dt.date < today)
#             ]
#         else:
#             sensor_historical = pd.DataFrame()
        
#         combined = pd.concat([sensor_historical, aq_today_df], ignore_index=True) if not sensor_historical.empty else aq_today_df
#         combined = combined.sort_values("date").reset_index(drop=True)
        
#         # Add features using historical + todays data
#         combined = feature_engineering.add_rolling_window_feature(combined, window_days=3, column="pm25", new_column="pm25_rolling_3d")
#         combined = feature_engineering.add_lagged_features(combined, column="pm25", lags=[1, 2, 3])
#         combined = feature_engineering.add_nearby_sensor_feature(combined, metadata_indexed, n_closest=3)
        
#         # Only filter out future dates if any exist
#         combined = combined[combined["date"].dt.date <= today].copy()
        
#         if combined.empty or combined['pm25'].isna().all():
#             skipped += 1
#             continue
        
#         aq_list.append(combined)

        
#         # Fetch weather for each sensor
#         if sensor_id not in weather_dict:
#             end_date = today + timedelta(days=7)
#             weather_df = fetchers.get_weather_forecast(sensor_id, today, end_date, 
#                                                       meta["latitude"], meta["longitude"])
#             if not weather_df.empty:
#                 weather_df["sensor_id"] = sensor_id
#                 weather_df["date"] = pd.to_datetime(weather_df["date"])
#                 weather_df = weather_df.dropna(subset=['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max'])
#                 weather_dict[sensor_id] = weather_df
        
#         successful += 1
#         if successful % 10 == 0:
#             print(f"‚úÖ Processed {successful}/{len(metadata_df)} sensors")
            
#     except Exception as e:
#         failed += 1
#         print(f"‚ùå Sensor {sensor_id}: {type(e).__name__}")
#         continue
        

Batch insert Air Quality

In [25]:
if aq_list:
    all_aq = pd.concat(aq_list, ignore_index=True)
    
    # Convert types
    all_aq = all_aq.astype({
        "sensor_id": "int32",
        "pm25": "float64",
        "pm25_lag_1d": "float64",
        "pm25_lag_2d": "float64",
        "pm25_lag_3d": "float64",
        "pm25_rolling_3d": "float64",
        "pm25_nearby_avg": "float64",
        "city": "string",
        "street": "string",
        "country": "string",
        "aqicn_url": "string",
        "latitude": "float64",
        "longitude": "float64",
    })
    
    # Ensure correct column order
    fg_columns = [f.name for f in air_quality_fg.features]
    all_aq = all_aq[fg_columns]
    
    air_quality_fg.insert(all_aq)
    print(f"üìä Inserted {len(all_aq)} air quality records")

Batch insert Weather

In [26]:
if weather_dict:
    all_weather = pd.concat(weather_dict.values(), ignore_index=True)
    
    # Convert types
    all_weather = all_weather.astype({
        "sensor_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })
    
    # Insert in smaller batches
    batch_size = 100
    total_inserted = 0
    
    for i in range(0, len(all_weather), batch_size):
        batch = all_weather.iloc[i:i+batch_size]
        max_retries = 3
        
        for attempt in range(max_retries):
            try:
                weather_fg.insert(batch)
                total_inserted += len(batch)
                print(f"‚úÖ Weather batch {i//batch_size + 1}: {len(batch)} records (total: {total_inserted}/{len(all_weather)})")
                break
            except (ProtocolError, ConnectionError, TimeoutError, KafkaException) as e:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"‚ö†Ô∏è  Connection error on weather batch {i//batch_size + 1}, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"‚ùå Failed weather batch {i//batch_size + 1}")
                    failed_file = f"{root_dir}/failed_weather_batch_{today}_{i}.csv"
                    batch.to_csv(failed_file, index=False)
                    print(f"üíæ Saved to {failed_file}")
    
    print(f"üå§Ô∏è Total inserted: {total_inserted}/{len(all_weather)} weather records")

Print summary

In [27]:
print(f"\nüìä Summary: ‚úÖ {successful} successful, ‚è≠Ô∏è {skipped} skipped, ‚ùå {failed} failed")


üìä Summary: ‚úÖ 0 successful, ‚è≠Ô∏è 0 skipped, ‚ùå 0 failed


## 2.6. Inspect Inserted Data

In [28]:
if 'all_aq' in locals() and not all_aq.empty:
    print(f"‚úÖ Air quality records inserted: {len(all_aq)}")
    print("\nüìã Sample air quality data:")
    print(all_aq.head())
    print("\nüîß Air quality data types:")
    print(all_aq.dtypes)
    print("\nüìÖ Date range:")
    print(f"From {all_aq['date'].min()} to {all_aq['date'].max()}")

if 'all_weather' in locals() and not all_weather.empty:
    print(f"\nüå§Ô∏è Weather records inserted: {len(all_weather)}")
    print("\nüìã Sample weather data:")
    print(all_weather.head())
    print("\nüîß Weather data types:")
    print(all_weather.dtypes)
    print("\nüìÖ Unique weather dates:")
    print(all_weather['date'].unique())

‚úÖ Air quality records inserted: 655

üìã Sample air quality data:
   sensor_id       date  pm25  pm25_lag_1d  pm25_lag_2d  pm25_lag_3d  \
0      57421 2026-01-09  20.0          NaN          NaN          NaN   
1      57421 2026-01-09  17.0         20.0          NaN          NaN   
2      57421 2026-01-10  20.0         17.0         20.0          NaN   
3      57421 2026-01-10  17.0         20.0         17.0         20.0   
4      57421 2026-01-11   4.0         17.0         20.0         17.0   

   pm25_rolling_3d  pm25_nearby_avg         city street country  \
0              NaN        40.000000  Johannehill   Ubby  Sweden   
1             20.0        40.000000  Johannehill   Ubby  Sweden   
2             18.5        40.333333  Johannehill   Ubby  Sweden   
3             19.0        40.333333  Johannehill   Ubby  Sweden   
4             18.0              NaN  Johannehill   Ubby  Sweden   

                            aqicn_url  latitude  longitude  
0  https://api.waqi.info/feed/A574