# 1. Backfill Pipeline

## 1.1. Setup

### 1.1.1. Import Libraries

In [50]:
# Standard imports
import os
from pathlib import Path
import sys
import json
import time
from datetime import date, datetime, timedelta
from dotenv import load_dotenv
import warnings

warnings.filterwarnings("ignore", module="IPython")
warnings.filterwarnings("ignore", category=DeprecationWarning)

#  Establish project root directory
def find_project_root(start: Path):
    """Locate the project root by searching upward for ``pyproject.toml``.

    Checks ``start`` first and then each of its ancestors, nearest first.
    Returns the first directory containing ``pyproject.toml``; if none does,
    returns ``start`` unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the project root starting from the current working directory.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

# Put the project root on sys.path (once) so that the project-level
# `utils` package imported further down can be resolved.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import numpy as np
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from confluent_kafka import KafkaException
from hsfs.client.exceptions import RestAPIError
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from scipy.spatial.distance import cdist

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

# Calendar date at execution time; datetime.today() is timezone-naive local time.
today = datetime.today().date()

Project root dir: c:\Users\krist\Documents\GitHub\pm25


### 1.1.2. Load settings and Initialize Hopsworks Connection

In [51]:

def detect_environment():
    """Classify where this notebook is running.

    Returns:
        "job"     if any Hopsworks job/project environment variable is set,
        "jupyter" if the working directory is under ``/hopsfs/Jupyter``,
        "local"   otherwise.
    """
    job_markers = ("HOPSWORKS_JOB_ID", "HOPSWORKS_PROJECT_ID", "HOPSWORKS_JOB_NAME")
    if any(marker in os.environ for marker in job_markers):
        return "job"

    return "jupyter" if os.getcwd().startswith("/hopsfs/Jupyter") else "local"

# Decide where we are running: "job", "jupyter" or "local".
env = detect_environment()
print(f"Detected environment: {env}")

# Load secrets based on environment

if env in ("job", "jupyter"):
    # Inside Hopsworks: log in without an explicit API key and copy the
    # listed secrets into environment variables.
    project = hopsworks.login()
    secrets_api = hopsworks.get_secrets_api()

    # NOTE(review): `.value` raises AttributeError if get_secret() returns
    # None for a missing secret -- confirm against the installed hopsworks version.
    for key in ["HOPSWORKS_API_KEY", "AQICN_API_KEY", "GH_PAT", "GH_USERNAME"]:
        os.environ[key] = secrets_api.get_secret(key).value

else:
    # Local run: populate the environment via python-dotenv.
    load_dotenv()

# Load Pydantic settings
# (presumably populated from the environment variables prepared above -- see utils.config)

settings = config.HopsworksSettings()

# Unwrap the secret values needed as plain strings by later cells.
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Login to Hopsworks using the API key
# NOTE(review): in the "job"/"jupyter" branch this is a second login that
# rebinds `project`; confirm that is intended.

project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

print("Environment initialized and Hopsworks connected!")


Detected environment: local
HopsworksSettings initialized!
2026-01-30 07:25:42,094 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-30 07:25:42,108 INFO: Initializing external client
2026-01-30 07:25:42,108 INFO: Base URL: https://c.app.hopsworks.ai:443


2026-01-30 07:25:43,690 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184
Environment initialized and Hopsworks connected!


### 1.1.3. Repository management

In [52]:
# Obtain the repository directory from the project helper, then make it the
# working directory of this process (affects every relative path used afterwards).
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

üíª Already in git repository at c:\Users\krist\Documents\GitHub\pm25


### 1.1.4. Configure API Keys and Secrets

In [53]:
secrets = hopsworks.get_secrets_api()

# Make sure the AQICN API key is stored as a Hopsworks secret.
# A missing secret may surface either as an exception or as a `None` return
# value (this differs between hopsworks releases -- TODO confirm for the
# pinned version), so both cases lead to creating it.
try:
    existing_secret = secrets.get_secret("AQICN_API_KEY")
except Exception:
    # Deliberately not a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate.
    existing_secret = None

if existing_secret is None:
    secrets.create_secret("AQICN_API_KEY", settings.AQICN_API_KEY.get_secret_value())

## 1.2. Create Feature Groups

In [None]:
# Clean Up Old Resources (if recreating)

# Transient network failures (e.g. "Remote end closed connection without
# response") are retried instead of being skipped: a deletion that silently
# did not happen leaves a stale feature group behind for the cells that follow.
CLEANUP_MAX_ATTEMPTS = 3
CLEANUP_RETRY_DELAY_SECONDS = 5


def _delete_with_retry(delete_fn, resource_name):
    """Call ``delete_fn(fs, resource_name)``, retrying on network errors.

    Returns True when the call succeeded or failed with a non-network error
    (reported as a note, as before -- presumably "does not exist").
    Returns False when every attempt ended in a network error.
    """
    for attempt in range(1, CLEANUP_MAX_ATTEMPTS + 1):
        try:
            delete_fn(fs, resource_name)
            return True
        except (OSError, ProtocolError) as e:
            # OSError covers requests' ConnectionError/Timeout as well as the
            # built-in connection errors (http.client.RemoteDisconnected).
            print(f"   Note: {e} (attempt {attempt}/{CLEANUP_MAX_ATTEMPTS})")
            if attempt < CLEANUP_MAX_ATTEMPTS:
                time.sleep(CLEANUP_RETRY_DELAY_SECONDS)
        except Exception as e:
            print(f"   Note: {e}")
            return True
    return False


unresolved = []

# Delete feature view first (blocks feature group deletion)
print("üóëÔ∏è Step 1: Delete feature view (if exists)...")
if not _delete_with_retry(hopsworks_admin.delete_feature_views, "air_quality_complete_fv"):
    unresolved.append("air_quality_complete_fv")

# Delete old feature groups
print("\nüóëÔ∏è Step 2: Delete old feature groups (if exist)...")
for fg_name in ("air_quality", "weather"):
    if not _delete_with_retry(hopsworks_admin.delete_feature_groups, fg_name):
        unresolved.append(fg_name)

if unresolved:
    # Do not claim success when a deletion could not be confirmed.
    print(f"\nCleanup incomplete - could not delete: {', '.join(unresolved)}. Re-run this cell before creating feature groups.")
else:
    print("\n‚úÖ Cleanup complete - ready to create fresh feature groups")

üóëÔ∏è Step 1: Delete feature view (if exists)...

Deleted air_quality_complete_fv/1

üóëÔ∏è Step 2: Delete old feature groups (if exist)...

   Note: Remote end closed connection without response

Deleted weather/1

‚úÖ Cleanup complete - ready to create fresh feature groups


In [55]:
# Create the two feature groups through the project helper and keep both
# handles; later cells read from and insert into them.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1984891
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1984892


## 1.3. Check what backfill is needed

In [56]:
data_dir = os.path.join(root_dir, "data")
# os.listdir() returns entries in arbitrary order. The backfill iterates this
# list and each processed sensor is added to `sensor_locations`, which feeds
# the nearby-sensor feature of the sensors that follow, so a fixed order is
# needed for reproducible results.
dir_list = sorted(os.listdir(data_dir))

# Mapping of sensor_id -> location info for sensors already in the feature store.
sensor_locations = metadata.get_sensor_locations(air_quality_fg)
existing_sensors = set(sensor_locations.keys())

print(f"üìã Found {len(existing_sensors)} sensors already in feature store")
print(f"üìç Loaded locations for {len(sensor_locations)} existing sensors")

# One CSV file in the data directory corresponds to one sensor.
total_sensors = sum(1 for name in dir_list if name.endswith(".csv"))
remaining = total_sensors - len(existing_sensors)
print(f"üìä Total sensors: {total_sensors}, Already processed: {len(existing_sensors)}, Remaining: {remaining}")

2026-01-30 07:27:22,137 ERROR: [Errno 2] Opening HDFS file '/apps/hive/warehouse/kristina_titanic_featurestore.db/air_quality_1/.hoodie/hoodie.properties' failed. Detail: [errno 2] No such file or directory. Detail: Python exception: Traceback (most recent call last):
  File "/usr/src/app/src/server.py", line 142, in wrapper
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/src/app/src/server.py", line 166, in wrapper
    result = func(instance, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/src/app/src/server.py", line 196, in do_get
    return self._read_query(context, path, command)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/src/app/src/server.py", line 123, in wrapper
    return func(instance, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/src/app/src/server.py", line 131, in wrapper
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/src/app/s

## 1.4. Backfill
The first run can take a long time if many new sensors have been added.

In [None]:

# Backfill every sensor CSV that is not yet in the feature store.
# For each file: read it, resolve feed URL and coordinates, clean the air
# quality data, engineer features, fetch historical weather, then insert the
# weather rows followed by the air quality rows.
if total_sensors != len(existing_sensors):
    print("\nüöÄ Starting backfill process...\n")
    # Track processing stats
    successful = 0
    failed = 0
    skipped = 0
    failed_sensors = []  # (sensor identifier, reason) for every failure

    for file in dir_list:
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(data_dir, file)

        # Reset for every file. Without this, a failure while reading the file
        # would report the previous file's sensor ID in the error handler
        # below (or raise NameError on the very first file).
        sensor_id = None

        try:
            aq_df_raw, street, city, country, feed_url, sensor_id = metadata.read_sensor_data(
                file_path, AQICN_API_KEY
            )

            sensor_id = int(sensor_id)

            # Skip if already processed
            if sensor_id in existing_sensors:
                skipped += 1
                continue

            # Get working feed URL using sensor ID and API token
            try:
                working_feed_url = fetchers.get_working_feed_url(sensor_id, AQICN_API_KEY)
            except Exception as url_err:
                print(f"‚ö†Ô∏è Sensor {sensor_id}: Could not resolve feed URL - {url_err}")
                working_feed_url = feed_url  # Fallback to CSV feed_url if resolution fails

            # Get coordinates for this sensor
            lat, lon = metadata.get_coordinates(city, street, country)

            if lat is None or lon is None:
                print(f"‚ö†Ô∏è Sensor {sensor_id}: cannot geocode location")
                failed += 1
                failed_sensors.append((sensor_id, "Geocoding failed"))
                continue

            # Clean and prepare air quality data
            aq_df = cleaning.clean_and_append_data(
                aq_df_raw, sensor_id,
                city=city, street=street, country=country,
                latitude=lat, longitude=lon, aqicn_url=working_feed_url
            )
            # One row per date, oldest first.
            aq_df = aq_df.sort_values("date").drop_duplicates(subset=["date"], keep="first").reset_index(drop=True)

            # DEBUG: Check for duplicate columns after cleaning
            print(f"\nüîç DEBUG Sensor {sensor_id} - After cleaning:", flush=True)
            print(f"   Columns: {aq_df.columns.tolist()}", flush=True)
            print(f"   'date' count: {aq_df.columns.tolist().count('date')}", flush=True)

            # Add features one by one, checking for duplicates after each
            aq_df = feature_engineering.add_lagged_features(aq_df, "pm25", lags=[1,2,3])
            print(f"   After lagged features - 'date' count: {aq_df.columns.tolist().count('date')}", flush=True)

            aq_df = feature_engineering.add_rolling_window_feature(aq_df, window_days=3, column="pm25", new_column="pm25_rolling_3d")
            print(f"   After rolling window - 'date' count: {aq_df.columns.tolist().count('date')}", flush=True)

            # Calculate nearby sensor feature using location dict
            if len(sensor_locations) > 0:
                aq_df = feature_engineering.add_nearby_sensor_feature(
                    aq_df,
                    sensor_locations,
                    n_closest=3
                )
            else:
                # No other sensors known yet: use 0.0 as a placeholder.
                aq_df["pm25_nearby_avg"] = 0.0

            print(f"   After nearby sensor - 'date' count: {aq_df.columns.tolist().count('date')}", flush=True)
            print(f"   Final columns before weather: {aq_df.columns.tolist()}", flush=True)

            # Date range for weather: the three years (3 * 365 days) up to the
            # latest air quality observation.
            end_date = aq_df["date"].max().date()
            start_date = end_date - timedelta(days=365 * 3)

            # Fetch weather
            weather_df = fetchers.get_historical_weather(
                sensor_id, start_date, end_date, lat, lon
            )

            if weather_df is None or len(weather_df) == 0:
                print(f"‚ö†Ô∏è No weather data for sensor {sensor_id}")
                failed += 1
                failed_sensors.append((sensor_id, "No weather data"))
                continue

            # Prepare weather data
            weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.tz_localize(None)
            weather_df["sensor_id"] = sensor_id
            weather_df = weather_df.astype({
                "sensor_id": "int32",
                "temperature_2m_mean": "float64",
                "precipitation_sum": "float64",
                "wind_speed_10m_max": "float64",
                "wind_direction_10m_dominant": "float64",
            })

            print(f"   Weather columns before insert: {weather_df.columns.tolist()}", flush=True)
            print(f"   Weather 'date' count: {weather_df.columns.tolist().count('date')}", flush=True)

            # Insert without triggering materialization
            weather_fg.insert(weather_df, write_options={"start_offline_materialization": False})
            print(f"   ‚úÖ Weather inserted successfully", flush=True)

            # Prepare air quality data
            print(f"\nüîç Preparing AQ insert for Sensor {sensor_id}", flush=True)

            # Ensure date is properly formatted
            aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)

            # Ensure proper dtypes
            aq_df = aq_df.astype({
                "sensor_id": "int32",
                "pm25": "float64",
                "pm25_lag_1d": "float64",
                "pm25_lag_2d": "float64",
                "pm25_lag_3d": "float64",
                "pm25_rolling_3d": "float64",
                "pm25_nearby_avg": "float64",
                "city": "string",
                "street": "string",
                "country": "string",
                "aqicn_url": "string",
                "latitude": "float64",
                "longitude": "float64",
            })

            # Final verification
            print(f"   Columns to insert: {aq_df.columns.tolist()}", flush=True)
            print(f"   Shape: {aq_df.shape}", flush=True)

            # Insert without triggering materialization
            # NOTE(review): in the recorded run this call raises
            # "FeatureStoreException: No materialization job was found" after
            # the upload finishes -- verify that the air_quality feature group
            # was deleted and recreated cleanly before backfilling.
            air_quality_fg.insert(aq_df, write_options={"start_offline_materialization": False})

            existing_sensors.add(sensor_id)

            # Add this sensor's location to dict for subsequent nearby calculations
            sensor_locations[sensor_id] = (lat, lon, city, street, country)

            successful += 1
            print(f"‚úÖ Sensor {sensor_id} ({successful}/{remaining} complete)")

        except Exception as e:
            # Identify the failure by file name when it happened before the
            # sensor ID could be read from the file.
            failed_label = sensor_id if sensor_id is not None else file
            failed += 1
            failed_sensors.append((failed_label, f"{type(e).__name__}: {str(e)[:100]}"))
            print(f"‚ùå Sensor {failed_label}: {type(e).__name__}: {str(e)}")

    print(f"\nüéâ Backfill complete!")
    print(f"üìä Final Summary:")
    print(f"   ‚úÖ Successfully processed: {successful}")
    print(f"   ‚ùå Failed: {failed}")
    print(f"   ‚è© Skipped (already processed): {skipped}")
    print(f"   üìà Total in feature store: {len(existing_sensors)}/{total_sensors}")

    if len(failed_sensors) > 0:
        print(f"\n‚ö†Ô∏è  Failed Sensors Detail:")
        for sid, reason in failed_sensors:
            print(f"   ‚Ä¢ Sensor {sid}: {reason}")

else:
    print("\n‚úÖ All sensors already processed. No backfill needed.")



üöÄ Starting backfill process...


üîç DEBUG Sensor 105325 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1119/1119 | Elapsed Time: 00:01 | Remaining Time: 00:00


   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 105325
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1686, 14)


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1686/1686 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 105325: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 107110 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 107110
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1510, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1510/1510 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 107110: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 112672 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 112672
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (2004, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 2004/2004 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 112672: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 112993 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 112993
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (2006, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 2006/2006 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 112993: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 113539 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:00 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 113539
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1391, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1391/1391 | Elapsed Time: 00:02 | Remaining Time: 00:00


‚ùå Sensor 113539: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 113542 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 113542
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (410, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 410/410 | Elapsed Time: 00:00 | Remaining Time: 00:00


‚ùå Sensor 113542: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 121810 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 121810
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1872, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1872/1872 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 121810: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 122302 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 122302
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1980, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1980/1980 | Elapsed Time: 00:02 | Remaining Time: 00:00


‚ùå Sensor 122302: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 128095 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 128095
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1804, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1804/1804 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 128095: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 129124 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1127/1127 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully






üîç Preparing AQ insert for Sensor 129124
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1961, 14)


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1961/1961 | Elapsed Time: 00:02 | Remaining Time: 00:00


‚ùå Sensor 129124: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 149242 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 149242
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1703, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1703/1703 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 149242: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 154549 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1118/1118 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 154549
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1872, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1872/1872 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 154549: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 163156 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 163156
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1872, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1872/1872 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 163156: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 180187 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1127/1127 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 180187
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (1464, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1464/1464 | Elapsed Time: 00:02 | Remaining Time: 00:00


‚ùå Sensor 180187: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 191047 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1
‚ùå Sensor 191047: ConnectionError: HTTPSConnectionPool(host='c.app.hopsworks.ai', port=443): Max retries exceeded with url: /hopsworks-api/api/project/1279184/featurestores/1265800/featuregroups/1984892/expectationsuite (Caused by Na

Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 420664
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (754, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 754/754 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 420664: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 462457 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 462457
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (703, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 703/703 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 462457: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 474841 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1126/1126 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 474841
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (586, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 586/586 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 474841: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 476353 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 476353
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (596, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 596/596 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 476353: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 494275 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1104/1104 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 494275
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (497, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 497/497 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 494275: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 497266 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1103/1103 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 497266
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (411, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 411/411 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 497266: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 533086 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 533086
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (246, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 246/246 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 533086: FeatureStoreException: No materialization job was found

üîç DEBUG Sensor 556792 - After cleaning:
   Columns: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url']
   'date' count: 1
   After lagged features - 'date' count: 1
   After rolling window - 'date' count: 1
   After nearby sensor - 'date' count: 1
   Final columns before weather: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Weather columns before insert: ['date', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant', 'sensor_id']
   Weather 'date' count: 1


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1129/1129 | Elapsed Time: 00:01 | Remaining Time: 00:00

   ‚úÖ Weather inserted successfully

üîç Preparing AQ insert for Sensor 556792
   Columns to insert: ['pm25', 'date', 'sensor_id', 'city', 'street', 'country', 'latitude', 'longitude', 'aqicn_url', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_rolling_3d', 'pm25_nearby_avg']
   Shape: (111, 14)



Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 111/111 | Elapsed Time: 00:01 | Remaining Time: 00:00


‚ùå Sensor 556792: FeatureStoreException: No materialization job was found


## 1.5. Update Descriptions

In [None]:
# hopsworks_admin.update_air_quality_description(air_quality_fg)
# hopsworks_admin.update_weather_description(weather_fg)

## 1.6. Add Validation to Feature Groups

In [None]:
# Air-quality validation rules, collected in the order they are registered on the suite.
_aq_expectations = [
    # The column minimum of pm25 must be >= 0 (inclusive bound, no upper limit).
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs=dict(column="pm25", min_value=0.0, max_value=None, strict_min=False),
    ),
    # date must be one of the listed datetime-like types.
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs=dict(column="date", type_list=["datetime64", "Datetime", "Null"]),
    ),
    # (sensor_id, date) is the primary key, so the pair must be unique.
    gx.core.ExpectationConfiguration(
        expectation_type="expect_compound_columns_to_be_unique",
        kwargs=dict(column_list=["sensor_id", "date"]),
    ),
]

# Rolling and lag features must be numeric (float or int).
for col in ["pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d"]:
    _aq_expectations.append(
        gx.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_type_list",
            kwargs=dict(column=col, type_list=["float64", "Float64", "Int64", "Null"]),
        )
    )

# The validated table must contain at least one row.
_aq_expectations.append(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_be_between",
        kwargs=dict(min_value=1, max_value=None),
    )
)

# Build the suite, register every rule, then attach it to the air-quality feature group.
aq_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)
for _aq_expectation in _aq_expectations:
    aq_expectation_suite.add_expectation(_aq_expectation)

hopsworks_admin.save_or_replace_expectation_suite(air_quality_fg, aq_expectation_suite)


# Weather validation suite: date typing, range and numeric-type checks for
# temperature, precipitation and wind speed, plus a non-empty-table check.
weather_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

# date must be one of the listed datetime-like types
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "date",
            "type_list": ["datetime64", "Datetime", "Null"],
        },
    )
)

# Temperature column - should be within a physically plausible range
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "temperature_2m_mean",
            "min_value": -80,
            "max_value": 60,
            # mostly=1.0: no out-of-range values are tolerated.
            # NOTE(review): nulls are presumably skipped by GX itself, not by
            # `mostly` - confirm against the installed great_expectations version.
            "mostly": 1.0,
        },
    )
)
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "temperature_2m_mean",
            "type_list": ["float64", "Float64", "Int64", "Null"],
        },
    )
)

# Precipitation column - no upper bound; the lower bound is -0.1 rather than 0
# (presumably a small tolerance around zero - TODO confirm).
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "precipitation_sum",
            "min_value": -0.1,
            "max_value": None,
            "mostly": 1.0,          # no out-of-range values tolerated
        },
    )
)
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "precipitation_sum",
            "type_list": ["float64", "Float64", "Int64", "Null"],
        },
    )
)

# Wind column - should be >= 0, no upper bound
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "wind_speed_10m_max",
            "min_value": 0,
            "max_value": None,
            "mostly": 1.0,          # no out-of-range values tolerated
        },
    )
)
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "wind_speed_10m_max",
            "type_list": ["float64", "Float64", "Int64", "Null"],
        },
    )
)

# The validated table must contain at least one row. The configuration has to be
# passed to add_expectation; constructing it on its own has no effect on the suite.
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_be_between",
        kwargs={"min_value": 1, "max_value": None}
    )
)

hopsworks_admin.save_or_replace_expectation_suite(weather_fg, weather_expectation_suite)

Deleted existing expectation suite for FG 'air_quality'.
Attached expectation suite to Feature Group, edit it at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1952082
Saved expectation suite for FG 'air_quality'.
Deleted existing expectation suite for FG 'weather'.
Attached expectation suite to Feature Group, edit it at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1945998
Saved expectation suite for FG 'weather'.


## 1.7. Create Complete Feature View

In [None]:
def create_feature_view(
    fs,
    air_quality_fg,
    weather_fg,
    name="air_quality_complete_fv",
    version=1,
    labels=("pm25",),
    join_keys=("sensor_id", "date"),
):
    """Get or create a feature view joining air-quality and weather features.

    All air-quality features are selected; from the weather feature group only
    the non-key columns are selected, so the join keys are not duplicated in
    the resulting query.

    Args:
        fs: Feature store handle exposing ``get_or_create_feature_view``.
        air_quality_fg: Feature group providing the left side of the join
            (``select_all()`` is called on it).
        weather_fg: Feature group providing the weather columns; its
            ``features`` are inspected by ``name``.
        name: Name of the feature view.
        version: Version of the feature view.
        labels: Label column name(s); a single string is accepted.
        join_keys: Column name(s) used to join the two feature groups;
            a single string is accepted.

    Returns:
        Whatever ``fs.get_or_create_feature_view`` returns.
    """
    # Accept a bare string as a one-element list; list("pm25") would split it into characters.
    join_keys = [join_keys] if isinstance(join_keys, str) else list(join_keys)
    labels = [labels] if isinstance(labels, str) else list(labels)

    # Select weather columns other than the join keys.
    weather_features = [f.name for f in weather_fg.features if f.name not in join_keys]

    query = air_quality_fg.select_all().join(
        weather_fg.select(weather_features), on=join_keys
    )

    return fs.get_or_create_feature_view(
        name=name,
        version=version,
        query=query,
        labels=labels,
    )

# Get or create the feature view built from the joined air-quality and weather feature groups.
air_quality_fv = create_feature_view(fs, air_quality_fg, weather_fg)

In [None]:
# def create_feature_view(fs, air_quality_fg, weather_fg):
#     query = (
#         air_quality_fg.select_all()
#         .join(weather_fg.select_all(), on=["sensor_id", "date"])
#     )

#     fv = fs.get_or_create_feature_view(
#         name="air_quality_complete_fv",
#         version=1,
#         query=query,
#         labels=["pm25"]
#     )

#     return fv

# air_quality_fv = create_feature_view(fs, air_quality_fg, weather_fg)

## 1.8. Trigger Offline Feature Store Materialization
After backfilling data, materialize the feature groups to populate the offline feature store for training.

In [None]:
# Announce the step, then launch each feature group's offline materialization job.
print("üîÑ Starting materialization jobs for feature groups...")

# Air quality: start the job without waiting for it; report any failure and carry on.
try:
    air_quality_job = air_quality_fg.materialization_job
    air_quality_job.run(await_termination=False)
except Exception as err:
    print(f"‚ö†Ô∏è Air quality materialization: {err}")
else:
    print("‚úÖ Air quality feature group materialization started")

# Weather: same procedure, handled independently of the air-quality outcome.
try:
    weather_job = weather_fg.materialization_job
    weather_job.run(await_termination=False)
except Exception as err:
    print(f"‚ö†Ô∏è Weather materialization: {err}")
else:
    print("‚úÖ Weather feature group materialization started")

print("\nüìù Note: Materialization jobs run asynchronously. Check Hopsworks UI for status.")

üîÑ Starting materialization jobs for feature groups...
Use fg.materialization_job.run(args=None) to trigger the materialization job again.

‚úÖ Air quality feature group materialization started
Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_1_offline_fg_materialization/executions
‚úÖ Weather feature group materialization started

üìù Note: Materialization jobs run asynchronously. Check Hopsworks UI for status.


## 1.9. Data Exploration

In [None]:
# air_quality_df = air_quality_fg.read()
# weather_df = weather_fg.read()

# # Extract unique sensor metadata from air quality feature group
# metadata_df = air_quality_df[["sensor_id", "city", "street", "country", "latitude", "longitude"]].drop_duplicates(subset=["sensor_id"])
# print(f"üìç Extracted metadata for {len(metadata_df)} unique sensors")

In [None]:
# print("üîç AIR QUALITY DATA EXPLORATION")
# print("="*40)

# print(f"Shape: {air_quality_df.shape}")
# print(f"Date range: {air_quality_df['date'].min().date()} to {air_quality_df['date'].max().date()}")
# print(f"Number of unique sensors: {air_quality_df['sensor_id'].nunique()}")
# print(f"Countries: {metadata_df['country'].unique()}")
# print(f"Cities: {metadata_df['city'].nunique()} unique cities")

# print("\nüìä PM2.5 Statistics:")
# print(air_quality_df['pm25'].describe())
# print(f"Missing values: {air_quality_df['pm25'].isna().sum()}")

# print("\nüìà Engineered Features Statistics:")
# for col in ['pm25_rolling_3d', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_nearby_avg']:
#     if col in air_quality_df.columns:
#         missing = air_quality_df[col].isna().sum()
#         print(f"{col}: {missing} missing values ({missing/len(air_quality_df)*100:.1f}%)")


In [None]:
# print("üå§Ô∏è WEATHER DATA EXPLORATION") 
# print("="*40)

# print(f"Shape: {weather_df.shape}")
# print(f"Date range: {weather_df['date'].min().date()} to {weather_df['date'].max().date()}")
# print(f"Number of unique sensors: {metadata_df['sensor_id'].nunique()}")

# print("\nüå°Ô∏è Weather Statistics:")
# for col in ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']:
#     if col in weather_df.columns:
#         print(f"{col}:")
#         print(f"  Range: {weather_df[col].min():.2f} to {weather_df[col].max():.2f}, Mean: {weather_df[col].mean():.2f}, Missing: {weather_df[col].isna().sum()}")

# print("\nüìç Geographic Coverage:")
# print(f"Latitude range: {metadata_df['latitude'].min():.3f} to {metadata_df['latitude'].max():.3f}, Longitude range: {metadata_df['longitude'].min():.3f} to {metadata_df['longitude'].max():.3f}")

In [None]:
# print("üîó DATA QUALITY & RELATIONSHIPS")
# print("="*40)

# # Overall data completeness
# sensor_day_counts = air_quality_df.groupby('sensor_id')['date'].count()
# total_records = len(air_quality_df)
# data_completeness = (1 - air_quality_df['pm25'].isna().sum() / total_records) * 100

# print(f"üìä Overall Data Quality:")
# print(f"Total records: {total_records:,}")
# print(f"Data completeness: {data_completeness:.1f}%")
# print(f"Days per sensor - Min: {sensor_day_counts.min()}, Median: {sensor_day_counts.median():.0f}, Max: {sensor_day_counts.max()}")
# print(f"Sensors with <30 days: {(sensor_day_counts < 30).sum()}, >365 days: {(sensor_day_counts > 365).sum()}")

# # Extreme values summary
# extreme_count = (air_quality_df['pm25'] > 100).sum()
# very_high_count = (air_quality_df['pm25'] > 50).sum()
# print(f"\n‚ö†Ô∏è Air Quality Levels:")
# print(f"Extreme readings (>100 Œºg/m¬≥): {extreme_count} ({extreme_count/total_records*100:.1f}%)")
# print(f"Very high readings (>50 Œºg/m¬≥): {very_high_count} ({very_high_count/total_records*100:.1f}%)")

# # Seasonal patterns
# if len(air_quality_df) > 0:
#     # Create temporary month column without modifying original DataFrame
#     temp_months = pd.to_datetime(air_quality_df['date']).dt.month
#     monthly_pm25 = air_quality_df.groupby(temp_months)['pm25'].mean()
#     print(f"\nüóìÔ∏è Seasonal Patterns (PM2.5 Œºg/m¬≥):")
#     seasons = {(12,1,2): "Winter", (3,4,5): "Spring", (6,7,8): "Summer", (9,10,11): "Autumn"}
#     for months, season in seasons.items():
#         season_avg = monthly_pm25[monthly_pm25.index.isin(months)].mean()
#         print(f"  {season}: {season_avg:.1f}")