# 1. Backfill

## 1.1. Setup

In [None]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Return the closest directory at or above ``start`` that holds a
    ``pyproject.toml``.

    ``start`` itself is checked first, then each ancestor from nearest to
    farthest. When no candidate contains the marker file, ``start`` is
    returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the project root starting from the current working directory.
root_dir = find_project_root(Path.cwd())
print("Project root dir:", root_dir)

# Put the project root on the import path (once) so project-local packages
# can be imported from wherever the notebook was started.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata

# Load settings from the .env file located in the project root.
# NOTE(review): the fields are unwrapped with get_secret_value(), so they are
# presumably secret-string settings — confirm in utils.config.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks with the API key and grab the project's feature store
# handle (`fs`), which the later cells use.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-07 11:25:21,385 INFO: Initializing external client
2026-01-07 11:25:21,385 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-07 11:25:23,201 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


Repository management

In [2]:
def clone_or_update_repo(username: str) -> Path:
    """Make sure a checkout of the project repository is available.

    Resolution order:
      1. If the current directory or one of its ancestors is the repository
         (directory named like the repo and containing ``.git``), return it.
      2. If a directory with the repo name exists in the current directory,
         run ``git pull`` in it (best effort) and return its relative path.
      3. Otherwise clone ``https://github.com/<username>/<repo>.git`` into
         the current directory and return the relative path of the clone.

    Args:
        username: GitHub account that owns the repository.

    Returns:
        Path to the repository (absolute in case 1, relative otherwise).

    Raises:
        RuntimeError: if ``git clone`` fails or git cannot be started.
    """
    # Local import so the function does not depend on extra notebook-level imports.
    import subprocess

    repo_name = "pm25-forecast-openmeteo-aqicn"

    def _git(*args: str) -> int:
        """Run git with an argument list (no shell) and return its exit code."""
        try:
            return subprocess.run(["git", *args], check=False).returncode
        except OSError as exc:
            # e.g. git is not installed; report it as a failing exit status
            print(f"Could not execute git: {exc}")
            return -1

    # 1. Detect if already inside the repo
    cwd = Path().absolute()
    for parent in [cwd] + list(cwd.parents):
        if (parent / ".git").exists() and parent.name == repo_name:
            print(f"Already in repo at {parent}")
            return parent

    # 2. Detect if the repo exists in the current directory
    repo_dir = Path(repo_name)
    if repo_dir.exists():
        print(f"Repository exists at {repo_dir.absolute()}")
        pull_status = _git("-C", str(repo_dir), "pull")
        if pull_status != 0:
            # Updating is best effort: keep working with the existing checkout,
            # but make the failure visible instead of ignoring it.
            print(f"Warning: git pull failed (exit code {pull_status}); using existing checkout.")
        return repo_dir

    # 3. Otherwise clone it. The URL is passed as a single argument, so the
    #    username is never interpreted by a shell.
    print("Cloning repository...")
    url = f"https://github.com/{username}/{repo_name}.git"
    if _git("clone", url) != 0:
        raise RuntimeError("Git clone failed.")

    print("Clone successful.")
    return repo_dir

# Locate (or clone/update) the repository checkout and make it the working
# directory for the following cells.
repo_dir = clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)


Already in repo at c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn


In [3]:
today = date.today()

# Fail with a regular exception: sys.exit() inside a notebook kernel only
# surfaces as a SystemExit warning, which hides the actual reason.
if settings.AQICN_API_KEY is None:
    raise RuntimeError("AQICN_API_KEY missing.")

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Replace any previously stored secret so Hopsworks holds the current key.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as e:
    # Best effort: the lookup/delete may fail (e.g. when no secret exists yet).
    # Report it rather than hiding it; create_secret below still runs.
    print(f"Could not remove existing AQICN_API_KEY secret: {e}")

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

## 1.2. Create Feature Groups

In [4]:
# Obtain the three feature group handles used by the rest of the notebook.
# NOTE(review): presumably creates the groups when they do not exist yet —
# confirm in utils.hopsworks_admin.
air_quality_fg, sensor_metadata_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 1.3. Check and Backfill
This step only runs on the first execution, i.e. when the air quality feature group is still empty.

For 100 sensors and 5 years of history, the backfill takes approximately 150 minutes (2.5 hours).

In [None]:
# Decide whether a backfill is needed: it runs only when the air quality
# feature group is empty or cannot be read.
try:
    aq_data = air_quality_fg.read()
    is_first_run = len(aq_data) == 0
except Exception as e:
    # A failed read is treated as "nothing stored yet", but the reason is
    # reported so an unrelated failure is not silently taken for a first run.
    print(f"Could not read air quality feature group ({e}); treating this as a first run.")
    is_first_run = True

# Length of the historical weather window requested for each sensor.
BACKFILL_WINDOW = timedelta(days=365 * 3)

# Process and insert data if first run
if is_first_run:
    all_aq_dfs = []
    all_weather_dfs = []
    locations = {}

    # Process CSV files in data directory (sorted for a reproducible order)
    data_dir = os.path.join(root_dir, "data")
    dir_list = sorted(os.listdir(data_dir))
    # NOTE(review): coordinates are looked up in the sensor metadata feature
    # group, which this cell also writes at the end — confirm it is already
    # populated before a first run, otherwise the .loc lookup below raises KeyError.
    metadata_df = sensor_metadata_fg.read().set_index("sensor_id")
    for file in dir_list:
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(data_dir, file)
        aq_df_raw, street, city, country, feed_url, sensor_id = metadata.read_sensor_data(file_path, AQICN_API_KEY)

        # Clean and process
        aq_df = cleaning.clean_and_append_data(aq_df_raw, street, city, country, feed_url, sensor_id)
        aq_df["date"] = aq_df["date"].dt.tz_localize(None)

        # The weather window ends on the sensor's last reading. end_date has
        # to be computed first because start_date is derived from it.
        end_date = aq_df["date"].max().date()
        start_date = end_date - BACKFILL_WINDOW

        # Coordinates and city come from the stored sensor metadata.
        meta = metadata_df.loc[sensor_id]
        latitude = meta["latitude"]
        longitude = meta["longitude"]
        city = meta["city"]

        weather_df = fetchers.get_historical_weather(city, start_date, end_date, latitude, longitude)

        if weather_df is None or len(weather_df) == 0:
            print(f"⚠️ No historical weather for sensor {sensor_id}, skipping.")
            continue

        weather_df["date"] = weather_df["date"].dt.tz_localize(None)

        all_aq_dfs.append(aq_df)
        all_weather_dfs.append(weather_df)
        locations[sensor_id] = {
            "country": country,
            "city": city,
            "street": street,
            "aqicn_url": feed_url,
            "latitude": latitude,
            "longitude": longitude,
        }

    if all_aq_dfs:
        # Combine and engineer features
        aq_df_all = pd.concat(all_aq_dfs, ignore_index=True)
        weather_df_all = pd.concat(all_weather_dfs, ignore_index=True)

        aq_df_all = feature_engineering.add_rolling_window_feature(aq_df_all, window_days=3, column="pm25", new_column="pm25_rolling_3d")
        aq_df_all = feature_engineering.add_lagged_features(aq_df_all, column="pm25", lags=[1, 2, 3])
        aq_df_all = feature_engineering.add_nearby_sensor_feature(aq_df_all, locations, column="pm25_lag_1d", n_closest=3)

        air_quality_fg.insert(aq_df_all)
        weather_fg.insert(weather_df_all)

        # Insert sensor metadata (one record per processed sensor)
        metadata_records = [
            {
                "sensor_id": sensor_id,
                "country": loc["country"],
                "city": loc["city"],
                "street": loc["street"],
                "aqicn_url": loc["aqicn_url"],
                "latitude": loc["latitude"],
                "longitude": loc["longitude"],
            }
            for sensor_id, loc in locations.items()
        ]
        sensor_metadata_fg.insert(pd.DataFrame(metadata_records))

        print(f"✅ Inserted {len(aq_df_all)} air quality records")
        print(f"✅ Inserted {len(weather_df_all)} weather records")
        print(f"✅ Inserted {len(metadata_records)} sensor metadata records")
    else:
        print("⚠️ No CSV files processed")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.74s) 


OpenMeteoRequestsError: failed to request 'https://archive-api.open-meteo.com/v1/archive': {'error': True, 'reason': 'Daily API request limit exceeded. Please try again tomorrow.'}

## 1.4. Update Descriptions

In [None]:
# Apply each feature group's dedicated description updater, in the same order
# as before: air quality, sensor metadata, weather.
for update_description, feature_group in (
    (hopsworks_admin.update_air_quality_description, air_quality_fg),
    (hopsworks_admin.update_sensor_metadata_description, sensor_metadata_fg),
    (hopsworks_admin.update_weather_description, weather_fg),
):
    update_description(feature_group)

## 1.5. Validation Setup
Creates Great Expectations validation suites for air quality and weather data with column value constraints.

In [None]:
# Validation suite for the air quality feature group.
aq_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

# Column rules, registered in this order:
#   1. the smallest pm25 value must be strictly greater than -0.1 (i.e. >= 0)
#   2. every `date` value must be parseable by dateutil
#   3. (sensor_id, date) pairs must be unique (primary key)
for expectation_type, kwargs in [
    (
        "expect_column_min_to_be_between",
        {
            "column": "pm25",
            "min_value": -0.1,
            "max_value": None,
            "strict_min": True,
        },
    ),
    ("expect_column_values_to_be_dateutil_parseable", {"column": "date"}),
    ("expect_compound_columns_to_be_unique", {"column_list": ["sensor_id", "date"]}),
]:
    aq_expectation_suite.add_expectation(
        gx.core.ExpectationConfiguration(
            expectation_type=expectation_type,
            kwargs=kwargs,
        )
    )

# Rolling and lag features must hold numeric values (float or int).
for col in ("pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d"):
    numeric_type_check = gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={"column": col, "type_list": ["float", "int"]},
    )
    aq_expectation_suite.add_expectation(numeric_type_check)

hopsworks_admin.save_or_replace_expectation_suite(air_quality_fg, aq_expectation_suite)


# Validation suite for the weather feature group.
weather_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

# Every `date` value must be parseable by dateutil.
weather_expectation_suite.add_expectation(
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_dateutil_parseable",
        kwargs={"column": "date"},
    )
)

# Allowed value ranges: temperature within a physical range, and valid
# latitude/longitude. Registered in the order listed.
for bounded_column, (lower, upper) in {
    "temperature_2m_mean": (-80, 60),
    "latitude": (-90, 90),
    "longitude": (-180, 180),
}.items():
    weather_expectation_suite.add_expectation(
        gx.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={"column": bounded_column, "min_value": lower, "max_value": upper},
        )
    )

# Precipitation and wind speed: the column minimum must be strictly greater
# than -0.1, i.e. values should be >= 0 (but allow nulls).
for col in ("precipitation_sum", "wind_speed_10m_max"):
    non_negative_check = gx.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": col,
            "min_value": -0.1,
            "max_value": None,
            "strict_min": True,
        },
    )
    weather_expectation_suite.add_expectation(non_negative_check)

hopsworks_admin.save_or_replace_expectation_suite(weather_fg, weather_expectation_suite)

## 1.6. Create Feature View

In [None]:
def create_feature_view(fs, air_quality_fg, weather_fg):
    """Get or create version 1 of the ``air_quality_complete_fv`` feature view.

    The view's query selects all features of ``air_quality_fg`` joined with
    all features of ``weather_fg`` on ``sensor_id`` and ``date``; ``pm25`` is
    declared as the label.

    Args:
        fs: feature store handle used to get or create the view.
        air_quality_fg: air quality feature group (left side of the join).
        weather_fg: weather feature group (right side of the join).

    Returns:
        The object returned by ``fs.get_or_create_feature_view``.
    """
    aq_features = air_quality_fg.select_all()
    weather_features = weather_fg.select_all()
    joined_query = aq_features.join(weather_features, on=["sensor_id", "date"])

    return fs.get_or_create_feature_view(
        name="air_quality_complete_fv",
        version=1,
        query=joined_query,
        labels=["pm25"],
    )


# Get or create the feature view that joins air quality and weather features.
air_quality_fv = create_feature_view(fs, air_quality_fg, weather_fg)

## 1.7. Load Historical Data

In [None]:
# Read the sensor metadata; when rows are present, index them by sensor_id.
# A failed read is reported and leaves an empty frame in `metadata_df`.
try:
    metadata_df = sensor_metadata_fg.read()
    if len(metadata_df) > 0:
        metadata_df = metadata_df.set_index("sensor_id")
        print(f"üìç Loaded metadata for {len(metadata_df)} sensors")
    else:
        print("‚ö†Ô∏è No sensor metadata found. Run first-time CSV processing first.")
except Exception as e:
    print(f"‚ùå Error reading sensor metadata: {e}")
    metadata_df = pd.DataFrame()

# Batch data served by the feature view; later cells read it as `historical_df`.
historical_df = air_quality_fv.get_batch_data()

## 1.8. Incremental Updates

Detect latest timestamp per sensor

In [None]:
# Map each sensor_id to the most recent date present in the historical data.
dates_by_sensor = historical_df.groupby("sensor_id")["date"]
latest_per_sensor = dates_by_sensor.max().to_dict()

# Hand the feature groups and the per-sensor latest dates to the incremental
# update helper.
incremental.run_incremental_update(
    sensor_metadata_fg, air_quality_fg, weather_fg, latest_per_sensor
)

## 1.9. Exploration

In [None]:
# `aq_df_all` only exists when the backfill cell took its first-run branch.
# On later runs (or after a kernel restart) fall back to the stored feature
# group so this cell does not stop with a NameError.
if "aq_df_all" not in globals():
    aq_df_all = air_quality_fg.read()

# Normalise to datetimes so .date() works regardless of how the column was loaded.
aq_dates = pd.to_datetime(aq_df_all["date"])

print("🔍 AIR QUALITY DATA EXPLORATION")
print("="*40)
print(f"Shape: {aq_df_all.shape}")
print(f"Date range: {aq_dates.min().date()} to {aq_dates.max().date()}")
print(f"Number of unique sensors: {aq_df_all['sensor_id'].nunique()}")
print(f"Countries: {aq_df_all['country'].unique()}")
print(f"Cities: {aq_df_all['city'].nunique()} unique cities")

print("\n📊 PM2.5 Statistics:")
print(aq_df_all['pm25'].describe())
print(f"Missing values: {aq_df_all['pm25'].isna().sum()}")

# Share of missing values per engineered feature (only for columns present).
print("\n📈 Engineered Features Statistics:")
for col in ['pm25_rolling_3d', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_nearby_avg']:
    if col in aq_df_all.columns:
        missing = aq_df_all[col].isna().sum()
        print(f"{col}: {missing} missing values ({missing/len(aq_df_all)*100:.1f}%)")

In [None]:
# `weather_df_all` only exists when the backfill cell took its first-run
# branch. On later runs (or after a kernel restart) fall back to the stored
# feature group so this cell does not stop with a NameError.
if "weather_df_all" not in globals():
    weather_df_all = weather_fg.read()

# Normalise to datetimes so .date() works regardless of how the column was loaded.
weather_dates = pd.to_datetime(weather_df_all["date"])

print("🌤️ WEATHER DATA EXPLORATION")
print("="*40)
print(f"Shape: {weather_df_all.shape}")
print(f"Date range: {weather_dates.min().date()} to {weather_dates.max().date()}")
print(f"Number of unique sensors: {weather_df_all['sensor_id'].nunique()}")

# Range, mean and missing count per weather variable (only for columns present).
print("\n🌡️ Weather Statistics:")
for col in ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']:
    if col in weather_df_all.columns:
        print(f"{col}:")
        print(f"  Range: {weather_df_all[col].min():.2f} to {weather_df_all[col].max():.2f}, Mean: {weather_df_all[col].mean():.2f}, Missing: {weather_df_all[col].isna().sum()}")

print("\n📍 Geographic Coverage:")
print(f"Latitude range: {weather_df_all['latitude'].min():.3f} to {weather_df_all['latitude'].max():.3f}, Longitude range: {weather_df_all['longitude'].min():.3f} to {weather_df_all['longitude'].max():.3f}")

In [None]:
# `aq_df_all` only exists when the backfill cell took its first-run branch.
# On later runs (or after a kernel restart) fall back to the stored feature
# group so this cell does not stop with a NameError.
if "aq_df_all" not in globals():
    aq_df_all = air_quality_fg.read()

print("🔗 DATA QUALITY & RELATIONSHIPS")
print("="*40)

# Overall data completeness
sensor_day_counts = aq_df_all.groupby('sensor_id')['date'].count()
total_records = len(aq_df_all)
data_completeness = (1 - aq_df_all['pm25'].isna().sum() / total_records) * 100

print("📊 Overall Data Quality:")
print(f"Total records: {total_records:,}")
print(f"Data completeness: {data_completeness:.1f}%")
print(f"Days per sensor - Min: {sensor_day_counts.min()}, Median: {sensor_day_counts.median():.0f}, Max: {sensor_day_counts.max()}")
print(f"Sensors with <30 days: {(sensor_day_counts < 30).sum()}, >365 days: {(sensor_day_counts > 365).sum()}")

# Extreme values summary (thresholds of 100 and 50 on the pm25 column)
extreme_count = (aq_df_all['pm25'] > 100).sum()
very_high_count = (aq_df_all['pm25'] > 50).sum()
print("\n⚠️ Air Quality Levels:")
print(f"Extreme readings (>100 μg/m³): {extreme_count} ({extreme_count/total_records*100:.1f}%)")
print(f"Very high readings (>50 μg/m³): {very_high_count} ({very_high_count/total_records*100:.1f}%)")

# Seasonal patterns
if len(aq_df_all) > 0:
    # Create temporary month column without modifying original DataFrame
    temp_months = pd.to_datetime(aq_df_all['date']).dt.month
    monthly_pm25 = aq_df_all.groupby(temp_months)['pm25'].mean()
    print("\n🗓️ Seasonal Patterns (PM2.5 μg/m³):")
    # Season value = unweighted mean of the monthly means that fall in it.
    seasons = {(12,1,2): "Winter", (3,4,5): "Spring", (6,7,8): "Summer", (9,10,11): "Autumn"}
    for months, season in seasons.items():
        season_avg = monthly_pm25[monthly_pm25.index.isin(months)].mean()
        print(f"  {season}: {season_avg:.1f}")