In [1]:
# Standard imports
import os
import sys
import json
import time
from datetime import datetime, timezone, date
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Return the nearest directory at or above ``start`` holding a ``pyproject.toml``.

    The search begins at ``start`` itself and then walks up through its
    parents. If no directory on that path contains ``pyproject.toml``,
    ``start`` is returned unchanged.
    """
    search_path = (start, *start.parents)
    return next(
        (folder for folder in search_path if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the project root from the current working directory and report it.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

# Make project-local packages importable, without adding a duplicate entry
# when this cell is re-run.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:
    sys.path.append(root_dir_str)

# Third-party imports
import requests
import pandas as pd
import hopsworks
import os
import time
from geopy.geocoders import Nominatim

#  Project imports
from utils import config, metadata

# Load project settings, pointing the settings object at the .env file in the
# project root.
# NOTE(review): HopsworksSettings lives in utils.config (not shown here); the
# fields read below are unwrapped with get_secret_value(), so they are
# presumably secret-wrapper values — confirm against utils.config.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks with the API key and keep a handle to the project's
# feature store for the cells below.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-07 16:13:53,847 INFO: Initializing external client
2026-01-07 16:13:53,856 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-07 16:13:55,626 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


In [2]:
today = date.today()

# The AQICN key is needed by the metadata builder further down; stop here with
# a clear message if the settings do not provide it.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Replace any previously stored secret so the value kept in Hopsworks matches
# the one loaded from the local settings.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Removing the old secret is best-effort (it may simply not exist yet), so
    # carry on — but report the reason instead of hiding it, which makes a
    # failing create_secret() below much easier to diagnose.
    print(
        "Could not remove existing AQICN_API_KEY secret "
        f"({type(exc).__name__}: {exc}); continuing."
    )

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

In [3]:
# Shared geopy Nominatim geocoding client used by the lookup helpers below;
# the user_agent string identifies this application to the geocoding service.
geolocator = Nominatim(user_agent="pm25-metadata-builder")

In [4]:
# Memo of geocoding results keyed by the exact query string. Failed lookups
# (None results) are stored too, so they are not retried.
cache = {}

def geocode_cached(query):
    """Geocode ``query`` with the shared ``geolocator``, memoising the result.

    The first call for a given query asks the geocoder and stores whatever it
    returns (including ``None``); later calls with the same query return the
    stored value without contacting the geocoder again.
    """
    if query not in cache:
        cache[query] = geolocator.geocode(query)
    return cache[query]

In [5]:
def clean_field(value):
    """Normalise a raw metadata field to a stripped string, or ``None`` if empty.

    ``None``, float NaN and the placeholder texts "none", "nan", "unknown" and
    the empty string (compared case-insensitively after stripping whitespace)
    all map to ``None``. Any other value is converted with ``str`` and returned
    with surrounding whitespace removed.
    """
    placeholders = ("none", "nan", "", "unknown")

    is_nan_float = isinstance(value, float) and pd.isna(value)
    if value is None or is_nan_float:
        return None

    text = str(value).strip()
    return None if text.lower() in placeholders else text

In [6]:
def get_coordinates(city, street, country):
    """Geocode a sensor location, trying the most specific query first.

    Queries are attempted in this order, stopping at the first that resolves:
    "street, city, country", "city, country", "street, country", "country".
    Parts that are missing (``None`` or empty) are left out of the query text,
    so a missing country no longer produces queries such as "Oslo, None".
    Queries that end up empty or duplicated are skipped.

    Args:
        city: City name, or ``None`` when unknown.
        street: Street name, or ``None`` when unknown.
        country: Country name, or ``None`` when unknown.

    Returns:
        ``(latitude, longitude)`` of the first successful lookup, or
        ``(None, None)`` when no query could be resolved.
    """
    parts_by_priority = (
        (street, city, country) if street and city else (),
        (city, country) if city else (),
        (street, country) if street else (),
        (country,),
    )

    candidates = []
    for parts in parts_by_priority:
        query = ", ".join(part for part in parts if part)
        if query and query not in candidates:
            candidates.append(query)

    for query in candidates:
        # Only an uncached query reaches the geocoding service, so only those
        # need the one-second pause between requests.
        already_cached = query in cache
        loc = geocode_cached(query)
        if not already_cached:
            time.sleep(1)
        if loc:
            return loc.latitude, loc.longitude

    return None, None

Create the sensor metadata DataFrame

In [7]:
def build_metadata_from_csvs(data_dir, aqicn_api_key):
    """Build one metadata row per sensor CSV found in ``data_dir``.

    For every ``*.csv`` file, the sensor details are read with
    ``metadata.read_sensor_data``, the location fields are normalised with
    ``clean_field`` and the location is geocoded with ``get_coordinates``.
    Sensors whose location cannot be geocoded are reported and skipped.

    Args:
        data_dir: Directory containing the sensor CSV files.
        aqicn_api_key: API key passed through to ``metadata.read_sensor_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sensor_id``, ``city``,
        ``street``, ``country``, ``aqicn_url``, ``latitude`` and ``longitude``.
        The columns are present even when no sensor produced a row, so
        callers can select them without a ``KeyError``.
    """
    columns = [
        "sensor_id",
        "city",
        "street",
        "country",
        "aqicn_url",
        "latitude",
        "longitude",
    ]
    rows = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and any identifiers derived from it) stable between runs.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(data_dir, file)

        # The first returned element (the raw readings) is not needed here.
        _, street, city, country, feed_url, sensor_id = metadata.read_sensor_data(
            file_path, aqicn_api_key
        )

        street = clean_field(street)
        city = clean_field(city)
        country = clean_field(country)

        lat, lon = get_coordinates(city, street, country)

        if lat is None or lon is None:
            print(f"[SKIP] Sensor {sensor_id}: cannot geocode location")
            continue

        rows.append({
            "sensor_id": sensor_id,
            "city": city,
            "street": street,
            "country": country,
            "aqicn_url": feed_url,
            "latitude": lat,
            "longitude": lon,
        })

    return pd.DataFrame(rows, columns=columns)

df_meta = build_metadata_from_csvs("../data/", AQICN_API_KEY)

# Sensors sharing the same (city, latitude, longitude) share a location_id.
# The key is the three values rendered as text and joined with underscores.
key_parts = [df_meta[col].astype(str) for col in ("city", "latitude", "longitude")]
location_key = key_parts[0] + "_" + key_parts[1] + "_" + key_parts[2]

# Number the distinct keys 1, 2, 3, ... in order of first appearance.
unique_keys = location_key.unique()
location_map = dict(zip(unique_keys, range(1, len(unique_keys) + 1)))

# Attach the IDs; the key itself is only a helper and never becomes a column.
df_meta = df_meta.assign(location_id=location_key.map(location_map))

In [8]:
# Get (or create, if it does not exist yet) version 1 of the "sensor_metadata"
# feature group, keyed by sensor_id and with online storage enabled.
fg_meta = fs.get_or_create_feature_group(
    name="sensor_metadata",
    version=1,
    primary_key=["sensor_id"],
    description="Curated metadata extracted from AQICN CSVs",
    online_enabled=True
)

# Write the sensor metadata rows into the feature group.
fg_meta.insert(df_meta)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1911213


Uploading Dataframe: 100.00% |██████████| Rows 103/103 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: sensor_metadata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/sensor_metadata_1_offline_fg_materialization/executions


(Job('sensor_metadata_1_offline_fg_materialization', 'SPARK'), None)

In [9]:
# Final status message.
# NOTE(review): this is printed unconditionally once the cell runs; it does
# not itself check the outcome of the insert above.
print("Sensor metadata feature group created/updated successfully.")

Sensor metadata feature group created/updated successfully.
