In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip subdirectories from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('src',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('airquality',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from src import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/filipsjostrand/School/ht_25/ID2223/pm25-forecast-openmeteo-aqicn
Added the following directory to the PYTHONPATH: /Users/filipsjostrand/School/ht_25/ID2223/pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


In [2]:
import datetime
import time
import requests
import pandas as pd
import great_expectations as ge
import hopsworks
from src.airquality import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

# Setup


In [8]:
today = datetime.date.today()
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 

# taken from ~/.env. You can also replace settings.AQICN_API_KEY with the api key value as a string "...."
if settings.AQICN_API_KEY is None:
    print("You need to set AQICN_API_KEY either in this cell or in ~/.env")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

print(f"Found AQICN_API_KEY: {AQICN_API_KEY}")

secrets = hopsworks.get_secrets_api()
# Replace any existing secret with the new value
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
        print("Replacing existing AQICN_API_KEY")
except hopsworks.RestAPIError as e:
    # Only log the RestAPIError if it is for "not found", otherwise re-raise
    if hasattr(e, 'error_code') and getattr(e, 'error_code', None) == 160048:
        # Secret does not exist; proceed to create it.
        pass
    elif "Could not find Secret" in str(e):
        # Fallback check if error does not attach error_code attribute
        pass
    else:
        raise

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)



aq_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

aq_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"pm25",
            "min_value":-0.1,
            "max_value":500.0,
            "strict_min":True
        }
    )
)


weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_greater_than_zero(col):
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column":col,
                "min_value":-0.1,
                "max_value":1000.0,
                "strict_min":True
            }
        )
    )
expect_greater_than_zero("precipitation_sum")
expect_greater_than_zero("wind_speed_10m_max")

2025-11-10 17:20:30,795 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-11-10 17:20:30,800 INFO: Initializing external client
2025-11-10 17:20:30,800 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-10 17:20:32,176 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279172
Found AQICN_API_KEY: 678c20804fe29fabcba94415ab0292823dedb14a
Replacing existing AQICN_API_KEY
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


# Backfill Data for the Gothenburg Area

## Methods

## preprocessing

In [13]:
def read_sensor_data(file_path):
    """
    Reads the sensor data from the CSV file. The first three rows contains metadata.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        street, city, country = [s.strip() for s in f.readline().strip().lstrip("# Sensor ").split('(')[0].strip().split(",")]
        url_line = f.readline().strip().lstrip("# ").strip()
        sensor_id = url_line.split('@')[1].split('/')[0]
        _ = f.readline().strip()
    df = pd.read_csv(file_path, skiprows=3)
    feed_url = f"https://api.waqi.info/feed/A{sensor_id}/"
    return df, street, city, country, feed_url

def clean_and_append_data(df, street, city, country, feed_url):
    """
    Remove any unused columns, set the daily median value to pm25. Remove NaN's and append the metadata.
    """
    clean_df = pd.DataFrame()
    clean_df['date'] = pd.to_datetime(df['date'])
    clean_df['pm25'] = df['median']
    clean_df = clean_df.dropna(subset=['pm25'])
    clean_df['street'] = street
    clean_df['city'] = city
    clean_df['country'] = country
    clean_df['feed_url'] = feed_url
    return clean_df


def get_historical_weather(city, df, today, feed_url):
    
    earliest_aq_date = pd.Series.min(df['date'])
    earliest_aq_date = earliest_aq_date.strftime('%Y-%m-%d')
    response = requests.get(f"{feed_url}/?token={AQICN_API_KEY}")
    data = response.json()
    latitude = data['data']['city']['geo'][0]
    longitude = data['data']['city']['geo'][1]
    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        try:
            return util.get_historical_weather(city, earliest_aq_date, str(today), latitude, longitude)
        except Exception as e:
            # Catch OpenMeteoRequestsError specifically if it's available, fallback to any exception
            if hasattr(e, "args") and any("Minutely API request limit exceeded" in str(a) for a in e.args):
                wait_time = 70  # wait for just over one minute
                print(f"OpenMeteo API limit exceeded, retrying in {wait_time} seconds... (Attempt {attempt + 1} of {max_retries})")
                time.sleep(wait_time)
                attempt += 1
            elif "Minutely API request limit exceeded" in str(e):
                wait_time = 70  # wait for just over one minute
                print(f"OpenMeteo API limit exceeded, retrying in {wait_time} seconds... (Attempt {attempt + 1} of {max_retries})")
                time.sleep(wait_time)
                attempt += 1
            else:
                raise
    raise RuntimeError("Failed to obtain historical weather after multiple retries due to API rate limits.")

## Hopsworks

In [14]:
def create_air_quality_feature_group(name):
    air_quality_fg = fs.get_or_create_feature_group(
    name=f'air_quality_{name}',
    description=f'Air Quality characteristics of each day for {name}',
    version=1,
    primary_key=['country','city', 'street'],
    event_time="date",
    expectation_suite=aq_expectation_suite
    )
    return air_quality_fg
    
def update_air_quality_description(air_quality_fg):
    air_quality_fg.update_feature_description("date", "Date of measurement of air quality")
    air_quality_fg.update_feature_description("country", "Country where the air quality was measured (sometimes a city in acqcn.org)")
    air_quality_fg.update_feature_description("city", "City where the air quality was measured")
    air_quality_fg.update_feature_description("street", "Street in the city where the air quality was measured")
    air_quality_fg.update_feature_description("pm25", "Particles less than 2.5 micrometers in diameter (fine particles) pose health risk")

def create_and_insert_air_quality_data(df, name):
    air_quality_fg = create_air_quality_feature_group(name)
    air_quality_fg.insert(df)
    update_air_quality_description(air_quality_fg)
    

def create_weather_feature_group(name):
    # Get or create feature group 
    weather_fg = fs.get_or_create_feature_group(
    name=f'weather_{name}',
    description=f'Weather characteristics of each day for {name}',
    version=1,
    primary_key=['city'],
    event_time="date",
    expectation_suite=weather_expectation_suite
) 
    return weather_fg

def update_weather_description(weather_fg):
    weather_fg.update_feature_description("date", "Date of measurement of weather")
    weather_fg.update_feature_description("city", "City where weather is measured/forecast for")
    weather_fg.update_feature_description("temperature_2m_mean", "Temperature in Celsius")
    weather_fg.update_feature_description("precipitation_sum", "Precipitation (rain/snow) in mm")
    weather_fg.update_feature_description("wind_speed_10m_max", "Wind speed at 10m abouve ground")
    weather_fg.update_feature_description("wind_direction_10m_dominant", "Dominant Wind direction over the dayd")

def create_and_insert_weather_data(df, name):
    weather_fg = create_weather_feature_group(name)
    update_weather_description(weather_fg)
    weather_fg.insert(df)

# script

In [5]:
data_dir = os.path.join(root_dir, "data")
dir_list = os.listdir(data_dir)
dfs ={}
for file in dir_list:
    file_path = os.path.join(data_dir, file)
    aq_df, street, city, country, feed_url = read_sensor_data(file_path)
    aq_df = clean_and_append_data(aq_df, street, city, country, feed_url)
    weather_df = get_historical_weather(city, aq_df, today, feed_url)
    dfs[file] = {"aq": aq_df, "weather": weather_df}

Coordinates 57.680137634277344°N 12.025862693786621°E
Elevation 20.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.680137634277344°N 12.025862693786621°E
Elevation 12.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.680137634277344°N 12.025862693786621°E
Elevation 76.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75043869018555°N 12.051836013793945°E
Elevation 51.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.680137634277344°N 12.025862693786621°E
Elevation 21.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.609840393066406°N 12.193548202514648°E
Elevation 55.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75043869018555°N 11.857451438903809°E
Elevation 7.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.680137634277344°N 12.025862693786621°E
Elevation 24.0 m asl
Timezone None None
Timezone difference 

In [None]:
locations = {}
for file, record in dfs.items():
    aq = record.get("aq", {})
    country = record.get("country", None)
    city = record.get("city", None)
    street = record.get("street", None)
    aqicn_url = record.get("feed_url", None)
    latitude = record.get("latitude", None)
    longitude = record.get("longitude", None)

    dict_obj = {
        "country": country,
        "city": city,
        "street": street,
        "aqicn_url": aqicn_url,
        "latitude": latitude,
        "longitude": longitude
    }
    locations[file] = dict_obj

# Convert the dictionary to a JSON string
str_dict = json.dumps(locations)

try:
    secret = secrets.get_secret("SENSOR_LOCATIONS_JSON")
    if secret is not None:
        secret.delete()
        print("Replacing existing SENSOR_LOCATIONS_JSON")
except hopsworks.RestAPIError as e:
    # Only log the RestAPIError if it is for "not found", otherwise re-raise
    if hasattr(e, 'error_code') and getattr(e, 'error_code', None) == 160048:
        # Secret does not exist; proceed to create it.
        pass
    elif "Could not find Secret" in str(e):
        # Fallback check if error does not attach error_code attribute
        pass
    else:
        raise

secrets.create_secret("SENSOR_LOCATIONS_JSON", str_dict)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('SENSOR_LOCATIONS_JSON', 'PRIVATE')

In [None]:
for file, record in dfs.items():
    name = file.split("/")[-1].split(".")[0]
    create_and_insert_air_quality_data(record["aq"], name)
    create_and_insert_weather_data(record["weather"], name)

2025-11-10 17:36:38,886 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279172/fs/1265787/fg/1668642


Uploading Dataframe: 100.00% |██████████| Rows 2077/2077 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_59893_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279172/jobs/named/air_quality_59893_1_offline_fg_materialization/executions


KeyError: "'FeatureGroup' object has no feature called 'date'."