## 1.1. Environment Setup
Detect if running in Google Colab or local environment, handle repository cloning, dependency installation, numpy compatibility fixes, and set up Python path.

In [None]:
import sys
from pathlib import Path
import warnings
import os

warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    try:
        if "google.colab" in str(get_ipython()):
            return True
    except:
        pass
    return False

def clone_repository() -> None:
    # Check if repository already exists
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

def fix_numpy_compatibility():
    print("Fixing numpy compatibility for hopsworks/pandas...")
    try:
        # Try different approaches to fix numpy
        !pip install --force-reinstall --no-deps numpy==1.24.3
        !pip install --force-reinstall pandas==2.0.3
        print("Numpy and pandas fixed. Please restart runtime and run again.")
    except Exception as e:
        print(f"Fix attempt 1 failed: {e}")
        try:
            # Alternative approach
            !pip uninstall -y numpy pandas
            !pip install numpy==1.24.3 pandas==2.0.3
            print("Numpy and pandas reinstalled. Please restart runtime and run again.")
        except Exception as e2:
            print(f"Fix attempt 2 failed: {e2}")
            print("Please manually restart runtime and try again.")


if is_google_colab():
    try:
        import numpy
        numpy.array([1, 2, 3])
        import pandas as pd
        import great_expectations  # Test this import early
        print("Basic packages working correctly")
        
        # Continue with the rest...
        clone_repository()
        install_dependencies()
        import hopsworks
        print("All packages working correctly")
        
    except Exception as e:  # Catch any import error
        print(f"Package compatibility issue detected: {e}")
        fix_numpy_compatibility()
        raise SystemExit("Please restart runtime (Runtime > Restart runtime) and run the notebook again.")

else:
    root_dir = Path().absolute()
    if root_dir.parts[-1:] == ("src",):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ("airquality",):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ("notebooks",):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir)
    print("Local environment")

print(f"Root dir: {root_dir}")

if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

from utils import config

if is_google_colab():
    from google.colab import userdata
    import hopsworks
    project = hopsworks.login(
        api_key_value=userdata.get('HOPSWORKS_API_KEY'),
        engine="python"
    )
    AQICN_API_KEY = userdata.get('AQICN_API_KEY')
    
else:
    # Local development - use .env file
    settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


## 1.2. Imports

In [10]:
import datetime
import time
import requests
import pandas as pd
import great_expectations as ge
import hopsworks
from utils import airquality
import json
import warnings
warnings.filterwarnings("ignore")

## 1.3. Setup
Configure Hopsworks connection, feature store access, and AQICN API key handling

In [11]:
today = datetime.date.today()

if is_google_colab():
    # Use existing project and API key from first cell
    fs = project.get_feature_store()
    secrets = hopsworks.get_secrets_api()
else:
    # Local development
    project = hopsworks.login(engine="python")
    fs = project.get_feature_store()
    
    if settings.AQICN_API_KEY is None:
        print("You need to set AQICN_API_KEY either in this cell or in ~/.env")
        sys.exit(1)

    AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

    secrets = hopsworks.get_secrets_api()
    try:
        secret = secrets.get_secret("AQICN_API_KEY")
        if secret is not None:
            secret.delete()
            print("Replacing existing AQICN_API_KEY")
    except hopsworks.RestAPIError as e:
        if hasattr(e, "error_code") and getattr(e, "error_code", None) == 160048:
            pass
        elif "Could not find Secret" in str(e):
            pass
        else:
            raise

    secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

2025-12-01 14:30:02,573 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-12-01 14:30:02,583 INFO: Initializing external client
2025-12-01 14:30:02,583 INFO: Base URL: https://c.app.hopsworks.ai:443
Connection closed.
2025-12-01 14:30:02,583 INFO: Initializing external client
2025-12-01 14:30:02,583 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-01 14:30:04,011 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184
Replacing existing AQICN_API_KEY
Replacing existing AQICN_API_KEY
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


## 1.4. Sensor Mode
Set SENSOR_CSV_FILE in .env with the relative path to a sensor to process it, or leave it unset to process all sensors in the `data` folder.

Determines whether to process a single sensor or all sensors based on environment settings.

In [12]:
if is_google_colab():
    # In Colab, process all sensors (no single sensor mode)
    sensor_csv_file = None
    print("Running in Colab - processing all sensors")
else:
    # Local development - check for single sensor mode
    sensor_csv_file = getattr(settings, 'SENSOR_CSV_FILE', None)
    if sensor_csv_file:
        print(f"Single sensor mode: {sensor_csv_file}")
    else:
        print("Batch mode - processing all sensors")

Batch mode - processing all sensors


## 1.5. Data Validation Setup
Creates Great Expectations validation suites for air quality and weather data with column value constraints.

In [13]:
aq_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

aq_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "pm25",
            "min_value": -0.1,
            "max_value": 500.0,
            "strict_min": True,
        },
    )
)

weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_greater_than_zero(col):
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": col,
                "min_value": -0.1,
                "max_value": 1000.0,
                "strict_min": True,
            },
        )
    )


expect_greater_than_zero("precipitation_sum")
expect_greater_than_zero("wind_speed_10m_max")

## 1.6. Helper Methods
Data processing functions - clean air quality data and fetch historical weather data with API rate limiting and retry logic.

In [14]:
def clean_and_append_data(df, street, city, country, feed_url, sensor_id):
    """
    Remove any unused columns, set the daily median value to pm25. Remove NaN's and append the metadata.
    """
    clean_df = pd.DataFrame()
    clean_df["date"] = pd.to_datetime(df["date"])
    clean_df["pm25"] = df["median"]
    clean_df = clean_df.dropna(subset=["pm25"])
    clean_df["sensor_id"] = sensor_id
    clean_df["street"] = street
    clean_df["city"] = city
    clean_df["country"] = country
    clean_df["feed_url"] = feed_url
    return clean_df


def get_historical_weather(city, df, today, feed_url, sensor_id):
    earliest_aq_date = pd.Series.min(df["date"])
    earliest_aq_date = earliest_aq_date.strftime("%Y-%m-%d")
    response = requests.get(f"{feed_url}/?token={AQICN_API_KEY}")
    data = response.json()
    latitude = data["data"]["city"]["geo"][0]
    longitude = data["data"]["city"]["geo"][1]
    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        try:
            weather_df = airquality.get_historical_weather(
                city, earliest_aq_date, str(today), latitude, longitude
            )
            weather_df["sensor_id"] = sensor_id
            weather_df["city"] = city
            weather_df["latitude"] = latitude
            weather_df["longitude"] = longitude
            return weather_df, latitude, longitude
        except Exception as e:
            if hasattr(e, "args") and any(
                "Minutely API request limit exceeded" in str(a) for a in e.args
            ):
                wait_time = 70
                print(
                    f"OpenMeteo API limit exceeded, retrying in {wait_time} seconds... (Attempt {attempt + 1} of {max_retries})"
                )
                time.sleep(wait_time)
                attempt += 1
            elif "Minutely API request limit exceeded" in str(e):
                wait_time = 70
                print(
                    f"OpenMeteo API limit exceeded, retrying in {wait_time} seconds... (Attempt {attempt + 1} of {max_retries})"
                )
                time.sleep(wait_time)
                attempt += 1
            else:
                raise
    raise RuntimeError(
        "Failed to obtain historical weather after multiple retries due to API rate limits."
    )

## 1.7. Hopsworks
Feature Group Management - functions to create and manage air quality and weather feature groups in Hopsworks, including schema descriptions.

In [15]:
def create_air_quality_feature_group():
    air_quality_fg = fs.get_or_create_feature_group(
        name="air_quality_all",
        description="Air Quality characteristics of each day for all sensors",
        version=1,
        primary_key=["sensor_id"],
        event_time="date",
        expectation_suite=aq_expectation_suite,
    )
    return air_quality_fg


def update_air_quality_description(air_quality_fg):
    air_quality_fg.update_feature_description(
        "date", "Date of measurement of air quality"
    )
    air_quality_fg.update_feature_description(
        "sensor_id", "AQICN sensor identifier (e.g., 59893)"
    )
    air_quality_fg.update_feature_description(
        "country",
        "Country where the air quality was measured (sometimes a city in aqicn.org)",
    )
    air_quality_fg.update_feature_description(
        "city", "City where the air quality was measured"
    )
    air_quality_fg.update_feature_description(
        "street", "Street in the city where the air quality was measured"
    )
    air_quality_fg.update_feature_description(
        "pm25",
        "Particles less than 2.5 micrometers in diameter (fine particles) pose health risk",
    )
    air_quality_fg.update_feature_description(
        "pm25_rolling_3d",
        "3-day rolling mean of PM2.5 from previous days (lagged by 1 day for point-in-time correctness).",
    )
    air_quality_fg.update_feature_description(
        "pm25_lag_1d",
        "PM2.5 value from 1 day ago.",
    )
    air_quality_fg.update_feature_description(
        "pm25_lag_2d",
        "PM2.5 value from 2 days ago.",
    )
    air_quality_fg.update_feature_description(
        "pm25_lag_3d",
        "PM2.5 value from 3 days ago.",
    )
    air_quality_fg.update_feature_description(
        "pm25_nearby_avg",
        "Average PM2.5 value from the 3 closest sensors.",
    )


def create_and_insert_air_quality_data(df):
    air_quality_fg = create_air_quality_feature_group()
    air_quality_fg.insert(df)
    update_air_quality_description(air_quality_fg)


def create_weather_feature_group():
    weather_fg = fs.get_or_create_feature_group(
        name="weather_all",
        description="Weather characteristics of each day for all sensors",
        version=1,
        primary_key=["sensor_id"],
        event_time="date",
        expectation_suite=weather_expectation_suite,
    )
    return weather_fg


def update_weather_description(weather_fg):
    weather_fg.update_feature_description("date", "Date of measurement of weather")
    weather_fg.update_feature_description(
        "sensor_id", "AQICN sensor identifier (e.g., 59893)"
    )
    weather_fg.update_feature_description(
        "city", "City where weather is measured/forecast for"
    )
    weather_fg.update_feature_description(
        "temperature_2m_mean", "Temperature in Celsius"
    )
    weather_fg.update_feature_description(
        "precipitation_sum", "Precipitation (rain/snow) in mm"
    )
    weather_fg.update_feature_description(
        "wind_speed_10m_max", "Wind speed at 10m above ground"
    )
    weather_fg.update_feature_description(
        "wind_direction_10m_dominant", "Dominant Wind direction over the days"
    )
    weather_fg.update_feature_description(
        "latitude", "Latitude of sensor location used for weather retrieval"
    )
    weather_fg.update_feature_description(
        "longitude", "Longitude of sensor location used for weather retrieval"
    )


def create_and_insert_weather_data(df):
    weather_fg = create_weather_feature_group()
    weather_fg.insert(df)
    update_weather_description(weather_fg)

## 1.8. Script
Main processing logic - processes sensor data (single or batch mode), cleans data, fetches weather data, adds rolling averages and lagged features, and combines all data

In [16]:
all_aq_dfs = []
all_weather_dfs = []
locations = {}

if sensor_csv_file and os.path.exists(sensor_csv_file):
    # Single sensor mode if csv_file provided
    aq_df_raw, street, city, country, feed_url, sensor_id = airquality.read_sensor_data(sensor_csv_file)
    aq_df = clean_and_append_data(aq_df_raw, street, city, country, feed_url, sensor_id)
    weather_df, latitude, longitude = get_historical_weather(
        city, aq_df, today, feed_url, sensor_id
    )
    all_aq_dfs.append(aq_df)
    all_weather_dfs.append(weather_df)
    locations[sensor_id] = {
        "country": country,
        "city": city,
        "street": street,
        "aqicn_url": feed_url,
        "latitude": latitude,
        "longitude": longitude,
    }
else:
    # Process all files in data directory
    data_dir = os.path.join(root_dir, "data_sweden")
    dir_list = os.listdir(data_dir)
    for file in dir_list:
        file_path = os.path.join(data_dir, file)
        # Skip directories and only process CSV files
        if os.path.isdir(file_path) or not file.endswith('.csv'):
            continue

        try:
            aq_df_raw, street, city, country, feed_url, sensor_id = airquality.read_sensor_data(file_path)
            aq_df = clean_and_append_data(aq_df_raw, street, city, country, feed_url, sensor_id)
            weather_df, latitude, longitude = get_historical_weather(
                city, aq_df, today, feed_url, sensor_id
            )
        except Exception as e:
            print(f"⚠️  Skipping file {file}: {type(e).__name__}: {e}")
            continue

        all_aq_dfs.append(aq_df)
        all_weather_dfs.append(weather_df)
        locations[sensor_id] = {
            "country": country,
            "city": city,
            "street": street,
            "aqicn_url": feed_url,
            "latitude": latitude,
            "longitude": longitude,
        }

# Concatenate into single, uniform dfs
aq_df_all = pd.concat(all_aq_dfs, ignore_index=True)
weather_df_all = pd.concat(all_weather_dfs, ignore_index=True)
aq_df_all = airquality.add_rolling_window_feature(aq_df_all, window_days=3, column="pm25", new_column="pm25_rolling_3d")
aq_df_all = airquality.add_lagged_features(aq_df_all, column="pm25", lags=[1, 2, 3])
aq_df_all = airquality.add_nearby_sensor_feature(aq_df_all, locations, column="pm25_lag_1d", n_closest=3)

⚠️  Skipping file 362923.csv: TypeError: string indices must be integers, not 'str'
⚠️  Skipping file 59899.csv: ValueError: not enough values to unpack (expected 3, got 2)
⚠️  Skipping file 59899.csv: ValueError: not enough values to unpack (expected 3, got 2)
OpenMeteo API limit exceeded, retrying in 70 seconds... (Attempt 1 of 5)
OpenMeteo API limit exceeded, retrying in 70 seconds... (Attempt 1 of 5)
⚠️  Skipping file 63646.csv: ValueError: not enough values to unpack (expected 3, got 2)
⚠️  Skipping file 63646.csv: ValueError: not enough values to unpack (expected 3, got 2)
OpenMeteo API limit exceeded, retrying in 70 seconds... (Attempt 1 of 5)
OpenMeteo API limit exceeded, retrying in 70 seconds... (Attempt 1 of 5)


## 1.9. Exploration

Air quality data info - display information about the processed air quality DataFrame

In [17]:
# aq_df_all.info()

Weather data info - displays information about the processed weather DataFrame

In [18]:
# weather_df_all.info()

## 1.10. Store Sensor Location
Create Hopsworks secrets for each sensor's location metadata (coordinates, address, etc.)

In [19]:
for sensor_id, location in locations.items():
    secret_name = f"SENSOR_LOCATION_JSON_{sensor_id}"
    location_str = json.dumps(location)
    
    try:
        secret = secrets.get_secret(secret_name)
        if secret is not None:
            secret.delete()
            print(f"Replacing existing {secret_name}")
    except hopsworks.RestAPIError as e:
        if hasattr(e, "error_code") and getattr(e, "error_code", None) == 160048:
            pass
        elif "Could not find Secret" in str(e):
            pass
        else:
            raise
    
    secrets.create_secret(secret_name, location_str)
    print(f"Created secret: {secret_name}\n")

Replacing existing SENSOR_LOCATION_JSON_112672
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_112672

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_112672

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_122302

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_122302

Replacing existing SENSOR_LOCATION_JSON_129124
Replacing existing SENSOR_LOCATION_JSON_129124
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_129124

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Created secret: SENSOR_LOCATION_JSON_129124

Replacing existing SENSOR_LOCATION_JSON_154549
Re

## 1.11. Upload to Hopsworks
Insert the processed air quality and weather data into Hopsworks feature groups

In [20]:
create_and_insert_air_quality_data(aq_df_all)
create_and_insert_weather_data(weather_df_all)

2025-12-01 14:35:14,098 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972


Uploading Dataframe: 100.00% |██████████| Rows 91453/91453 | Elapsed Time: 00:04 | Remaining Time: 00:00



Launching job: air_quality_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-12-01 14:35:39,874 INFO: 	2 expectation(s) included in expectation_suite.
2025-12-01 14:35:39,874 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130


Uploading Dataframe: 100.00% |██████████| Rows 102096/102096 | Elapsed Time: 00:03 | Remaining Time: 00:00



Launching job: weather_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions
