# 2. Feature Pipeline

## 2.1. Environment Setup
Detect whether the notebook is running in Google Colab or a local environment, handle repository cloning, dependency installation, and numpy compatibility fixes, and set up the Python path.

In [18]:
import sys
from pathlib import Path
import warnings
import os

# Hide warnings whose originating module name matches "IPython" so they do not clutter cell output.
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    Colab is detected by looking for "google.colab" in the string form of the
    active IPython shell object.

    Returns:
        bool: True when the shell's string form mentions "google.colab";
        False for any other shell, or when no IPython shell exists at all
        (for example when the code runs as a plain Python script).
    """
    try:
        shell = get_ipython()
    except NameError:
        # get_ipython is only injected by IPython; elsewhere the name is
        # undefined. Catching NameError alone (rather than a bare except)
        # keeps KeyboardInterrupt/SystemExit and genuine errors visible.
        return False
    return "google.colab" in str(shell)

def clone_repository() -> None:
    # Check if repository already exists
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    """Install the project's dependencies through shell commands.

    First installs/upgrades ``uv`` with pip, then runs ``uv pip install`` with
    ``--all-extras`` and ``--system`` against ``pyproject.toml``.
    """
    # NOTE(review): assumes the working directory contains pyproject.toml
    # (the Colab branch calls clone_repository() right before this) — confirm.
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

def fix_numpy_compatibility():
    """Reinstall pinned numpy/pandas versions and report whether it worked.

    Runs ``pip install --force-reinstall numpy==1.24.4 pandas==2.0.3`` with the
    interpreter that is executing the notebook, echoes pip's output, and then
    prints either a success message or a failure message.

    The install is launched through ``subprocess.run(..., check=True)`` rather
    than a ``!pip`` shell escape: a shell escape does not raise when the
    command fails, which would make the failure branch unreachable and print
    the success message unconditionally.

    Returns:
        None. The outcome is communicated through printed messages only; the
        function never raises for a failed install.
    """
    import subprocess

    print("Fixing numpy compatibility for hopsworks/pandas...")
    # sys.executable targets the environment of the running kernel.
    pip_command = [
        sys.executable, "-m", "pip", "install",
        "--force-reinstall", "numpy==1.24.4", "pandas==2.0.3",
    ]
    try:
        # Use precompiled wheels with compatible versions
        completed = subprocess.run(
            pip_command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        # Child-process output is captured and printed so it shows in the cell.
        if completed.stdout:
            print(completed.stdout)
        print("Numpy and pandas fixed. Please restart runtime and run again.")
    except (subprocess.CalledProcessError, OSError) as e:
        pip_output = getattr(e, "output", None)
        if pip_output:
            print(pip_output)
        print(f"Fix attempt failed: {e}")
        print("Please manually restart runtime and try again.")

# Work out the project root for the current environment and expose it on sys.path.
if not is_google_colab():
    # Local run: step out of the known sub-directories (checked in this fixed
    # order, each at most once) until the repository root is reached.
    root_dir = Path().absolute()
    for nested_dir in ("src", "airquality", "notebooks"):
        if root_dir.name == nested_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")
else:
    try:
        # Smoke-test numpy/pandas first so binary-incompatibility errors
        # surface here, before cloning and installing anything.
        import numpy
        numpy.array([1, 2, 3])
        import pandas as pd
        print("Basic packages working correctly")

        clone_repository()
        install_dependencies()

        # Imported only after dependencies have been installed.
        import hopsworks
        print("All packages working correctly")

        root_dir = str(Path().absolute())
        print("Google Colab environment")

    except (ValueError, ImportError) as e:
        # Any message mentioning numpy (case-insensitively) is treated as a
        # numpy compatibility problem; everything else is re-raised unchanged.
        if "numpy" not in str(e).lower():
            raise
        fix_numpy_compatibility()
        raise SystemExit("Please restart runtime (Runtime > Restart runtime) and run the notebook again.")

print(f"Root dir: {root_dir}")

# Make project-local packages importable from the project root.
root_already_importable = root_dir in sys.path
if not root_already_importable:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

from utils import config

# Credentials come from Colab's userdata store on Colab, and from the
# project's .env file everywhere else.
if not is_google_colab():
    # Local development - use .env file
    settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

else:
    from google.colab import userdata
    import hopsworks

    # Log in right away on Colab; the AQICN key is read from the same store.
    project = hopsworks.login(api_key_value=userdata.get('HOPSWORKS_API_KEY'), engine="python")
    AQICN_API_KEY = userdata.get('AQICN_API_KEY')

Local environment
Root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


## 2.2. Imports

In [19]:
import datetime
import pandas as pd
import hopsworks
import json
import warnings
import requests
from utils import airquality

from dotenv import load_dotenv

# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")
# NOTE(review): load_dotenv comes from python-dotenv; presumably it loads
# variables from a .env file into the process environment — confirm which
# .env file is picked up from the notebook's working directory.
load_dotenv()

True

## 2.3. Setup
Hopsworks and feature store setup - configure Hopsworks connection, retrieve API keys, and connect to existing air quality and weather feature groups.

In [20]:
# Colab already obtained `project` during environment setup, so only local
# runs need to log in here.
if not is_google_colab():
    HOPSWORKS_API_KEY = getattr(settings, 'HOPSWORKS_API_KEY', None)

    # Unwrap the key when the settings object exposes it through
    # get_secret_value(); None has no such attribute and stays None.
    if hasattr(HOPSWORKS_API_KEY, 'get_secret_value'):
        HOPSWORKS_API_KEY = HOPSWORKS_API_KEY.get_secret_value()

    project = hopsworks.login(engine="python", api_key_value=HOPSWORKS_API_KEY)

# Shared by both environments.
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# Colab already read the AQICN key from userdata; local runs fetch it from
# the Hopsworks secrets API.
if not is_google_colab():
    AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value


# Reference date used by the later cells (fetching and windowing).
today = datetime.date.today()

# Retrieve feature groups
air_quality_fg = fs.get_feature_group(name="air_quality_all", version=1)
weather_fg = fs.get_feature_group(name="weather_all", version=1)

2025-12-04 15:09:30,577 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-12-04 15:09:30,582 INFO: Initializing external client
Connection closed.
2025-12-04 15:09:30,582 INFO: Initializing external client
2025-12-04 15:09:30,583 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-12-04 15:09:30,583 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-04 15:09:32,003 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


## 2.4. Sensor Mode
Set `SENSOR_CSV_FILE` in `.env` to the relative path of a single sensor's CSV file to process only that sensor, or leave it unset to process all sensors in the `data` folder.

Retrieve the sensor location data from Hopsworks secrets and parse the JSON location metadata.

In [21]:
# On Colab the setting is never read, so Colab always runs in batch mode.
sensor_csv_file = None if is_google_colab() else getattr(settings, 'SENSOR_CSV_FILE', None)

if not sensor_csv_file:
    # Read all individual secrets in batch mode
    all_secrets = secrets.get_secrets()
    locations = {}
    for secret in all_secrets:
        if not secret.name.startswith("SENSOR_LOCATION_JSON_"):
            continue
        sensor_id = secret.name.replace("SENSOR_LOCATION_JSON_", "")
        location_str = secrets.get_secret(secret.name).value
        # Secrets with an empty value are skipped.
        if location_str:
            locations[sensor_id] = json.loads(location_str)
else:
    # Read one secret for single sensor mode; only the sixth value returned
    # by read_sensor_data (the sensor id) is used here.
    _, _, _, _, _, sensor_id = airquality.read_sensor_data(sensor_csv_file)
    secret_name = f"SENSOR_LOCATION_JSON_{sensor_id}"
    location_str = secrets.get_secret(secret_name).value
    locations = {sensor_id: json.loads(location_str)}


In [22]:
# Fix: Convert @ URLs to A URLs for Swedish sensors (AQICN API change)
# Every "/@" in a sensor's aqicn_url is rewritten to "/A", in place.
fixed_count = 0
for sensor_id, location in locations.items():
    old_url = location["aqicn_url"]
    # Convert from @{id} to A{id} format
    new_url = old_url.replace("/@", "/A")
    # Count a URL only when the rewrite actually changed it. Testing for a
    # bare "@" would also count URLs that contain no "/@", which the
    # replacement leaves untouched.
    if new_url != old_url:
        location["aqicn_url"] = new_url
        fixed_count += 1

if fixed_count > 0:
    print(f"üîß Fixed {fixed_count} sensor URLs from @ to A format")
else:
    print("‚ÑπÔ∏è All sensor URLs already in correct format")

üîß Fixed 105 sensor URLs from @ to A format


## 2.5. Helper Methods
Data processing functions: get daily weather forecasts, and fetch the current air quality and weather data for each sensor location.

In [23]:
def get_daily_weather_forecast(city, latitude, longitude):
    """Reduce the hourly weather forecast for a location to its midday rows.

    Args:
        city: City name; forwarded to the hourly forecast helper and written
            into the ``city`` column of the result.
        latitude: Latitude forwarded to the hourly forecast helper.
        longitude: Longitude forwarded to the hourly forecast helper.

    Returns:
        pandas.DataFrame: The rows of the hourly forecast whose time of day
        lies between 11:59 and 12:01, with ``date`` truncated to the calendar
        day (stored as a datetime column) and a ``city`` column set to `city`.
    """
    # NOTE(review): assumes the hourly frame has a datetime-like "date"
    # column, which between_time needs as the index — confirm against
    # airquality.get_hourly_weather_forecast.
    noon_rows = (
        airquality.get_hourly_weather_forecast(city, latitude, longitude)
        .set_index("date")
        .between_time("11:59", "12:01")
        .reset_index()
    )
    # Drop the time-of-day component, then convert back to datetime.
    calendar_days = pd.to_datetime(noon_rows["date"]).dt.date
    noon_rows["date"] = pd.to_datetime(calendar_days)
    noon_rows["city"] = city
    return noon_rows


def fetch_data_for_location(sensor, location):
    """Fetch today's air quality reading and the daily forecast for one sensor.

    Args:
        sensor: Sensor identifier. Not used inside this function.
        location: Mapping that must provide the keys ``country``, ``city``,
            ``street``, ``aqicn_url``, ``latitude`` and ``longitude``.

    Returns:
        tuple: ``(aq_today_df, daily_df)`` — the result of
        ``airquality.get_pm25`` for the notebook-level ``today`` and
        ``AQICN_API_KEY``, and the result of ``get_daily_weather_forecast``
        for the location's city and coordinates.

    Raises:
        KeyError: If `location` lacks one of the required keys; keys are read
            in the order listed above, before any request is made.
    """
    required_keys = ("country", "city", "street", "aqicn_url", "latitude", "longitude")
    country, city, street, aqicn_url, latitude, longitude = (
        location[key] for key in required_keys
    )

    aq_today_df = airquality.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY)
    return aq_today_df, get_daily_weather_forecast(city, latitude, longitude)

## 2.6. Data Collection
Loop through all sensors to fetch today's air quality data and weather forecasts, and format the data to match the feature group schemas.

In [24]:
# Per-sensor frames are gathered here and concatenated in later cells.
aqs, weathers = [], []
print(f"üîç Processing {len(locations)} sensor locations...")

for sensor, location in locations.items():
    # A failure for one sensor is reported and the loop moves on to the next.
    try:
        print(f"üîç Processing sensor {sensor}...")
        aq_today_df, weather_daily_forecast_df = fetch_data_for_location(sensor, location)

        print(f"üîç Air quality data for sensor {sensor}: {len(aq_today_df)} rows")
        # Debug first couple sensors
        if len(aq_today_df) > 0 and sensor in ('121810', '59095'):
            print(f"üîç DEBUG - PM2.5 value for sensor {sensor}: {aq_today_df['pm25'].iloc[0]}")
        print(f"üîç Weather data for sensor {sensor}: {len(weather_daily_forecast_df)} rows")

        # Air quality FG shape
        aq_columns = {
            "sensor_id": str(sensor),
            "street": location["street"],
            "city": location["city"],
            "country": location["country"],
            "feed_url": location["aqicn_url"],
        }
        aq_today_df = aq_today_df.assign(**aq_columns)
        aq_today_df["date"] = pd.to_datetime(aq_today_df["date"])

        # Weather FG shape
        weather_columns = {
            "sensor_id": str(sensor),
            "city": location["city"],
            "latitude": location["latitude"],
            "longitude": location["longitude"],
        }
        weather_daily_forecast_df = weather_daily_forecast_df.assign(**weather_columns)
        weather_daily_forecast_df["date"] = pd.to_datetime(weather_daily_forecast_df["date"])

        # Both frames are appended only after both were shaped successfully.
        aqs.append(aq_today_df)
        weathers.append(weather_daily_forecast_df)
        print(f"‚úÖ Successfully processed sensor {sensor}")

    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è  Skipping sensor {sensor}: {e}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Unexpected error with sensor {sensor}: {type(e).__name__}: {e}")

üîç Processing 105 sensor locations...
üîç Processing sensor 121810...
üîç Air quality data for sensor 121810: 1 rows
üîç DEBUG - PM2.5 value for sensor 121810: 6.0
üîç Weather data for sensor 121810: 7 rows
‚úÖ Successfully processed sensor 121810
üîç Processing sensor 192520...
üîç Air quality data for sensor 121810: 1 rows
üîç DEBUG - PM2.5 value for sensor 121810: 6.0
üîç Weather data for sensor 121810: 7 rows
‚úÖ Successfully processed sensor 121810
üîç Processing sensor 192520...
üîç Air quality data for sensor 192520: 1 rows
üîç Weather data for sensor 192520: 7 rows
‚úÖ Successfully processed sensor 192520
üîç Processing sensor 196735...
üîç Air quality data for sensor 192520: 1 rows
üîç Weather data for sensor 192520: 7 rows
‚úÖ Successfully processed sensor 192520
üîç Processing sensor 196735...
üîç Air quality data for sensor 196735: 1 rows
üîç Weather data for sensor 196735: 7 rows
‚úÖ Successfully processed sensor 196735
üîç Processing sensor 208483...


In [25]:
# Stack the per-sensor frames; use an empty frame when nothing was collected.
if aqs:
    aq_df = pd.concat(aqs)
else:
    aq_df = pd.DataFrame()

if not aq_df.empty:
    # pm25: unparseable values become NaN, dtype float64.
    # date: timezone information is removed.
    # url: dropped when present.
    aq_df = aq_df.assign(
        pm25=lambda frame: pd.to_numeric(frame["pm25"], errors="coerce").astype("float64"),
        date=lambda frame: pd.to_datetime(frame["date"]).dt.tz_localize(None),
    ).drop(columns=["url"], errors="ignore")
    


# Get historical data for rolling window and lagged features
historical_start = today - datetime.timedelta(days=4)
historical_df = pd.DataFrame()

# Read historical data from feature group and filter for the last 4 days
# (from historical_start up to, but not including, today).
try:
    historical_df = air_quality_fg.read()
    if not historical_df.empty:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        historical_df = historical_df[
            (historical_df["date"].dt.date >= historical_start) & (historical_df["date"].dt.date < today)
        ][["date", "sensor_id", "pm25"]]
except Exception as e:
    # Best effort: carry on without history, but say so. Resetting the frame
    # matters because a failure after a successful read() would otherwise
    # leave the complete, unfiltered history in historical_df.
    historical_df = pd.DataFrame()
    print(
        "Could not load historical air quality data; rolling and lagged features "
        f"will be computed without history: {type(e).__name__}: {e}"
    )

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (12.37s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (12.37s) 


In [26]:
# History, when available, is placed ahead of today's readings so the helper
# functions below receive both.
if historical_df.empty:
    combined_df = aq_df
else:
    combined_df = pd.concat([historical_df, aq_df], ignore_index=True)

if combined_df.empty:
    aq_df = pd.DataFrame()
    print("‚ö†Ô∏è  No data available for processing")
else:
    combined_df = airquality.add_rolling_window_feature(combined_df, window_days=3, column="pm25", new_column="pm25_rolling_3d")
    combined_df = airquality.add_lagged_features(combined_df, column="pm25", lags=[1, 2, 3])
    combined_df = airquality.add_nearby_sensor_feature(combined_df, locations, column="pm25_lag_1d", n_closest=3)

    # Only keep today's data
    is_today = combined_df["date"].dt.date == today
    aq_df = combined_df[is_today].copy()
aq_df

Unnamed: 0,date,sensor_id,pm25,country,city,street,feed_url,pm25_rolling_3d,pm25_lag_1d,pm25_lag_2d,pm25_lag_3d,pm25_nearby_avg
235,2025-12-04,105325,42.0,Sweden,√ñrnsk√∂ldsvik,H√∂rnettv√§gen,https://api.waqi.info/feed/A105325/,8.666667,6.00,18.00,2.00,1.300000
286,2025-12-04,107110,58.0,Sweden,Uppsala,Kuggebro,https://api.waqi.info/feed/A107110/,3.890000,7.75,2.20,1.72,5.576667
268,2025-12-04,112672,29.0,Sweden,Gothenburg,B√•gskyttegatan,https://api.waqi.info/feed/A112672/,2.000000,2.00,,,15.000000
269,2025-12-04,112993,17.0,Sweden,S√∂derby,Eker√∂v√§gen,https://api.waqi.info/feed/A112993/,2.250000,4.25,1.70,0.80,2.950000
287,2025-12-04,113539,19.0,Sweden,St√§ket,Aron Lindgrens v√§g,https://api.waqi.info/feed/A113539/,2.283333,4.15,1.70,1.00,3.300000
...,...,...,...,...,...,...,...,...,...,...,...,...
248,2025-12-04,88372,17.0,Sweden,Gothenburg,Ridl√§rargatan,https://api.waqi.info/feed/A88372/,21.000000,21.00,,,8.666667
233,2025-12-04,88876,31.0,Sweden,L√∂tk√§rr,Myggv√§gen,https://api.waqi.info/feed/A88876/,9.000000,9.00,,,5.333333
249,2025-12-04,89584,17.0,Sweden,T√§by kommun,Vallatorpsv√§gen,https://api.waqi.info/feed/A89584/,3.546667,5.75,2.97,1.92,8.000000
234,2025-12-04,90676,31.0,Sweden,Upph√§rad,Upph√§rad,https://api.waqi.info/feed/A90676/,6.266667,10.90,4.85,3.05,0.633333


In [27]:
# Stack the per-sensor forecasts; use an empty frame when nothing was collected.
if weathers:
    weather_df = pd.concat(weathers)
else:
    weather_df = pd.DataFrame()

if not weather_df.empty:
    weather_df["date"] = pd.to_datetime(weather_df["date"])

    # Convert to float32 to match Hopsworks feature group schema
    for weather_column in (
        "temperature_2m_mean",
        "precipitation_sum",
        "wind_speed_10m_max",
        "wind_direction_10m_dominant",
    ):
        weather_df[weather_column] = weather_df[weather_column].astype("float32")
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city,sensor_id,latitude,longitude
0,2025-12-04,2.7760,0.2,5.040000,89.000000,Acksj√∂n,121810,59.648101,13.752426
1,2025-12-05,2.7760,0.0,5.400000,93.000000,Acksj√∂n,121810,59.648101,13.752426
2,2025-12-06,3.2760,0.3,11.159999,61.000000,Acksj√∂n,121810,59.648101,13.752426
3,2025-12-07,2.6185,0.0,11.989945,97.765083,Acksj√∂n,121810,59.648101,13.752426
4,2025-12-08,4.7185,0.0,17.935081,100.407661,Acksj√∂n,121810,59.648101,13.752426
...,...,...,...,...,...,...,...,...,...
2,2025-12-06,4.7460,0.0,7.200000,165.000000,Hen√•n,92683,58.272055,11.688522
3,2025-12-07,4.2715,0.4,11.345713,91.818268,Hen√•n,92683,58.272055,11.688522
4,2025-12-08,6.2215,1.3,19.902824,114.015106,Hen√•n,92683,58.272055,11.688522
5,2025-12-09,4.6215,0.0,6.048107,143.471054,Hen√•n,92683,58.272055,11.688522


In [28]:
# Final validation before inserting to feature store: nothing is written
# unless both frames contain rows.
has_air_quality = not aq_df.empty
has_weather = not weather_df.empty

if has_air_quality and has_weather:
    print(f"‚úÖ Inserting {len(aq_df)} air quality rows and {len(weather_df)} weather rows to feature store")
    air_quality_fg.insert(aq_df)
    weather_fg.insert(weather_df)
    print("üìÅ Data successfully inserted to feature store")
else:
    # Report each missing input separately.
    if not has_air_quality:
        print("‚ö†Ô∏è  No clean air quality data to insert")
    if not has_weather:
        print("‚ö†Ô∏è  No clean weather data to insert")

‚úÖ Inserting 104 air quality rows and 728 weather rows to feature store
2025-12-04 15:12:29,299 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 104/104 | Elapsed Time: 00:01 | Remaining Time: 00:00



Launching job: air_quality_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-12-04 15:12:43,411 INFO: 	2 expectation(s) included in expectation_suite.
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-12-04 15:12:43,411 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 728/728 | Elapsed Time: 00:01 | Remaining Time: 00:00



Launching job: weather_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions
üìÅ Data successfully inserted to feature store
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions
üìÅ Data successfully inserted to feature store
