# 2. Feature Pipeline

## 2.1. Environment Setup
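Detect whether the notebook is running in Google Colab or in a local environment. In Colab, clone the repository, install the dependencies, and apply the numpy compatibility fix if needed. In both cases, add the project root to the Python path and load the API credentials.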

In [1]:
# import shutil
# import os
# if os.path.exists('/content/pm25-forecast-openmeteo-aqicn'):
#     shutil.rmtree('/content/pm25-forecast-openmeteo-aqicn')

In [2]:
import sys
from pathlib import Path
import warnings

warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    The check looks for the text ``google.colab`` in the string form of the
    active IPython shell object.

    Returns:
        True when the shell's string form mentions ``google.colab``.
        False otherwise, including when ``get_ipython`` is not defined
        (for example under a plain Python interpreter).
    """
    try:
        return "google.colab" in str(get_ipython())
    except Exception:
        # Stay best-effort, but catch Exception rather than everything so that
        # KeyboardInterrupt and SystemExit are no longer swallowed here.
        return False

def clone_repository() -> None:
    # Check if repository already exists
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    """Install the project's dependencies using ``uv``.

    Upgrades ``uv`` through pip, then asks ``uv pip install`` to install the
    requirements read from ``pyproject.toml`` with all extras. The path is
    relative, so the current working directory must contain ``pyproject.toml``.
    """
    # Make sure a current uv is available before delegating the install to it.
    !pip install --upgrade uv
    # --system: install into the interpreter's own environment instead of a virtualenv.
    !uv pip install --all-extras --system --requirement pyproject.toml

def fix_numpy_compatibility():
    """Force-reinstall pinned numpy and pandas versions.

    Runs ``pip install --force-reinstall numpy==1.26.4 pandas==2.0.3`` with the
    interpreter that is running this kernel and reports whether it worked.

    The install runs through ``subprocess`` rather than a ``!pip`` shell escape
    because a shell escape never raises: a failed install would still print the
    success message and the failure branch could never run.
    """
    import subprocess  # local import: only needed on this recovery path

    print("Fixing numpy compatibility for hopsworks/pandas...")
    # Use compatible versions that work with the installed packages
    command = [
        sys.executable, "-m", "pip", "install",
        "--force-reinstall", "numpy==1.26.4", "pandas==2.0.3",
    ]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # CalledProcessError carries pip's stderr; OSError (pip could not be started) does not.
        pip_errors = getattr(e, "stderr", None)
        if pip_errors:
            print(pip_errors)
        print(f"Fix attempt failed: {e}")
        print("Please manually restart runtime and try again.")
        return

    # Output was captured, so echo it to keep pip's log visible in the cell.
    if completed.stdout:
        print(completed.stdout)
    print("Numpy and pandas fixed. Please restart runtime and run again.")

# Work out the project root. Locally it is derived from the working directory;
# in Google Colab the repository is cloned and its dependencies installed first.
if not is_google_colab():
    root_dir = Path().absolute()
    # Step out of a trailing src/, then airquality/, then notebooks/ directory,
    # checking each name once and in this order.
    for nested_dir in ("src", "airquality", "notebooks"):
        if root_dir.name == nested_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")
else:
    try:
        # Smoke-test numpy and pandas before doing anything expensive.
        import numpy
        numpy.array([1, 2, 3])
        import pandas as pd
        print("Basic packages working correctly")

        clone_repository()
        install_dependencies()

        import hopsworks
        print("All packages working correctly")

        root_dir = str(Path().absolute())
        print("Google Colab environment")

    except (ValueError, ImportError) as e:
        # Only numpy-related failures trigger the reinstall; anything else propagates.
        if "numpy" not in str(e).lower():
            raise
        fix_numpy_compatibility()
        raise SystemExit("Please restart runtime (Runtime > Restart runtime) and run the notebook again.")

print(f"Root dir: {root_dir}")

# Make project packages importable, without adding the path twice.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Credentials: local runs build a settings object from the project's .env file,
# while Colab reads both API keys from the notebook's user secrets and logs in.
if not is_google_colab():
    # Local development - use .env file
    from utils import config

    settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

else:
    from google.colab import userdata
    import hopsworks

    project = hopsworks.login(
        engine="python",
        api_key_value=userdata.get('HOPSWORKS_API_KEY'),
    )
    AQICN_API_KEY = userdata.get('AQICN_API_KEY')

Local environment
Root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
Added the following directory to the PYTHONPATH: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


## 2.2. Imports

In [3]:
import datetime
import pandas as pd
import hopsworks
import json
import warnings
import requests
from utils import airquality
from dotenv import load_dotenv

# Suppress every Python warning from here on (no category or module filter is given).
warnings.filterwarnings("ignore")
# Load variables from a .env file; as the cell's last expression, the return
# value of load_dotenv() is what the cell displays.
load_dotenv()

True

## 2.3. Setup
Hopsworks and feature store setup - configure Hopsworks connection, retrieve API keys, and connect to existing air quality and weather feature groups.

In [4]:
# Local runs log in here; the Colab branch uses the `project` it already has.
if not is_google_colab():
    HOPSWORKS_API_KEY = getattr(settings, 'HOPSWORKS_API_KEY', None)

    # Unwrap secret-style objects; None has no get_secret_value, so it passes through unchanged.
    if hasattr(HOPSWORKS_API_KEY, 'get_secret_value'):
        HOPSWORKS_API_KEY = HOPSWORKS_API_KEY.get_secret_value()

    project = hopsworks.login(engine="python", api_key_value=HOPSWORKS_API_KEY)

# Both environments need the feature store handle and the secrets API.
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

if not is_google_colab():
    # Local runs read the AQICN key from Hopsworks secrets.
    AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

today = datetime.date.today()

# Retrieve feature groups
air_quality_fg = fs.get_feature_group(name="air_quality_all", version=1)
weather_fg = fs.get_feature_group(name="weather_all", version=1)

2025-12-12 10:01:07,652 INFO: Initializing external client
2025-12-12 10:01:07,655 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-12 10:01:09,798 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


## 2.4. Sensor Location Loading
Retrieve sensor location data from Hopsworks secrets for all sensors and parse JSON location metadata.

In [5]:
# Read all individual secrets for all sensors.
# Secrets named SENSOR_LOCATION_JSON_<sensor id> hold one JSON document each;
# `locations` maps the sensor id to the parsed document.
SENSOR_SECRET_PREFIX = "SENSOR_LOCATION_JSON_"

all_secrets = secrets.get_secrets()
locations = {}
for secret in all_secrets:
    if not secret.name.startswith(SENSOR_SECRET_PREFIX):
        continue
    # Remove only the leading prefix; str.replace would also delete any later
    # occurrence of the same text inside the sensor id.
    sensor_id = secret.name[len(SENSOR_SECRET_PREFIX):]
    location_str = secrets.get_secret(secret.name).value
    # Secrets with an empty value are skipped.
    if location_str:
        locations[sensor_id] = json.loads(location_str)

In [6]:
# Convert @ URLs to A URLs for Swedish sensors (AQICN API change)
fixed_count = 0
for sensor_id, location in locations.items():
    old_url = location["aqicn_url"]
    new_url = old_url.replace("/@", "/A")
    # Count a fix only when the URL really changed. Testing for a bare "@"
    # counted URLs that contain "@" but no "/@", although nothing was rewritten.
    if new_url != old_url:
        location["aqicn_url"] = new_url
        fixed_count += 1

if fixed_count > 0:
    print(f"üîß Fixed {fixed_count} sensor URLs from @ to A format")
else:
    print("‚ÑπÔ∏è All sensor URLs already in correct format")

üîß Fixed 105 sensor URLs from @ to A format


## 2.5. Helper Methods
Data-processing helpers: build a daily weather forecast from the hourly forecast, and fetch today's air quality reading together with that forecast for a single sensor location.

In [7]:
def get_daily_weather_forecast(city, latitude, longitude):
    """Reduce the hourly weather forecast for a location to its midday rows.

    Args:
        city: City name; passed to the hourly forecast helper and written to
            the ``city`` column of the result.
        latitude: Latitude passed to the hourly forecast helper.
        longitude: Longitude passed to the hourly forecast helper.

    Returns:
        A DataFrame holding the hourly rows whose time of day lies between
        11:59 and 12:01, with ``date`` as a datetime column (the time of day is
        kept) and ``city`` set to the given city.
    """
    midday_df = (
        airquality.get_hourly_weather_forecast(city, latitude, longitude)
        .set_index("date")
        .between_time("11:59", "12:01")  # time-of-day filter needs the datetime index
        .reset_index()
    )
    midday_df["date"] = pd.to_datetime(midday_df["date"])
    midday_df["city"] = city
    return midday_df


def fetch_data_for_location(location):
    """Fetch the air quality reading and the daily weather forecast for one sensor.

    Args:
        location: Mapping with the keys ``country``, ``city``, ``street``,
            ``aqicn_url``, ``latitude`` and ``longitude``.

    Returns:
        A tuple ``(aq_today_df, daily_df)``: the result of
        ``airquality.get_pm25`` called with the notebook-level ``today`` and
        ``AQICN_API_KEY``, followed by the result of
        ``get_daily_weather_forecast`` for the same location.

    Raises:
        KeyError: If ``location`` lacks one of the required keys.
    """
    # Read every field before calling either helper, so a missing key fails first.
    country, city, street, aqicn_url, latitude, longitude = (
        location[field]
        for field in ("country", "city", "street", "aqicn_url", "latitude", "longitude")
    )
    return (
        airquality.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY),
        get_daily_weather_forecast(city, latitude, longitude),
    )

## 2.6. Data Collection
Loop through all sensors to fetch today's air quality data and weather forecasts, format data to match feature group schemas.

In [8]:
# One air quality frame and one weather frame are collected per sensor.
# A sensor whose fetch or formatting fails is reported and left out of both lists.
aqs = []
weathers = []
print(f"üîç Processing {len(locations)} sensor locations...")

for sensor, location in locations.items():
    try:
        aq_today_df, weather_daily_forecast_df = fetch_data_for_location(location)

        # Air quality FG shape
        aq_today_df = aq_today_df.assign(
            sensor_id=str(sensor),
            street=location["street"],
            city=location["city"],
            country=location["country"],
            feed_url=location["aqicn_url"],
            date=lambda frame: pd.to_datetime(frame["date"]),
        )

        # Weather FG shape
        weather_daily_forecast_df = weather_daily_forecast_df.assign(
            sensor_id=str(sensor),
            city=location["city"],
            latitude=location["latitude"],
            longitude=location["longitude"],
            date=lambda frame: pd.to_datetime(frame["date"]),
        )
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è  Skipping sensor {sensor}: {e}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Unexpected error with sensor {sensor}: {type(e).__name__}: {e}")
    else:
        # Append only after both frames were built, so the two lists stay in step.
        aqs.append(aq_today_df)
        weathers.append(weather_daily_forecast_df)

üîç Processing 105 sensor locations...
Error: There may be an incorrect URL for your Sensor or it is not contactable right now. The API response does not contain data.  Error message: no such station
‚ö†Ô∏è  Skipping sensor 362923: no such station
Error: There may be an incorrect URL for your Sensor or it is not contactable right now. The API response does not contain data.  Error message: no such station
‚ö†Ô∏è  Skipping sensor 472264: no such station
‚ö†Ô∏è  Skipping sensor 474841: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


In [9]:
# print(len(weathers))
# print(weathers)

In [10]:
# Combine the per-sensor readings collected today into one frame.
aq_df = pd.concat(aqs) if aqs else pd.DataFrame()
if not aq_df.empty:
    aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce").astype("float64")
    aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.tz_localize(None)
    aq_df = aq_df.drop(columns=["url"], errors="ignore")

    # Data quality check 1: Remove rows with missing PM2.5 values
    initial_count = len(aq_df)
    aq_df = aq_df.dropna(subset=['pm25'])
    if len(aq_df) < initial_count:
        print(f"üßπ Removed {initial_count - len(aq_df)} rows with missing PM2.5 values")

# Get historical data for rolling window and lagged features
historical_start = today - datetime.timedelta(days=4)
historical_df = pd.DataFrame()

# Read historical data from feature group and filter for the last 4 days
try:
    history = air_quality_fg.read()
    if not history.empty:
        history["date"] = pd.to_datetime(history["date"]).dt.tz_localize(None)
        history_days = history["date"].dt.date
        history = history.loc[
            (history_days >= historical_start) & (history_days < today),
            ["date", "sensor_id", "pm25"],
        ]
    # Publish the frame only once it is fully filtered, so a failure halfway
    # through cannot leave the unfiltered history in historical_df.
    historical_df = history
except Exception as e:
    # Stay best-effort (today's rows are still processed), but say so: without
    # history there is nothing to derive the lagged and rolling features from.
    print(
        f"WARNING: could not load historical air quality data "
        f"({type(e).__name__}: {e}); continuing without history, so lagged and "
        f"rolling features cannot be computed from previous days."
    )

2025-12-12 10:09:17,974 ERROR: Flight returned timeout error, with message: Deadline Exceeded
Traceback (most recent call last):
  File "c:\Users\krist\AppData\Local\Programs\Python\Python311\Lib\site-packages\hsfs\core\arrow_flight_client.py", line 209, in __init__
    self._health_check()
  File "c:\Users\krist\AppData\Local\Programs\Python\Python311\Lib\site-packages\retrying.py", line 55, in wrapped_f
    return Retrying(*dargs, **dkw).call(f, *args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\krist\AppData\Local\Programs\Python\Python311\Lib\site-packages\retrying.py", line 289, in call
    raise attempt.get()
          ^^^^^^^^^^^^^
  File "c:\Users\krist\AppData\Local\Programs\Python\Python311\Lib\site-packages\retrying.py", line 326, in get
    raise exc.with_traceback(tb)
  File "c:\Users\krist\AppData\Local\Programs\Python\Python311\Lib\site-packages\retrying.py", line 273, in call
    attempt = Attempt(fn(*args, **kwargs), attempt_number, F

In [11]:
# History (when available) is placed ahead of today's readings so the feature
# helpers can look back over previous days.
combined_df = aq_df if historical_df.empty else pd.concat([historical_df, aq_df], ignore_index=True)

if combined_df.empty:
    aq_df = pd.DataFrame()
    print("‚ö†Ô∏è  No data available for processing")
else:
    combined_df = airquality.add_rolling_window_feature(
        combined_df, window_days=3, column="pm25", new_column="pm25_rolling_3d"
    )
    combined_df = airquality.add_lagged_features(combined_df, column="pm25", lags=[1, 2, 3])
    combined_df = airquality.add_nearby_sensor_feature(
        combined_df, locations, column="pm25_lag_1d", n_closest=3
    )

    # Data quality check 2: keep today's rows only, then drop those without a pm25 value.
    is_today = combined_df["date"].dt.date == today
    before_cleaning = int(is_today.sum())

    aq_df = combined_df[is_today].copy()
    aq_df = aq_df.dropna(subset=['pm25'])

    after_cleaning = len(aq_df)
    if before_cleaning > after_cleaning:
        print(f"üßπ Removed {before_cleaning - after_cleaning} rows with NaN values after feature engineering")

    print(f"üìä Final data quality: {len(aq_df)} clean rows ready for feature store")
aq_df.head()

üìä Final data quality: 102 clean rows ready for feature store


Unnamed: 0,pm25,country,city,street,date,sensor_id,feed_url,pm25_rolling_3d,pm25_lag_1d,pm25_lag_2d,pm25_lag_3d,pm25_nearby_avg
0,4.0,Sweden,√ñrnsk√∂ldsvik,H√∂rnettv√§gen,2025-12-12,105325,https://api.waqi.info/feed/A105325/,,,,,
0,2.0,Sweden,Uppsala,Kuggebro,2025-12-12,107110,https://api.waqi.info/feed/A107110/,,,,,
0,22.0,Sweden,Gothenburg,B√•gskyttegatan,2025-12-12,112672,https://api.waqi.info/feed/A112672/,,,,,
0,2.0,Sweden,S√∂derby,Eker√∂v√§gen,2025-12-12,112993,https://api.waqi.info/feed/A112993/,,,,,
0,2.0,Sweden,St√§ket,Aron Lindgrens v√§g,2025-12-12,113539,https://api.waqi.info/feed/A113539/,,,,,


In [12]:
# Stack the per-sensor forecasts into one frame and clean it for insertion.
weather_df = pd.concat(weathers) if weathers else pd.DataFrame()

if weather_df.empty:
    print("‚ö†Ô∏è  No weather data available")
else:
    weather_df["date"] = pd.to_datetime(weather_df["date"])

    # Data quality check 3: Remove rows with missing weather data
    initial_weather_count = len(weather_df)
    required_columns = ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max']
    weather_df = weather_df.dropna(subset=required_columns)

    # Convert to float32 to match Hopsworks feature group schema
    float32_columns = required_columns + ["wind_direction_10m_dominant"]
    weather_df = weather_df.astype({column: "float32" for column in float32_columns})

    if len(weather_df) < initial_weather_count:
        print(f"üßπ Removed {initial_weather_count - len(weather_df)} rows with missing weather data")

    print(f"üå§Ô∏è  Weather data quality: {len(weather_df)} clean weather rows")
weather_df.head()

üå§Ô∏è  Weather data quality: 714 clean weather rows


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city,sensor_id,latitude,longitude
0,2025-12-12 12:00:00,-5.9195,0.0,7.92,44.0,√ñrnsk√∂ldsvik,105325,63.274,18.684
1,2025-12-13 12:00:00,2.4305,1.1,23.039999,176.0,√ñrnsk√∂ldsvik,105325,63.274,18.684
2,2025-12-14 12:00:00,3.3805,0.5,23.039999,189.0,√ñrnsk√∂ldsvik,105325,63.274,18.684
3,2025-12-15 12:00:00,3.3,0.0,13.170786,176.081833,√ñrnsk√∂ldsvik,105325,63.274,18.684
4,2025-12-16 12:00:00,1.65,0.0,2.545584,81.869987,√ñrnsk√∂ldsvik,105325,63.274,18.684


In [13]:
# Final validation before inserting to feature store: both frames must contain
# rows, otherwise nothing is inserted and each empty frame is reported.
has_air_quality = not aq_df.empty
has_weather = not weather_df.empty

if has_air_quality and has_weather:
    print(f"‚úÖ Inserting {len(aq_df)} air quality rows and {len(weather_df)} weather rows to feature store")
    air_quality_fg.insert(aq_df)
    weather_fg.insert(weather_df)
    print("üìÅ Data successfully inserted to feature store")
else:
    if not has_air_quality:
        print("‚ö†Ô∏è  No clean air quality data to insert")
    if not has_weather:
        print("‚ö†Ô∏è  No clean weather data to insert")

‚úÖ Inserting 102 air quality rows and 714 weather rows to feature store
2025-12-12 10:09:25,376 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1774972


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 102/102 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/air_quality_all_1_offline_fg_materialization/executions
2025-12-12 10:09:39,899 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1783130


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 714/714 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_all_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279184/jobs/named/weather_all_1_offline_fg_materialization/executions
üìÅ Data successfully inserted to feature store
