# 3. Training Pipeline

## 3.1. Environment Setup
Detect if running in Google Colab or local environment, handle repository cloning, dependency installation, numpy compatibility fixes, and set up Python path.

In [None]:
import sys
from pathlib import Path
import hopsworks
import warnings

# Hide warnings whose originating module name matches "IPython".
warnings.filterwarnings("ignore", module="IPython")

def clone_repository() -> None:
    repo_dir = Path("pm25-forecast-openmeteo-aqicn")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd pm25-forecast-openmeteo-aqicn
    else:
        print("Cloning repository...")
        !git clone https://github.com/KristinaPalmquist/pm25-forecast-openmeteo-aqicn.git
        %cd pm25-forecast-openmeteo-aqicn

def install_dependencies() -> None:
    """Install the project's dependencies with ``uv``.

    Upgrades ``uv`` through pip, then runs ``uv pip install`` against
    ``pyproject.toml`` in the current directory with all extras and the
    ``--system`` flag. Relies on IPython ``!`` shell escapes, so it can only
    run inside a notebook.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


# Resolve the project root: start from the working directory and step up one
# level for each known sub-folder name it currently ends with (checked in
# this fixed order).
root_dir = Path().absolute()
for subfolder in ("src", "airquality", "notebooks"):
    if root_dir.name == subfolder:
        root_dir = root_dir.parent
root_dir = str(root_dir)

# Make the project root importable, without adding a duplicate entry.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Project-local import; presumably resolvable only because root_dir was
# appended to sys.path just before this point — verify if the cell is moved.
from utils import config

# Load settings from the .env file in the project root, unwrap the API key,
# log in to Hopsworks and fetch the project's feature store handle.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
project = hopsworks.login(engine="python", api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

## 3.2. Imports

In [None]:
from datetime import datetime
import pandas as pd
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import hopsworks
from utils import airquality
from collections import defaultdict
import json
import warnings
import os

# Suppress every warning category from here on (catch-all "ignore" filter).
warnings.filterwarnings("ignore")

## 3.3. Hopsworks configuration
Configure Hopsworks connection, retrieve API keys, connect to feature store, and get air quality and weather feature groups.

In [None]:
# Read the API key from settings; unwrap it when the stored value exposes
# get_secret_value(), otherwise use it as-is (including None when missing).
_raw_api_key = getattr(settings, "HOPSWORKS_API_KEY", None)
HOPSWORKS_API_KEY = (
    _raw_api_key.get_secret_value()
    if hasattr(_raw_api_key, "get_secret_value")
    else _raw_api_key
)

# Log in and grab the feature store handle.
project = hopsworks.login(engine="python", api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

# The AQICN key is stored as a Hopsworks secret.
secrets = hopsworks.get_secrets_api()
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

today = datetime.today().date()

# Retrieve version 1 of the air quality and weather feature groups.
air_quality_fg = fs.get_feature_group(name="air_quality_all", version=1)
weather_fg = fs.get_feature_group(name="weather_all", version=1)

## 3.4. Sensor Location Loading
Load sensor location metadata from Hopsworks secrets for all sensors.

In [None]:
# Build {sensor_id: location dict} from every Hopsworks secret whose name
# starts with the sensor-location prefix; the sensor id is the secret name
# with that prefix text removed, and the secret value is parsed as JSON.
SENSOR_LOCATION_PREFIX = "SENSOR_LOCATION_JSON_"

all_secrets = secrets.get_secrets()
locations = {}
for secret in all_secrets:
    if not secret.name.startswith(SENSOR_LOCATION_PREFIX):
        continue
    location_str = secrets.get_secret(secret.name).value
    if not location_str:
        # Secrets with an empty value are ignored.
        continue
    sensor_id = secret.name.replace(SENSOR_LOCATION_PREFIX, "")
    locations[sensor_id] = json.loads(location_str)

## 3.5. Feature View Creation
Create multiple feature views with different feature combinations (baseline, rolling windows, lagged features, nearby sensors, complete) for model comparison.

In [None]:
def _make_feature_view(name, description, air_quality_columns):
    """Build one air-quality/weather query and its feature view.

    Args:
        name: Name of the feature view to get or create.
        description: Description stored with the feature view.
        air_quality_columns: Columns selected from ``air_quality_fg``.

    Returns:
        A ``(query, feature_view)`` tuple. The query joins the selected air
        quality columns with ``weather_fg.select_features()`` on
        ``sensor_id``; the view is version 1 with ``pm25`` as its label.
    """
    # NOTE(review): the join key is sensor_id only — confirm how rows from the
    # two feature groups are aligned on date.
    query = air_quality_fg.select(air_quality_columns).join(
        weather_fg.select_features(), on=["sensor_id"]
    )
    view = fs.get_or_create_feature_view(
        name=name,
        description=description,
        version=1,
        labels=["pm25"],
        query=query,
    )
    return query, view


# Weather features only.
baseline_features, baseline_feature_view = _make_feature_view(
    "air_quality_baseline_fv",
    "Weather features for PM2.5 prediction",
    ["pm25", "date", "sensor_id"],
)

# Weather + 3-day rolling PM2.5.
rolling_features, rolling_feature_view = _make_feature_view(
    "air_quality_rolling_fv",
    "Weather features, PM2.5 rolling window (3d) for PM2.5 prediction",
    ["pm25", "pm25_rolling_3d", "date", "sensor_id"],
)

# Weather + nearby-sensor average.
nearby_features, nearby_feature_view = _make_feature_view(
    "air_quality_nearby_fv",
    "Weather features, PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    ["pm25", "pm25_nearby_avg", "date", "sensor_id"],
)

# Weather + PM2.5 lags of increasing depth.
lagged_1d_features, lagged_1d_feature_view = _make_feature_view(
    "air_quality_lagged_1d_fv",
    "Weather features, PM2.5 lags (1d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "date", "sensor_id"],
)

lagged_2d_features, lagged_2d_feature_view = _make_feature_view(
    "air_quality_lagged_2d_fv",
    "Weather features, PM2.5 lags (1d, 2d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "pm25_lag_2d", "date", "sensor_id"],
)

lagged_3d_features, lagged_3d_feature_view = _make_feature_view(
    "air_quality_lagged_3d_fv",
    "Weather features, PM2.5 lags (1d, 2d, 3d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d", "date", "sensor_id"],
)

# Every engineered PM2.5 feature together.
complete_features, complete_feature_view = _make_feature_view(
    "air_quality_complete_fv",
    "Weather features, PM2.5 rolling window (3d), and PM2.5 lags (1d, 2d, 3d), and PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    [
        "pm25",
        "pm25_rolling_3d",
        "pm25_lag_1d",
        "pm25_lag_2d",
        "pm25_lag_3d",
        "pm25_nearby_avg",
        "date",
        "sensor_id",
    ],
)

## 3.6. Model Training Setup
Set up the test data split date, initialize containers for models and predictions, and define the feature view dictionary for iteration.

In [None]:
# Fixed-date test split, currently disabled:
#   start_date_test_data = "2025-04-01"
#   test_start = datetime.strptime(start_date_test_data, "%Y-%m-%d")

# Two-level containers addressed as [feature_name][sensor_id], plus a flat
# list that collects one metrics record per trained configuration.
models, y_preds = defaultdict(dict), defaultdict(dict)
results = []

# Feature-set label -> feature view; iterated in this order during training.
feature_views = dict(
    baseline=baseline_feature_view,
    rolling=rolling_feature_view,
    nearby=nearby_feature_view,
    lagged_1d=lagged_1d_feature_view,
    lagged_2d=lagged_2d_feature_view,
    lagged_3d=lagged_3d_feature_view,
    complete=complete_feature_view,
)

## 3.7. Model Training Loop
Train XGBoost models for each feature combination and sensor, run 5 iterations per configuration, select best model based on R2 score, and store results.

In [None]:
# Target and metadata columns that are removed before fitting.
NON_FEATURE_COLUMNS = ["pm25", "date", "city", "sensor_id"]

for feature_name, feature_view in feature_views.items():
    # Read the joined data for this feature set once; dates become tz-naive.
    data = feature_view.query.read()
    data["date"] = pd.to_datetime(data["date"]).dt.tz_localize(None)

    for sensor_id in locations:
        df = data[data["sensor_id"] == sensor_id].copy()

        # Keep only rows that have a target and a value for every feature.
        df = df.dropna(subset=["pm25"])
        complete_rows = df.drop(columns=NON_FEATURE_COLUMNS).notna().all(axis=1)
        df_clean = df[complete_rows].copy()
        if len(df_clean) < 10:
            print(f"⚠️  Skipping sensor {sensor_id}: insufficient data after cleaning ({len(df_clean)} rows)")
            continue

        # Positional 80/20 split.
        # NOTE(review): this is chronological only if the rows arrive in date
        # order — confirm, since the frame is not sorted here.
        train_size = int(0.8 * len(df_clean))
        train_df, test_df = df_clean.iloc[:train_size], df_clean.iloc[train_size:]

        if len(test_df) < 2:
            print(f"⚠️  Skipping sensor {sensor_id}: test set too small after split ({len(test_df)} rows)")
            continue

        X_train, y_train = train_df.drop(columns=NON_FEATURE_COLUMNS), train_df["pm25"]
        X_test, y_test = test_df.drop(columns=NON_FEATURE_COLUMNS), test_df["pm25"]

        # Fit five models with different seeds, keep the one with the highest
        # test R2, and report the mean MSE / R2 over all five runs.
        best_r2, best_mse, best_model = -float("inf"), float("inf"), None
        mse_list, r2_list = [], []
        for run in range(5):
            candidate = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=165439 * run)
            candidate.fit(X_train, y_train)
            candidate_pred = candidate.predict(X_test)
            r2 = r2_score(y_test, candidate_pred)
            mse = mean_squared_error(y_test, candidate_pred)
            r2_list.append(r2)
            mse_list.append(mse)
            if r2 > best_r2:
                best_r2, best_mse, best_model = r2, mse, candidate

        models[feature_name][sensor_id] = best_model

        if best_model is None:
            # Reached only when no run produced an R2 greater than -inf.
            print(f"⚠️  No valid model trained for {feature_name} - {sensor_id}, R2 scores: {r2_list}, Best R2: {best_r2}")
            continue

        y_preds[feature_name][sensor_id] = best_model.predict(X_test)
        results.append({
            "feature_name": feature_name,
            "sensor_id": sensor_id,
            "MSE": sum(mse_list) / len(mse_list),
            "R2": sum(r2_list) / len(r2_list),
            "train_size": len(X_train),
            "test_size": len(X_test),
        })

## 3.8. Create Model Directory
Create the local directory in which the trained models are stored.

In [None]:
# Local directory that receives one sub-folder per sensor with its model
# and images. exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
model_dir = f"{root_dir}/models"
os.makedirs(model_dir, exist_ok=True)

## 3.9. Model Selection and Saving
Identify best performing model for each sensor, save models and feature importance plots, and prepare test data with predictions.

In [None]:
# Find the best feature set (highest mean R2) for each sensor.
# Explicit columns keep the frame well-formed when no model was trained, so
# the groupby below is never asked for a missing 'sensor_id' column.
results_df = pd.DataFrame(
    results,
    columns=["feature_name", "sensor_id", "MSE", "R2", "train_size", "test_size"],
)
if results_df.empty:
    print("⚠️  No trained models found: nothing to select or save")
    best_models = results_df.copy()
else:
    best_models = results_df.loc[results_df.groupby('sensor_id')['R2'].idxmax()]

# NOTE(review): all_data is not used anywhere in this cell; it is kept in
# case a later cell relies on it — confirm and drop the read if not.
all_data = baseline_features.read()
all_data['date'] = pd.to_datetime(all_data['date']).dt.tz_localize(None)

# Target and metadata columns excluded from the feature matrix.
non_feature_columns = ["pm25", "date", "city", "sensor_id"]

# Each feature view is read from the feature store at most once and then
# shared by every sensor whose best model uses that feature set.
feature_data_cache = {}

all_test_data = []
for _, row in best_models.iterrows():
    sensor_id = row['sensor_id']
    best_feature = row['feature_name']

    # Creates models/<sensor_id>/ and models/<sensor_id>/images/ in one call.
    sensor_dir = f"{model_dir}/{sensor_id}"
    images_dir = f"{model_dir}/{sensor_id}/images"
    os.makedirs(images_dir, exist_ok=True)

    # Save the feature-importance plot and the model itself.
    best_model = models[best_feature][sensor_id]
    model_path = f"{sensor_dir}/model.json"
    plot_importance(best_model)
    importance_path = f"{images_dir}/feature_importance.png"
    plt.savefig(importance_path)
    plt.close()

    best_model.save_model(model_path)

    # Re-derive the test rows with the same cleaning and split as training.
    # NOTE(review): the stored predictions are matched to these rows purely
    # by position, which assumes this read returns rows in the same order as
    # the read used for training — confirm.
    if best_feature not in feature_data_cache:
        feature_data = feature_views[best_feature].query.read()
        feature_data['date'] = pd.to_datetime(feature_data['date']).dt.tz_localize(None)
        feature_data_cache[best_feature] = feature_data
    sensor_data = feature_data_cache[best_feature]

    df = sensor_data[sensor_data['sensor_id'] == sensor_id].copy()

    # Drop rows without a target, then rows with a NaN in any feature.
    df = df.dropna(subset=['pm25'])
    features_for_cleaning = df.drop(columns=non_feature_columns)
    target_for_cleaning = df["pm25"]
    clean_mask = ~(features_for_cleaning.isna().any(axis=1) | target_for_cleaning.isna())
    df_clean = df[clean_mask].copy()

    if len(df_clean) < 10:
        continue

    # Same positional 80/20 split as in training; keep the test part.
    train_size = int(0.8 * len(df_clean))
    test_df = df_clean.iloc[train_size:].copy()

    if len(test_df) == 0:
        continue

    clean_test_df = test_df.copy()
    predictions = y_preds[best_feature][sensor_id]

    # Only attach predictions when they line up one-to-one with the test rows.
    if len(clean_test_df) == len(predictions):
        clean_test_df['predicted_pm25'] = predictions
        clean_test_df['best_model'] = best_feature
        all_test_data.append(clean_test_df[['date', 'pm25', 'predicted_pm25', 'latitude', 'longitude', 'best_model']])
    else:
        print(f"⚠️  Skipping sensor {sensor_id}: prediction length mismatch ({len(predictions)} vs {len(clean_test_df)})")


## 3.10. Model Registration & Visualization
Create prediction plots for each sensor, register model in Hopsworks model registry with metrics and metadata, and save models with their configuration.

In [None]:
mr = project.get_model_registry()

# Combine the per-sensor test frames. With nothing collected, fall back to an
# empty frame that still has the expected columns, so the sort and the
# per-sensor filters below do not raise on missing columns.
if all_test_data:
    df = pd.concat(all_test_data, ignore_index=True)
else:
    print("⚠️  No test predictions available: nothing to plot or register")
    df = pd.DataFrame(
        columns=["date", "pm25", "predicted_pm25", "latitude", "longitude", "best_model"]
    )
df = df.sort_values(by=["date"])

# The combined test frame carries no R2/MSE columns, so the metrics for the
# registry are taken from the per-sensor winners in best_models instead.
if {"sensor_id", "R2", "MSE"} <= set(best_models.columns):
    metrics_by_sensor = best_models.set_index("sensor_id")[["R2", "MSE"]].to_dict("index")
else:
    metrics_by_sensor = {}

# Plot and register the best model for each sensor
for sensor_id, location in locations.items():
    city = location["city"]
    street = location["street"]
    latitude = location["latitude"]
    longitude = location["longitude"]

    # NOTE(review): sensors are matched to their test rows by exact
    # latitude/longitude equality — confirm the stored location values have
    # the same type and precision as the feature data.
    df_subset = df[(df["latitude"] == latitude) & (df["longitude"] == longitude)].copy()
    if len(df_subset) == 0:
        continue

    # Winning feature set and its mean metrics (0 when no record exists).
    best_model_name = df_subset['best_model'].iloc[0]
    sensor_metrics = metrics_by_sensor.get(sensor_id, {})
    best_model_r2 = float(sensor_metrics.get("R2", 0))
    best_model_mse = float(sensor_metrics.get("MSE", 0))
    best_model_feature_view = feature_views[best_model_name]

    df_subset = df_subset.sort_values(by=["date"])
    df_subset = df_subset.drop(columns=["latitude", "longitude", "best_model"])

    images_dir = f"{model_dir}/{sensor_id}/images"
    image_path = f"{images_dir}/hindcast_training.png"

    # Bind the helper's return value to its own name so the module-level
    # matplotlib `plt` is not rebound.
    # NOTE(review): the title is set after the helper received image_path; if
    # the helper saves the figure itself, the saved file will not show this
    # title — confirm.
    forecast_plot = airquality.plot_air_quality_forecast(
        city, street, df_subset, image_path, hindcast=True
    )
    forecast_plot.title(f"{city} {street} (Best Model: {best_model_name})")

    aq_model = mr.python.create_model(
        name=f"air_quality_xgboost_model_{sensor_id}",
        metrics={
            "R2": best_model_r2,
            "MSE": best_model_mse,
        },
        feature_view=best_model_feature_view,
        description=f"Air Quality (PM2.5) predictor for {city} {street} using {best_model_name} configuration",
    )

    # Upload the sensor's local model directory (model file and images).
    aq_model.save(f"{model_dir}/{sensor_id}")
