# 3. Training Pipeline

## 3.1. Setup

### 3.1.1. Import Libraries and initialize Hopsworks Connection

In [None]:
# Standard imports
import os
import sys
import json
import time
from datetime import date, datetime, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")
warnings.filterwarnings("ignore", category=DeprecationWarning)

#  Establish project root directory
def find_project_root(start: Path) -> Path:
    """Locate the project root by searching upward for ``pyproject.toml``.

    ``start`` is checked first, then each of its ancestors from nearest to
    farthest. The first directory containing a ``pyproject.toml`` file is
    returned; if none is found, ``start`` itself is returned.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the project root, starting the search at the current working directory.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

# Make project-local packages importable without adding a duplicate entry.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:
    sys.path.append(root_dir_str)

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

# Load project settings (API keys and usernames) via the project's config module.
settings = config.HopsworksSettings()
# These settings expose get_secret_value(); unwrap them to the raw values used below.
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks and obtain a handle to the project's feature store.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

### 3.1.2. Repository Management

In [None]:
# Clone or update the project repository, then make it the working directory so
# that the relative paths used later in this notebook resolve inside it.
# NOTE(review): assumes clone_or_update_repo returns the local checkout path — confirm.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

### 3.1.3. Configure API Keys and Secrets

In [None]:
today = datetime.today().date()

# The AQICN key is mandatory here; stop early when it is not configured.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Replace any previously stored secret so Hopsworks holds the current key.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Best effort: removal may fail simply because no secret exists yet.
    # Report the reason instead of hiding it, then continue to (re)create it.
    print(f"Could not remove existing AQICN_API_KEY secret (continuing): {exc}")

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

In [None]:
# secrets = hopsworks.get_secrets_api()
# AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

# # Retrieve feature groups
# air_quality_fg = fs.get_feature_group(
#     name="air_quality",
#     version=1,
# )

# weather_fg = fs.get_feature_group(
#     name="weather",
#     version=1,
# )

## 3.2. Get Feature Groups

In [None]:
# Obtain the air-quality and weather feature group handles from the feature store.
# NOTE(review): presumably this returns existing groups when they are already
# present (section title is "Get Feature Groups") — confirm in hopsworks_admin.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 3.3. Load Sensor Locations from Feature Group

In [None]:
# Load data from the air_quality feature group (read once, reused below).
aq_data = air_quality_fg.read()

if len(aq_data) == 0:
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

# Reuse the frame that was just read instead of issuing a second identical read.
existing_aq_data = aq_data
existing_sensors = set(existing_aq_data["sensor_id"].unique())
print(f"üìã Found {len(existing_sensors)} sensors in feature store")

# Build the sensor location dictionary:
#   sensor_id -> (latitude, longitude, city, street, country)
# Only the first row seen for each sensor_id is kept.
location_columns = ["sensor_id", "latitude", "longitude", "city", "street", "country"]
unique_locations = existing_aq_data[location_columns].drop_duplicates(subset=["sensor_id"])
sensor_locations = {
    loc.sensor_id: (loc.latitude, loc.longitude, loc.city, loc.street, loc.country)
    for loc in unique_locations.itertuples(index=False)
}
print(f"üìç Loaded locations for {len(sensor_locations)} existing sensors")

## 3.4. Create Additional Feature Views
Create multiple feature views with different feature combinations (baseline, rolling windows, lagged features, nearby sensors, complete) for model comparison.

In [None]:
def _weather_joined_feature_view(name, description, aq_columns):
    """Build one feature view from air-quality columns joined with weather.

    Selects ``aq_columns`` from the air-quality feature group, joins all
    weather features on ``sensor_id`` and ``date``, and gets or creates
    version 1 of the feature view ``name`` with ``pm25`` as the label.
    Returns the ``(query, feature_view)`` pair.
    """
    query = air_quality_fg.select(aq_columns).join(
        weather_fg.select_features(), on=["sensor_id", "date"])
    view = fs.get_or_create_feature_view(
        name=name,
        description=description,
        version=1,
        labels=["pm25"],
        query=query,
    )
    return query, view


# Weather only.
baseline_features, baseline_feature_view = _weather_joined_feature_view(
    "air_quality_baseline_fv",
    "Weather features for PM2.5 prediction",
    ["pm25", "date", "sensor_id"],
)

# Weather + 3-day rolling PM2.5.
rolling_features, rolling_feature_view = _weather_joined_feature_view(
    "air_quality_rolling_fv",
    "Weather features, PM2.5 rolling window (3d) for PM2.5 prediction",
    ["pm25", "pm25_rolling_3d", "date", "sensor_id"],
)

# Weather + nearby-sensor average.
nearby_features, nearby_feature_view = _weather_joined_feature_view(
    "air_quality_nearby_fv",
    "Weather features, PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    ["pm25", "pm25_nearby_avg", "date", "sensor_id"],
)

# Weather + PM2.5 lags of increasing depth.
lagged_1d_features, lagged_1d_feature_view = _weather_joined_feature_view(
    "air_quality_lagged_1d_fv",
    "Weather features, PM2.5 lags (1d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "date", "sensor_id"],
)

lagged_2d_features, lagged_2d_feature_view = _weather_joined_feature_view(
    "air_quality_lagged_2d_fv",
    "Weather features, PM2.5 lags (1d, 2d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "pm25_lag_2d", "date", "sensor_id"],
)

lagged_3d_features, lagged_3d_feature_view = _weather_joined_feature_view(
    "air_quality_lagged_3d_fv",
    "Weather features, PM2.5 lags (1d, 2d, 3d) for PM2.5 prediction",
    ["pm25", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d", "date", "sensor_id"],
)

# Every engineered PM2.5 feature together with weather.
complete_features, complete_feature_view = _weather_joined_feature_view(
    "air_quality_complete_fv",
    "Weather features, PM2.5 rolling window (3d), and PM2.5 lags (1d, 2d, 3d), and PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    ["pm25", "pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d", "pm25_nearby_avg", "date", "sensor_id"],
)

## 3.5. Model Training Setup
Initialize containers for models, predictions, and results, and define the feature view dictionary used for iteration.

In [None]:
# Containers filled during training: nested dicts for fitted models and for
# test-set predictions, plus a flat list of per-task metric records.
models, y_preds = defaultdict(dict), defaultdict(dict)
results = list()

# Short configuration name -> feature view; iteration follows this order.
feature_views = dict(
    baseline=baseline_feature_view,
    rolling=rolling_feature_view,
    nearby=nearby_feature_view,
    lagged_1d=lagged_1d_feature_view,
    lagged_2d=lagged_2d_feature_view,
    lagged_3d=lagged_3d_feature_view,
    complete=complete_feature_view,
)

## 3.7. Model Training Loop
Train XGBoost models for each feature combination and sensor, run 5 iterations per configuration, select best model based on R2 score, and store results.

In [None]:
# Candidate models are kept as local artifacts under the project root (for
# reproducibility or auditing) rather than all being pushed to the Model Registry.
model_dir = f"{root_dir}/models"
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(model_dir, exist_ok=True)

print(f"Model directory: {model_dir}")

In [None]:
# Average the weather feature group's columns per (sensor_id, date) pair.
# The result is only held in memory here; nothing is written back to the feature group.
weather_rows = weather_fg.read()
weather_by_sensor_day = weather_rows.groupby(["sensor_id", "date"])
weather_daily_mean = weather_by_sensor_day.mean().reset_index()

## 3.8. Debug data quality
Optional diagnostics (commented out by default) for inspecting missing values per sensor and feature view.

In [None]:
# #  DEBUG:
# print("üîç DEBUGGING DATA QUALITY")
# print("="*60)

# # Check one sensor's data
# first_feature_name = list(feature_views.keys())[0]
# sample_data = feature_views[first_feature_name].query.read()
# test_sensor_id = sample_data["sensor_id"].iloc[0]

# print(f"\nüìä Checking sensor {test_sensor_id}:")

# for feature_name, feature_view in feature_views.items():
#     data = feature_view.query.read()
#     sensor_data = data[data["sensor_id"] == test_sensor_id].copy()
    
#     if len(sensor_data) == 0:
#         print(f"  {feature_name}: NO DATA")
#         continue
    
#     print(f"\n  {feature_name}: {len(sensor_data)} rows")
#     print(f"    Date range: {sensor_data['date'].min()} to {sensor_data['date'].max()}")
    
#     # Check each column for NaN
#     for col in sensor_data.columns:
#         nan_count = sensor_data[col].isna().sum()
#         nan_pct = (nan_count / len(sensor_data)) * 100
#         if nan_pct > 0:
#             print(f"    {col}: {nan_count}/{len(sensor_data)} NaN ({nan_pct:.1f}%)")
    
#     # Show first few rows
#     print(f"\n    Sample data:")
#     print(sensor_data.head())
#     break  # Only check baseline for now

# print("\n" + "="*60)

Training loop

In [None]:
# print(feature_name, data.shape)
# print(data["sensor_id"].unique()[:10])

In [None]:
# print("=== FEATURE VIEW CHECK ===")
# for feature_name, feature_view in feature_views.items():
#     data = feature_view.query.read()
#     print(feature_name, data.shape)
#     print("unique sensors:", len(data["sensor_id"].unique()))
#     print("sample sensors:", data["sensor_id"].unique()[:5])
#     print()

In [None]:
# for feature_name, feature_view in feature_views.items():
#     data = feature_view.query.read()
#     data['date'] = pd.to_datetime(data['date']).dt.tz_localize(None)

#     for sensor_id in data["sensor_id"].unique():
#         df = data[data["sensor_id"] == sensor_id].copy()
#         print(df.isna().sum())

In [None]:
# Start from a clean slate: one empty per-sensor dict for every feature view.
models = {}
y_preds = {}
for view_name in feature_views:
    models[view_name] = {}
    y_preds[view_name] = {}
results = []

### Load Feature Views

In [None]:
# Read each feature view's query result once and keep it in memory, with the
# "date" column converted to timezone-naive datetimes.
feature_data_cache = {}

for feature_name, feature_view in feature_views.items():
    print("Reading", feature_name)
    df = feature_view.query.read()
    naive_dates = pd.to_datetime(df["date"]).dt.tz_localize(None)
    df["date"] = naive_dates
    feature_data_cache[feature_name] = df

# Persist the cache to disk in the current working directory.
joblib.dump(feature_data_cache, "feature_data_cache.pkl")
print("Saved feature data cache.")

### Prepare a List of All Tasks

In [None]:
# One task per (feature configuration, sensor) pair, in cache order.
tasks = [
    (feature_name, sensor_id)
    for feature_name, df in feature_data_cache.items()
    for sensor_id in df["sensor_id"].unique()
]

joblib.dump(tasks, "tasks.pkl")
print("Prepared task list:", len(tasks), "tasks")

### Load Checkpoints if they exist

In [None]:
def _load_checkpoint(path, default):
    """Return the object stored at ``path``, or ``default`` if the file is absent."""
    if not os.path.exists(path):
        return default
    return joblib.load(path)


# Resume from earlier progress when checkpoint files exist in the working directory.
models = _load_checkpoint("models.pkl", {})
results = _load_checkpoint("results.pkl", [])
y_preds = _load_checkpoint("preds.pkl", {})

### Training Loop

In [None]:
# Train one model per (feature configuration, sensor) task, checkpointing
# after every task so an interrupted run can resume where it stopped.
for feature_name, sensor_id in tasks:
    print(feature_name, sensor_id)

    # Skip tasks already present (e.g. restored from a checkpoint).
    if feature_name in models and sensor_id in models[feature_name]:
        print("Skipping", feature_name, sensor_id)
        continue

    df = feature_data_cache[feature_name]
    # Sort by date BEFORE the positional split below: the 80/20 split is meant
    # to be chronological, and the order of rows read from the feature view is
    # not established anywhere in this notebook. This mirrors the cleaning
    # done later when predictions are generated.
    df = (
        df[df["sensor_id"] == sensor_id]
        .sort_values("date")
        .reset_index(drop=True)
        .dropna(subset=["pm25"])
    )

    # Too little history to fit and evaluate a model.
    if len(df) < 10:
        continue

    # Everything that is not the target, a key, or location metadata is a feature.
    feature_cols = [c for c in df.columns if c not in [
        "pm25","date","sensor_id","city","street","country",
        "latitude","longitude","aqicn_url"
    ]]

    # First 80% of rows for training, remaining 20% for testing.
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    if len(test_df) < 2:
        continue

    X_train = train_df[feature_cols]
    y_train = train_df["pm25"]
    X_test = test_df[feature_cols]
    y_test = test_df["pm25"]

    # Infinite sentinels guarantee the first fitted model is always accepted,
    # so best_model can never be left as None after the loop.
    best_model = None
    best_r2 = float("-inf")
    best_mse = float("inf")

    # Fit five candidates that differ only in random seed; keep the best R2.
    # NOTE(review): no subsampling is configured, so the seed may not change
    # the fitted model — confirm whether the repeated fits are needed.
    for i in range(5):
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.05,
            random_state=165439*i
        )
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        r2 = r2_score(y_test, pred)
        mse = mean_squared_error(y_test, pred)

        if r2 > best_r2:
            best_r2 = r2
            best_mse = mse
            best_model = model

    # Save results in memory
    models.setdefault(feature_name, {})[sensor_id] = best_model
    y_preds.setdefault(feature_name, {})[sensor_id] = best_model.predict(X_test)
    results.append({
        "feature_name": feature_name,
        "sensor_id": sensor_id,
        "R2": best_r2,
        "MSE": best_mse,
        "train_size": len(train_df),
        "test_size": len(test_df),
    })

    # Save checkpoint after each task
    joblib.dump(models, "models.pkl")
    joblib.dump(results, "results.pkl")
    joblib.dump(y_preds, "preds.pkl")

    print("Saved", feature_name, sensor_id)

In [None]:
# for feature_name, feature_view in feature_views.items():
#     data = feature_view.query.read()
#     data['date'] = pd.to_datetime(data['date']).dt.tz_localize(None)

#     for sensor_id in data["sensor_id"].unique():
#         df = data[data["sensor_id"] == sensor_id].copy()

#         # Drop rows where pm25 is NaN
#         df = df.dropna(subset=["pm25"])
        
#         if len(df) < 10:
#             print(f"‚ö†Ô∏è  Skipping sensor {sensor_id} ({feature_name}): insufficient data ({len(df)} rows)")
#             continue

#         # Identify which features this feature view actually uses
#         # Drop non-feature columns first
#         feature_cols = [c for c in df.columns if c not in ["pm25", "date", "sensor_id", "city", "street", "country", "latitude", "longitude", "aqicn_url"]]
        
#         # Keep rows with NaN in features - XGBoost handles NaN natively
#         df_clean = df.copy()
        
#         if len(df_clean) < 10:
#             print(f"‚ö†Ô∏è  Skipping sensor {sensor_id} ({feature_name}): insufficient clean data ({len(df_clean)} rows, needed features: {feature_cols})")
#             continue

#         # Split
#         train_size = int(0.8 * len(df_clean))
#         train_df = df_clean.iloc[:train_size]
#         test_df = df_clean.iloc[train_size:]

#         if len(test_df) < 2:
#             print(f"‚ö†Ô∏è  Skipping sensor {sensor_id} ({feature_name}): test set too small ({len(test_df)} rows)")
#             continue

#         # Prepare features and target
#         X_train = train_df[feature_cols]
#         y_train = train_df["pm25"]
#         X_test = test_df[feature_cols]
#         y_test = test_df["pm25"]

#         best_r2 = -float('inf')
#         best_mse = float('inf')
#         best_model = None

#         mse_list = []
#         r2_list = []
        
#         for i in range(5):
#             model = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=165439*i)
#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)
#             r2 = r2_score(y_test, y_pred)
#             mse = mean_squared_error(y_test, y_pred)
#             r2_list.append(r2)
#             mse_list.append(mse)
            
#             if r2 > best_r2:
#                 best_r2 = r2
#                 best_mse = mse
#                 best_model = model

#         models[feature_name][sensor_id] = best_model

#         if best_model is not None:
#             y_preds[feature_name][sensor_id] = best_model.predict(X_test)
#             results.append({
#                 "feature_name": feature_name,
#                 "sensor_id": sensor_id,
#                 "MSE": sum(mse_list) / len(mse_list),
#                 "R2": sum(r2_list) / len(r2_list),
#                 "train_size": len(X_train),
#                 "test_size": len(X_test),
#             })
#             print(f"‚úÖ {feature_name} - Sensor {sensor_id}: R2={best_r2:.3f}, MSE={best_mse:.2f}")
#         else:
#             print(f"‚ö†Ô∏è  No valid model for {feature_name} - {sensor_id}")

In [None]:
# for feature_name, feature_view in feature_views.items():
#     data = feature_view.query.read()
#     data['date'] = pd.to_datetime(data['date']).dt.tz_localize(None)

#     for sensor_id in metadata_df.index:
#         df = data[data["sensor_id"] == sensor_id].copy()

#         # Clean the data before splitting
#         df = df.dropna(subset=["pm25"])
#         # features_for_cleaning = df.drop(columns=["pm25", "date", "city", "sensor_id"])
#         features_for_cleaning = df.drop(columns=["pm25", "date", "sensor_id"])

#         target_for_cleaning = df["pm25"]
#         clean_mask = ~(features_for_cleaning.isna().any(axis=1) | target_for_cleaning.isna())
#         df_clean = df[clean_mask].copy()
#         if len(df_clean) < 10:
#             print(f"‚ö†Ô∏è  Skipping sensor {sensor_id}: insufficient data after cleaning ({len(df_clean)} rows)")
#             continue

#         train_size = int(0.8 * len(df_clean))
#         train_df = df_clean.iloc[:train_size]
#         test_df = df_clean.iloc[train_size:]

#         if len(test_df) < 2:
#             print(f"‚ö†Ô∏è  Skipping sensor {sensor_id}: test set too small after split ({len(test_df)} rows)")
#             continue

#         # Drop non-feature columns (pm25 is target, others are metadata)
#         X_train = train_df.drop(columns=["pm25", "date", "city", "sensor_id"])
#         y_train = train_df["pm25"]
#         X_test = test_df.drop(columns=["pm25", "date", "city", "sensor_id"])
#         y_test = test_df["pm25"]

#         best_r2 = -float('inf')
#         best_mse = float('inf')
#         best_model = None

#         mse_list = []
#         r2_list = []
#         for i in range(5):
#             model = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=165439*i)
#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)
#             r2 = r2_score(y_test, y_pred)
#             mse = mean_squared_error(y_test, y_pred)
#             r2_list.append(r2)
#             mse_list.append(mse)
#             if r2 > best_r2:
#                 best_r2 = r2
#                 best_mse = mse
#                 best_model = model

#         models[feature_name][sensor_id] = best_model

#         if best_model is not None:
#             y_preds[feature_name][sensor_id] = best_model.predict(X_test)
#             results.append({
#                 "feature_name": feature_name,
#                 "sensor_id": sensor_id,
#                 "MSE": sum(mse_list) / len(mse_list),
#                 "R2": sum(r2_list) / len(r2_list),
#                 "train_size": len(X_train),
#                 "test_size": len(X_test),
#             })
#         else:
#             print(f"‚ö†Ô∏è  No valid model trained for {feature_name} - {sensor_id}, R2 scores: {r2_list}, Best R2: {best_r2}")
            

## 3.9. Model Selection and Saving
Identify best performing model for each sensor, save models and feature importance plots, and prepare test data with predictions.

### 3.9.1. Find Best Model per Sensor

In [None]:
# Tabulate all training results and keep, for each sensor, the row with the
# highest R2. The original row labels are preserved in an "index" column.
results_df = pd.DataFrame(results)
best_row_labels = results_df.groupby("sensor_id")["R2"].idxmax()
best_models = results_df.loc[best_row_labels].reset_index(drop=False)

print("Best models per sensor:")
summary_columns = ["sensor_id", "feature_name", "R2", "MSE"]
print(best_models[summary_columns])

### 3.9.2. Cache all Feature Data

In [None]:
# Read every feature view once and convert its "date" column to
# timezone-naive datetimes as the frame is stored.
cached_feature_data = {}
for name, fv in feature_views.items():
    df_cached = fv.query.read()
    df_cached["date"] = pd.to_datetime(df_cached["date"]).dt.tz_localize(None)
    cached_feature_data[name] = df_cached

### 3.9.3. Loop Through Models

In [None]:
# For every sensor's best model: save the model and its feature-importance
# plot, predict over the sensor's history, and collect the last 18 months of
# actual vs. predicted PM2.5 (with location metadata) for the hindcast plots.
all_test_data = []

# --- Loop through best models ---
for _, row in best_models.iterrows():
    sensor_id = row['sensor_id']
    print(sensor_id)

    best_feature = row["feature_name"]

    # --- Save model + feature importance ---
    sensor_dir = f"{model_dir}/{sensor_id}"
    images_dir = f"{sensor_dir}/images"
    os.makedirs(images_dir, exist_ok=True)

    best_model = models[best_feature][sensor_id]
    best_model.save_model(f"{sensor_dir}/model.json")

    plot_importance(best_model)
    plt.savefig(f"{images_dir}/feature_importance.png")
    plt.close()

    # --- Load cached feature view data ---
    df = cached_feature_data[best_feature]
    df = df[df["sensor_id"] == sensor_id].copy()

    # --- Same cleaning as training: chronological order, target present ---
    df = df.sort_values("date").reset_index(drop=True)
    df = df.dropna(subset=["pm25"])

    feature_cols = [
        c for c in df.columns
        if c not in ["pm25", "date", "sensor_id", "city", "street", "country", "latitude", "longitude", "aqicn_url"]
    ]

    # Rows with NaN in feature columns are kept, as in training.
    df_clean = df.copy()
    if len(df_clean) < 10:
        continue

    # --- Generate predictions for FULL dataset ---
    df_clean["predicted_pm25"] = best_model.predict(df_clean[feature_cols])
    df_clean["best_model"] = best_feature

    # --- Filter to last 18 months for hindcast visualization ---
    cutoff_date = pd.Timestamp.now() - pd.DateOffset(months=18)
    df_hindcast = df_clean[df_clean["date"] >= cutoff_date].copy()

    if len(df_hindcast) == 0:
        print(f"‚ö†Ô∏è Skipping sensor {sensor_id}: no data in last 18 months")
        continue

    # Look the sensor up in sensor_locations, which is keyed by sensor_id.
    # (aq_data has a positional row index, so testing membership in
    # aq_data.index does not identify a sensor.)
    location = sensor_locations.get(sensor_id)
    if location is None:
        print(f"‚ö†Ô∏è Skipping sensor {sensor_id}: metadata not found")
        continue

    latitude, longitude, city, street, _country = location
    df_hindcast["latitude"] = latitude
    df_hindcast["longitude"] = longitude
    df_hindcast["city"] = city
    df_hindcast["street"] = street
    df_hindcast["sensor_id"] = sensor_id

    # Collect the 18-month window computed above, not the full history.
    all_test_data.append(
        df_hindcast[["date", "sensor_id", "pm25", "predicted_pm25", "latitude", "longitude", "city", "street", "best_model"]]
    )

In [None]:
# --- DIAGNOSTIC: Check data coverage by year ---
# Prints, for one example sensor of the first feature view, how many rows and
# how many missing PM2.5 values exist per calendar year.
print("\nüìä DATA COVERAGE DIAGNOSTIC")
print("="*60)

for feature_name, df_cached in cached_feature_data.items():
    print(f"\n{feature_name}:")
    
    # Check one sensor as example
    sample_sensor = df_cached["sensor_id"].iloc[0]
    sensor_data = df_cached[df_cached["sensor_id"] == sample_sensor].copy()
    sensor_data["year"] = pd.to_datetime(sensor_data["date"]).dt.year
    
    print(f"  Sensor {sample_sensor}:")
    print(f"  Total rows: {len(sensor_data)}")
    print(f"  Date range: {sensor_data['date'].min()} to {sensor_data['date'].max()}")
    
    # Check pm25 by year.
    # size() counts every row in the group, whereas "count" would count only
    # non-NaN values and make the derived "Non-NaN" column subtract NaNs twice.
    pm25_by_year = sensor_data.groupby("year")["pm25"]
    yearly_stats = pd.DataFrame({
        "Total Rows": pm25_by_year.size(),
        "NaN Count": pm25_by_year.apply(lambda values: values.isna().sum()),
    })
    yearly_stats["Non-NaN"] = yearly_stats["Total Rows"] - yearly_stats["NaN Count"]
    print("\n  PM2.5 availability by year:")
    print(yearly_stats)
    
    # After dropna
    sensor_data_clean = sensor_data.dropna(subset=["pm25"])
    if len(sensor_data_clean) > 0:
        print(f"\n  After dropna(subset=['pm25']):")
        print(f"    Remaining rows: {len(sensor_data_clean)}")
        print(f"    Date range: {sensor_data_clean['date'].min()} to {sensor_data_clean['date'].max()}")
        print(f"    Years present: {sorted(sensor_data_clean['year'].unique())}")
    else:
        print(f"\n  ‚ö†Ô∏è NO DATA LEFT after dropna!")
    
    break  # Only check first feature view for now

print("\n" + "="*60)

In [None]:
# # Find best model (highest R2) for each sensor
# results_df = pd.DataFrame(results)
# best_models = results_df.loc[results_df.groupby('sensor_id')['R2'].idxmax()]
# best_models = results_df.loc[results_df.groupby('sensor_id')['R2'].idxmax()]

# print("Best models per sensor:")
# print(best_models[['sensor_id', 'feature_name', 'R2', 'MSE']])

# all_data = baseline_features.read()
# all_data['date'] = pd.to_datetime(all_data['date']).dt.tz_localize(None)

# cached_feature_data = {
#     name: fv.query.read()
#     for name, fv in feature_views.items()
# }

# all_test_data = []

# for _, row in best_models.iterrows():
#     sensor_id = row['sensor_id']
#     best_feature = row['feature_name']
    
#     sensor_dir = f"{model_dir}/{sensor_id}"
#     if not os.path.exists(sensor_dir):
#         os.mkdir(sensor_dir)
#     images_dir = f"{model_dir}/{sensor_id}/images"
#     if not os.path.exists(images_dir):
#         os.mkdir(images_dir)

#     best_model = models[best_feature][sensor_id]
#     model_path = f"{sensor_dir}/model.json"
#     plot_importance(best_model)
#     importance_path = f"{images_dir}/feature_importance.png"
#     plt.savefig(importance_path)
#     plt.close()
    
#     best_model.save_model(model_path)

#     # Use the same feature view and data processing logic that was used for training
#     best_feature_view = feature_views[best_feature]
#     sensor_data = cached_feature_data[best_feature]
#     sensor_data['date'] = pd.to_datetime(sensor_data['date']).dt.tz_localize(None)
    
#     df = sensor_data[sensor_data['sensor_id'] == sensor_id].copy()
    
#     df = df.sort_values("date").reset_index(drop=True)

#     # Apply EXACT same cleaning logic as in training loop
#     df = df.dropna(subset=['pm25'])
    
#     # Recompute feature columns exactly as in training
#     feature_cols = [c for c in df.columns if c not in ["pm25", "date", "sensor_id", "location_id"]]

#     # Drop rows with NaN in any feature
#     df_clean = df.dropna(subset=feature_cols).copy()

#     if len(df_clean) < 10:
#         continue

#     # Same split as training
#     train_size = int(0.8 * len(df_clean))
#     test_df = df_clean.iloc[train_size:].copy()

#     # # Create feature matrix for comprehensive NaN cleaning (same as training)
#     # features_for_cleaning = df.drop(columns=["pm25", "date", "sensor_id"])
#     # target_for_cleaning = df["pm25"]
    
#     # # Remove rows with NaN values in any feature or target (same as training)
#     # clean_mask = ~(features_for_cleaning.isna().any(axis=1) | target_for_cleaning.isna())
#     # df_clean = df[clean_mask].copy()
    
#     # if len(df_clean) < 10:
#     #     continue
        
#     # # Split the cleaned data (same as training)
#     # train_size = int(0.8 * len(df_clean))
#     # test_df = df_clean.iloc[train_size:].copy()
    
#     if len(test_df) == 0:
#         continue
    
#     # Test data is already clean from the comprehensive cleaning above
#     clean_test_df = test_df.copy()
#     predictions = y_preds[best_feature][sensor_id]
    
#     if len(clean_test_df) == len(predictions):
#         clean_test_df['predicted_pm25'] = predictions
#         clean_test_df['best_model'] = best_feature
#         all_test_data.append(clean_test_df[['date', 'pm25', 'predicted_pm25', 'latitude', 'longitude', 'best_model']])
#     else:
#         print(f"‚ö†Ô∏è  Skipping sensor {sensor_id}: prediction length mismatch ({len(predictions)} vs {len(clean_test_df)})")

## 3.10. Model Registration & Visualization
Create prediction plots for each sensor, register model in Hopsworks model registry with metrics and save models with their configuration.

### 3.10.1. Setup

In [None]:
# Debug output for whatever `df` currently holds.
# NOTE(review): in top-to-bottom order this runs before `df` is rebuilt from
# all_test_data in the next cell, so it shows the frame left over from an
# earlier cell rather than the combined prediction table.
print("Columns in df:", df.columns.tolist())
print("df shape:", df.shape)

In [None]:
mr = project.get_model_registry()

# Combine the per-sensor prediction frames, order them by date, and split them
# back out per sensor for plotting and registration.
if all_test_data:
    df = pd.concat(all_test_data, ignore_index=True).sort_values(by=["date"])
    df_by_sensor = {sid: g.copy() for sid, g in df.groupby("sensor_id")}
else:
    # No predictions were collected: an empty frame has no "date" or
    # "sensor_id" column, so sorting or grouping it would raise KeyError.
    df = pd.DataFrame()
    df_by_sensor = {}

### 3.10.2. Precompute Best Model per Sensor

In [None]:
# best_model_per_sensor = (
#     results_df.sort_values("R2", ascending=False)
#               .groupby("sensor_id")
#               .first()[["feature_name", "R2", "MSE"]]
#               .rename(columns={"feature_name": "best_model"})
# )

### 3.10.3. Precompute Training Dataset Versions per Model Type

In [None]:
# Map each model type (feature view name) to a training dataset version,
# or to None when no version could be determined.
training_versions = {}

for model_name, fv in feature_views.items():
    try:
        # Check if a training dataset already exists
        # NOTE(review): confirm get_training_data_versions() exists on the
        # feature view object; if it does not, the broad except below turns
        # the resulting error into a None entry for this model type.
        existing = fv.get_training_data_versions()
        if existing:
            version = existing[-1]   # presumably the latest version — assumes ascending order, TODO confirm
            print(f"‚ÑπÔ∏è Using existing training dataset for {model_name}: version {version}")
            training_versions[model_name] = version
            continue

        # Otherwise create a new one
        td_version, td_job = fv.create_training_data(
            description=f"Training data for model type {model_name}",
            data_format="csv",
            write_options={"wait_for_job": True}
        )

        # The first returned element may be an object with a .version attribute
        # or the version value itself; accept both.
        version = td_version.version if hasattr(td_version, "version") else td_version
        training_versions[model_name] = version
        print(f"‚úÖ Created training dataset for {model_name}: version {version}")

    except Exception as e:
        # Best effort: a failure for one model type is reported and recorded as
        # None so the remaining model types are still processed.
        print(f"‚ö†Ô∏è Could not create training dataset for model type {model_name}: {e}")
        training_versions[model_name] = None


In [None]:
# training_versions = {}

# for model_name, fv in feature_views.items():
#     try:
#         td_version, td_job = fv.create_training_data(
#             description=f"Training data for model type {model_name}",
#             data_format="csv",
#             write_options={"wait_for_job": True}
#         )

#         # Handle both possible return types
#         if hasattr(td_version, "version"):
#             training_versions[model_name] = td_version.version
#         else:
#             # td_version is already an int
#             training_versions[model_name] = td_version

#     except Exception as e:
#         print(f"‚ö†Ô∏è Could not create training dataset for model type {model_name}: {e}")
#         training_versions[model_name] = None


### 3.10.4. Loop over Sensors

In [None]:
# best_models carries a positional index and stores the winning configuration
# in "feature_name", so index it by sensor_id for direct per-sensor lookup.
best_by_sensor = best_models.set_index("sensor_id")

# For every sensor with predictions: plot the hindcast, then register the
# sensor's best model (with its metrics) in the model registry.
for sensor_id, sensor_frame in df_by_sensor.items():
    df_subset = sensor_frame.copy()

    # Get city/street directly from the data
    city = df_subset['city'].iloc[0]
    street = df_subset['street'].iloc[0]

    # Get best model info
    if sensor_id not in best_by_sensor.index:
        continue

    best_row = best_by_sensor.loc[sensor_id]
    best_model_name = best_row["feature_name"]
    best_model_r2 = float(best_row["R2"])
    best_model_mse = float(best_row["MSE"])

    # Get feature view for this model type
    best_model_feature_view = feature_views[best_model_name]

    # Get precomputed training dataset version (None when unavailable)
    training_dataset_version = training_versions.get(best_model_name, None)

    # Drop unnecessary columns for visualization
    df_subset = df_subset.drop(columns=["latitude", "longitude", "best_model", "sensor_id", "city", "street"], errors="ignore")

    images_dir = f"{model_dir}/{sensor_id}/images"
    os.makedirs(images_dir, exist_ok=True)
    image_path = f"{images_dir}/hindcast_training.png"

    fig = visualization.plot_air_quality_forecast(
        city, street, df_subset, image_path, hindcast=True
    )
    if fig is not None:
        fig.suptitle(f"{city} {street} (Best Model: {best_model_name})")
        plt.close(fig)

    # Register model
    model_kwargs = {
        "name": f"air_quality_xgboost_model_{sensor_id}",
        "metrics": {
            "R2": best_model_r2,
            "MSE": best_model_mse,
        },
        "feature_view": best_model_feature_view,
        "description": (
            f"Air Quality (PM2.5) predictor for {city} {street} "
            f"using {best_model_name} configuration"
        ),
    }

    # Only pass the training dataset version when one was determined.
    if training_dataset_version is not None:
        model_kwargs["training_dataset_version"] = training_dataset_version

    aq_model = mr.python.create_model(**model_kwargs)
    aq_model.save(f"{model_dir}/{sensor_id}")

print("‚úÖ All models registered and visualizations generated.")


In [None]:
# mr = project.get_model_registry()
# df = pd.concat(all_test_data, ignore_index=True) if all_test_data else pd.DataFrame()
# df = df.sort_values(by=["date"])

# # Plot the best model for each sensor
# for sensor_id, meta in metadata_df.iterrows():
#     city = meta["city"]
#     street = meta["street"]
#     latitude = meta["latitude"]
#     longitude = meta["longitude"]
    
#     df_subset = df[(df["latitude"] == latitude) & (df["longitude"] == longitude)].copy()
#     if len(df_subset) == 0:
#         continue
    
#     # Get the best model name for display
#     best_model_name = df_subset['best_model'].iloc[0] if 'best_model' in df_subset.columns else 'unknown'
#     best_model_r2 = df_subset['R2'].iloc[0] if 'R2' in df_subset.columns else 0
#     best_model_mse = df_subset['MSE'].iloc[0] if 'MSE' in df_subset.columns else 0
#     best_model_feature_view = feature_views[best_model_name]
    
#     df_subset = df_subset.sort_values(by=["date"])
#     df_subset = df_subset.drop(columns=["latitude", "longitude", "best_model"])
    
#     images_dir = f"{model_dir}/{sensor_id}/images"
#     image_path = f"{images_dir}/hindcast_training.png"
    
#     fig = visualization.plot_air_quality_forecast(
#         city, street, df_subset, image_path, hindcast=True
#     )
#     if fig is not None:
#         fig.suptitle(f"{city} {street} (Best Model: {best_model_name})")
#         plt.close(fig)  # Clean up after saving

#     aq_model = mr.python.create_model(
#         name=f"air_quality_xgboost_model_{sensor_id}",
#         metrics={
#             "R2": best_model_r2,
#             "MSE": best_model_mse,
#         },
#         feature_view=best_model_feature_view,
#         description=f"Air Quality (PM2.5) predictor for {city} {street} using {best_model_name} configuration",
#     )

#     aq_model.save(f"{model_dir}/{sensor_id}")

## 3.11. Upload visuals to HopsFS

In [None]:
# Upload each sensor's hindcast plot to the project's file system under
# Resources/plots/<sensor_id>/.
dataset_api = project.get_dataset_api()

base_dir = "Resources/plots"
try:
    dataset_api.mkdir(base_dir)
except Exception:
    # Best effort: the directory presumably already exists. Catching Exception
    # rather than using a bare except keeps Ctrl-C / SystemExit working.
    pass

uploaded_images = 0

for sensor_id in sensor_locations.keys():
    sensor_dir = f"{base_dir}/{sensor_id}"
    try:
        dataset_api.mkdir(sensor_dir)
    except Exception:
        # Same best-effort handling as for the base directory.
        pass

    local_path = f"{model_dir}/{sensor_id}/images/hindcast_training.png"
    remote_path = f"{sensor_dir}/hindcast_training.png"

    # safe_upload reports success as a truthy value.
    ok = hopsworks_admin.safe_upload(dataset_api, local_path, remote_path)
    if ok:
        uploaded_images += 1
        print(f"Uploaded image for sensor {sensor_id} ({uploaded_images}/{len(sensor_locations)})")
    else:
        print(f"‚ùå Giving up on sensor {sensor_id} after repeated failures")

print(f"Done uploading {uploaded_images} images.")