# 3. Training Pipeline

## 3.1. Setup

### 3.1.1. Import Libraries

In [None]:
# Standard imports
import os
from pathlib import Path
import sys
import json
import time
from datetime import date, datetime, timedelta
from dotenv import load_dotenv
import warnings

# Keep notebook output readable: ignore warnings attributed to modules whose
# name starts with "IPython", and ignore every DeprecationWarning.
warnings.filterwarnings("ignore", module="IPython")
warnings.filterwarnings("ignore", category=DeprecationWarning)

#  Establish project root directory
def find_project_root(start: Path):
    """Return the nearest directory at or above *start* containing ``pyproject.toml``.

    The search begins with *start* itself and walks up through its parents.
    When no directory holds the marker file, *start* is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Resolve the repository root, searching upwards from the working directory.
root_dir = find_project_root(Path.cwd())
print("Project root dir:", root_dir)

# Put the root on sys.path (once) so the project-local packages can be imported.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:
    sys.path.append(root_dir_str)

# Third-party imports
import requests
import pandas as pd
import numpy as np
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from confluent_kafka import KafkaException
from hsfs.client.exceptions import RestAPIError
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from scipy.spatial.distance import cdist

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

# Current local calendar date (a datetime.date, no time component).
today = date.today()


### 3.1.2. Load Settings and Initialize Hopsworks Connection 

In [None]:

def detect_environment():
    """Classify the current runtime as ``"job"``, ``"jupyter"`` or ``"local"``.

    Returns ``"job"`` when any of the ``HOPSWORKS_JOB_ID``,
    ``HOPSWORKS_PROJECT_ID`` or ``HOPSWORKS_JOB_NAME`` environment variables is
    set, ``"jupyter"`` when the working directory lies under
    ``/hopsfs/Jupyter``, and ``"local"`` otherwise.
    """
    job_markers = ("HOPSWORKS_JOB_ID", "HOPSWORKS_PROJECT_ID", "HOPSWORKS_JOB_NAME")
    if any(marker in os.environ for marker in job_markers):
        return "job"

    return "jupyter" if os.getcwd().startswith("/hopsfs/Jupyter") else "local"

env = detect_environment()
print(f"Detected environment: {env}")

# Secrets: when running on Hopsworks (job or hosted Jupyter) copy them from the
# secret store into the process environment; otherwise read a local .env file.
if env not in ("job", "jupyter"):
    load_dotenv()
else:
    project = hopsworks.login()
    secrets_api = hopsworks.get_secrets_api()

    for key in ("HOPSWORKS_API_KEY", "AQICN_API_KEY", "GH_PAT", "GH_USERNAME"):
        os.environ[key] = secrets_api.get_secret(key).value

# Settings object from utils.config; the secret fields are unwrapped to plain
# strings in the same order as before (Hopsworks key, AQICN key, GitHub user).
settings = config.HopsworksSettings()

HOPSWORKS_API_KEY, AQICN_API_KEY, GITHUB_USERNAME = (
    secret_field.get_secret_value()
    for secret_field in (
        settings.HOPSWORKS_API_KEY,
        settings.AQICN_API_KEY,
        settings.GH_USERNAME,
    )
)

# Log in with the API key and keep a handle to the project's feature store.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

print("Environment initialized and Hopsworks connected!")


### 3.1.3. Repository Management

In [None]:
# Obtain the local path of the project repository for the configured GitHub
# user (helper lives in utils.hopsworks_admin; presumably clones or pulls —
# verify there) and make it the working directory for the following cells.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

### 3.1.4. Configure API Keys and Secrets

In [None]:
secrets = hopsworks.get_secrets_api()

# Make sure the AQICN key is stored as a Hopsworks secret. The lookup is used
# purely as an existence probe: any failure is treated as "secret missing" and
# triggers creation. Catch Exception instead of a bare ``except`` so that
# KeyboardInterrupt and SystemExit still propagate.
try:
    secrets.get_secret("AQICN_API_KEY")
except Exception:
    secrets.create_secret("AQICN_API_KEY", settings.AQICN_API_KEY.get_secret_value())

### 3.1.5. Get Model Registry

In [None]:
# Handle to the project's model registry; used in the registration cell to
# create and save one model per sensor.
mr = project.get_model_registry()

## 3.2. Load Feature Groups & Sensor Locations

In [None]:
# Air-quality and weather feature groups, obtained through the project helper.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

# Read the whole air-quality feature group; without rows there is nothing to
# train on, so stop the run with a non-zero exit status.
aq_data = air_quality_fg.read()

if not len(aq_data):
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)


# Per-sensor location metadata, looked up by sensor id in later cells.
sensor_locations = metadata.get_sensor_locations(air_quality_fg)
print(f"üìç Loaded locations for {len(sensor_locations)} sensors")

## 3.3. Create Additional Feature Views
Create multiple feature views with different feature combinations (baseline, rolling windows, lagged features, nearby sensors, complete) for model comparison.

In [None]:
def _make_feature_view(name, description, pm25_features=()):
    """Build one air-quality/weather join query and its feature view.

    ``pm25_features`` are the engineered PM2.5 columns selected in addition to
    the ``pm25`` label and the ``date`` / ``sensor_id`` join keys. The query is
    joined with all weather features and registered (or fetched) as version 1
    of the feature view *name*. Returns ``(query, feature_view)``.
    """
    selected = ["pm25", *pm25_features, "date", "sensor_id"]
    query = air_quality_fg.select(selected).join(
        weather_fg.select_features(), on=["sensor_id", "date"])
    view = fs.get_or_create_feature_view(
        name=name,
        description=description,
        version=1,
        labels=["pm25"],
        query=query,
    )
    return query, view


# One feature view per feature combination, created in the same order as the
# comparison runs: baseline, rolling, nearby, lagged (1d/2d/3d), complete.
baseline_features, baseline_feature_view = _make_feature_view(
    "air_quality_baseline_fv",
    "Weather features for PM2.5 prediction",
)

rolling_features, rolling_feature_view = _make_feature_view(
    "air_quality_rolling_fv",
    "Weather features, PM2.5 rolling window (3d) for PM2.5 prediction",
    ["pm25_rolling_3d"],
)

nearby_features, nearby_feature_view = _make_feature_view(
    "air_quality_nearby_fv",
    "Weather features, PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    ["pm25_nearby_avg"],
)

lagged_1d_features, lagged_1d_feature_view = _make_feature_view(
    "air_quality_lagged_1d_fv",
    "Weather features, PM2.5 lags (1d) for PM2.5 prediction",
    ["pm25_lag_1d"],
)

lagged_2d_features, lagged_2d_feature_view = _make_feature_view(
    "air_quality_lagged_2d_fv",
    "Weather features, PM2.5 lags (1d, 2d) for PM2.5 prediction",
    ["pm25_lag_1d", "pm25_lag_2d"],
)

lagged_3d_features, lagged_3d_feature_view = _make_feature_view(
    "air_quality_lagged_3d_fv",
    "Weather features, PM2.5 lags (1d, 2d, 3d) for PM2.5 prediction",
    ["pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d"],
)

complete_features, complete_feature_view = _make_feature_view(
    "air_quality_complete_fv",
    "Weather features, PM2.5 rolling window (3d), and PM2.5 lags (1d, 2d, 3d), and PM2.5 nearby average (1d lag, 3 sensors) for PM2.5 prediction",
    ["pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d", "pm25_nearby_avg"],
)

## 3.4. Training Setup

### 3.4.1. Define Training Directory

In [None]:
# Directory under the project root where trained models and their plots are
# written. makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has
# no check-then-create race and also creates missing parent directories.
model_dir = f"{root_dir}/models"
os.makedirs(model_dir, exist_ok=True)

print(f"Model directory: {model_dir}")

### 3.4.2. Define Model Hyperparameters

In [None]:
# Label column: rows without it are dropped and it is used as y in training.
TARGET = "pm25"

# Fraction of a sensor's rows (taken from the start of the frame) used for
# training; the remainder is the test split.
TRAIN_RATIO = 0.8
# Sensors with fewer labelled rows than this are skipped.
MIN_ROWS = 10
# Sensors whose test split has fewer rows than this are skipped.
MIN_TEST_ROWS = 2

# Columns never passed to the model: the label, the join keys and the
# descriptive sensor metadata.
EXCLUDE_COLS = [
    "pm25","date","sensor_id","city","street","country",
    "latitude","longitude","aqicn_url"
]

# Number of fits per (feature view, sensor) task; the fit with the highest R2
# is kept.
N_RESTARTS = 5
# Multiplied by the restart index to derive each fit's random_state.
BASE_SEED = 165439

# XGBRegressor settings shared by every fit.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.05,
}

### 3.4.3. Initialize Containers for Results

In [None]:
# Prepare containers to store models, predictions, and results
models = defaultdict(dict)   # models[feature_name][sensor_id] -> best fitted model
y_preds = defaultdict(dict)  # y_preds[feature_name][sensor_id] -> test-split predictions
results = []                 # one dict per trained task (feature_name, sensor_id, R2, MSE, split sizes)

# Define feature view dictionary for iteration
# Keys are the short feature-set names used in results and in the stored
# "best_model" column.
feature_views = {
    "baseline": baseline_feature_view,
    "rolling": rolling_feature_view,
    "nearby": nearby_feature_view,
    "lagged_1d": lagged_1d_feature_view,
    "lagged_2d": lagged_2d_feature_view,
    "lagged_3d": lagged_3d_feature_view,
    "complete": complete_feature_view,
}

## 3.5. Training Loop
Train XGBoost models for each feature combination and sensor, run 5 iterations per configuration, select best model based on R2 score, and store results.

### 3.5.1. Load Feature View Data

In [None]:
# Read each feature view's query once and keep the frames in memory by name,
# so the training loop does not go back to the feature store per sensor.
feature_data_cache = {}
total_views = len(feature_views)

for i, feature_name in enumerate(feature_views, start=1):
    print(f"Reading ({i}/{total_views}): {feature_name}")

    feature_view = feature_views[feature_name]
    df = feature_view.query.read()
    # Store dates as timezone-naive timestamps.
    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)
    feature_data_cache[feature_name] = df

    print(f"    ‚úî Loaded {len(df):,} rows")

print(f"\n‚úÖ Loaded all {total_views} feature views")

### 3.5.2. Build Task List

In [None]:
# A task is one (feature view name, sensor id) pair; every sensor present in a
# cached feature view yields one task for that view.
tasks = []
total_views = len(feature_data_cache)

print(f"Building task list from {total_views} feature views:")

for i, (feature_name, df) in enumerate(feature_data_cache.items(), start=1):
    sensor_ids = df["sensor_id"].unique()
    count = len(sensor_ids)

    print(f"[{i}/{total_views}] {feature_name}: {count} sensors")

    tasks.extend((feature_name, sensor_id) for sensor_id in sensor_ids)

print(f"\n‚úÖ {len(tasks):,} total training tasks")

### 3.5.3. Training Loop

In [None]:
# Train one XGBoost regressor per (feature view, sensor) task. Each task is
# fitted N_RESTARTS times with different seeds and the fit with the highest
# test-split R2 is kept in `models`, `y_preds` and `results`.
total = len(tasks)

for idx, (feature_name, sensor_id) in enumerate(tasks, start=1):
    df = feature_data_cache[feature_name]
    # The train/test split below is positional, so the rows must be in
    # chronological order first. The cached frame comes back in whatever order
    # the feature store returned it; without sorting, test rows can predate
    # training rows and the split no longer measures forecasting ability.
    df = (
        df[df["sensor_id"] == sensor_id]
        .dropna(subset=[TARGET])
        .sort_values("date", kind="stable")
    )

    if len(df) < MIN_ROWS:
        continue

    feature_cols = [c for c in df.columns if c not in EXCLUDE_COLS]

    # First TRAIN_RATIO of the (now chronological) rows train, the rest test.
    train_size = int(TRAIN_RATIO * len(df))
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    if len(test_df) < MIN_TEST_ROWS:
        continue

    X_train = train_df[feature_cols]
    y_train = train_df[TARGET]
    X_test = test_df[feature_cols]
    y_test = test_df[TARGET]

    best_model = None
    best_pred = None
    best_r2 = -1e9
    best_mse = 1e9

    for i in range(N_RESTARTS):
        model = XGBRegressor(
            n_estimators=xgb_params["n_estimators"],
            learning_rate=xgb_params["learning_rate"],
            random_state=BASE_SEED*i
        )
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        r2 = r2_score(y_test, pred)
        mse = mean_squared_error(y_test, pred)

        if r2 > best_r2:
            best_r2 = r2
            best_mse = mse
            best_model = model
            best_pred = pred

    # No restart produced a comparable (non-NaN) score: there is nothing to
    # store, and calling predict on None would crash the whole run.
    if best_model is None:
        print(f"[{idx}/{total}] Skipped {feature_name} / sensor {sensor_id}: no valid score")
        continue

    # Store results (the winning fit's test predictions are reused as-is)
    models.setdefault(feature_name, {})[sensor_id] = best_model
    y_preds.setdefault(feature_name, {})[sensor_id] = best_pred
    results.append({
        "feature_name": feature_name,
        "sensor_id": sensor_id,
        "R2": best_r2,
        "MSE": best_mse,
        "train_size": len(train_df),
        "test_size": len(test_df),
    })

    if idx % 10 == 0 or idx == total:
        print(f"[{idx}/{total}] Trained {feature_name} / sensor {sensor_id}: R¬≤={best_r2:.3f}, MSE={best_mse:.2f}")

print(f"\n‚úÖ Training complete: {len(results)} models trained")

## 3.6. Model Selection

### 3.6.1. Identify Best Model per Sensor

In [None]:
results_df = pd.DataFrame(results)

# The training loop skips tasks with too few rows, so `results` can be empty.
# Fail with an explicit message instead of the opaque KeyError('sensor_id')
# that the groupby below would raise on a column-less frame.
if results_df.empty:
    raise RuntimeError(
        "No models were trained (every task was skipped); "
        "cannot select a best model per sensor."
    )

# For every sensor keep the result row (feature set) with the highest R2, and
# index the table by sensor id so later cells can iterate sensor by sensor.
best_models = results_df.loc[results_df.groupby("sensor_id")["R2"].idxmax()]
best_models = best_models.set_index("sensor_id")

print(f"Identified best models for {len(best_models)} sensors:\n")

summary = best_models[["feature_name", "R2", "MSE"]].sort_index()

print(summary.to_string())


### 3.6.2. Load Feature View Data

In [None]:
total_views = len(feature_views)
print(f"Loading {total_views} feature views...\n")

# Phase 1: read every feature view's query into memory, keyed by view name.
cached_feature_data = {}

for i, name in enumerate(feature_views, start=1):
    fv = feature_views[name]
    print(f"[{i}/{total_views}] Reading feature view: {name}...")
    df_cached = fv.query.read()
    cached_feature_data[name] = df_cached
    print(f"    ‚úî Loaded {len(df_cached):,} rows")

print("\nNormalizing date columns...\n")

# Phase 2: convert each frame's dates, in place, to timezone-naive timestamps.
for name in cached_feature_data:
    df_cached = cached_feature_data[name]
    df_cached["date"] = pd.to_datetime(df_cached["date"]).dt.tz_localize(None)
    print(f"    ‚úî Normalized dates for '{name}'")

print("\nAll feature views loaded and normalized")


### 3.9.3. Generate Predictions

In [None]:
# For every sensor: persist its best model and feature-importance plot, predict
# over all of its labelled rows, attach location metadata and collect the
# result frame in `all_test_data`.
all_test_data = []
total_sensors = len(best_models)

print(f"Processing {total_sensors} sensors...\n")

for idx, (_, row) in enumerate(best_models.iterrows(), start=1):
    sensor_id = row.name
    best_feature = row["feature_name"]

    status = []  # collect short status flags

    # --- Save model + feature importance ---
    sensor_dir = f"{model_dir}/{sensor_id}"
    images_dir = f"{sensor_dir}/images"
    os.makedirs(images_dir, exist_ok=True)

    best_model = models[best_feature][sensor_id]
    best_model.save_model(f"{sensor_dir}/model.json")
    plot_importance(best_model)
    plt.savefig(f"{images_dir}/feature_importance.png")
    plt.close()
    status.append("model+plot")

    # --- Load cached feature view data ---
    df = cached_feature_data[best_feature]
    df = df[df["sensor_id"] == sensor_id].copy()
    df = df.sort_values("date").reset_index(drop=True)
    df_clean = df.dropna(subset=[TARGET]).copy()

    # Use the same exclusion list and row threshold as the training loop so
    # the model always sees the columns it was fitted on and the two cells
    # cannot drift apart.
    feature_cols = [c for c in df_clean.columns if c not in EXCLUDE_COLS]

    if len(df_clean) < MIN_ROWS:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (too few rows)")
        continue

    # --- Full predictions ---
    X_full = df_clean[feature_cols]
    df_clean["predicted_pm25"] = best_model.predict(X_full)
    df_clean["best_model"] = best_feature
    status.append("predictions")

    # --- Hindcast window ---
    # Only used as a "has recent data" check here; 18 months matches the
    # HINDCAST_MONTHS constant that is defined in a later cell.
    cutoff_date = pd.Timestamp.now() - pd.DateOffset(months=18)
    df_hindcast = df_clean[df_clean["date"] >= cutoff_date]

    if df_hindcast.empty:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (no recent data)")
        continue
    status.append("hindcast")

    # --- Metadata ---
    if sensor_id in sensor_locations:
        # The fifth element (country) is not part of the stored columns.
        lat, lon, city, street, _country = sensor_locations[sensor_id]
        df_clean["latitude"] = lat
        df_clean["longitude"] = lon
        df_clean["city"] = city
        df_clean["street"] = street
        df_clean["sensor_id"] = sensor_id
        status.append("metadata")
    else:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (no metadata)")
        continue

    # --- Append final data ---
    all_test_data.append(
        df_clean[[
            "date", "sensor_id", "pm25", "predicted_pm25",
            "latitude", "longitude", "city", "street", "best_model"
        ]]
    )
    status.append("saved")

    # --- One-line summary ---
    print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: " + ", ".join(status))

print(f"\nDone. Successfully processed {len(all_test_data)} sensors.")

In [None]:
# --- DIAGNOSTIC: Check data coverage by year ---
# Prints, for the first sensor of the first cached feature view, how many rows
# exist per year and how many of them lack a pm25 value.
print("\nüìä DATA COVERAGE DIAGNOSTIC")
print("="*60)

for feature_name, df_cached in cached_feature_data.items():
    print(f"\n{feature_name}:")

    # An empty frame has no sensor to sample; report it instead of raising
    # IndexError on .iloc[0].
    if df_cached.empty:
        print("  (no rows in this feature view)")
        break

    # Check one sensor as example
    sample_sensor = df_cached["sensor_id"].iloc[0]
    sensor_data = df_cached[df_cached["sensor_id"] == sample_sensor].copy()
    sensor_data["year"] = pd.to_datetime(sensor_data["date"]).dt.year

    print(f"  Sensor {sample_sensor}:")
    print(f"  Total rows: {len(sensor_data)}")
    print(f"  Date range: {sensor_data['date'].min()} to {sensor_data['date'].max()}")

    # Check pm25 by year.
    # "size" counts every row; "count" would skip NaN values and so make
    # "Total Rows" (and the derived "Non-NaN" column) wrong.
    yearly_stats = sensor_data.groupby("year").agg({
        "pm25": ["size", lambda x: x.isna().sum()]
    })
    yearly_stats.columns = ["Total Rows", "NaN Count"]
    yearly_stats["Non-NaN"] = yearly_stats["Total Rows"] - yearly_stats["NaN Count"]
    print("\n  PM2.5 availability by year:")
    print(yearly_stats)

    # After dropna
    sensor_data_clean = sensor_data.dropna(subset=["pm25"])
    if len(sensor_data_clean) > 0:
        print(f"\n  After dropna(subset=['pm25']):")
        print(f"    Remaining rows: {len(sensor_data_clean)}")
        print(f"    Date range: {sensor_data_clean['date'].min()} to {sensor_data_clean['date'].max()}")
        print(f"    Years present: {sorted(sensor_data_clean['year'].unique())}")
    else:
        print(f"\n  ‚ö†Ô∏è NO DATA LEFT after dropna!")

    break  # Only check first feature view for now

print("\n" + "="*60)

### 3.6.3. Preparation for Visualization and Registration

In [None]:
# Combine the per-sensor prediction frames into one date-ordered table and
# index copies of it by sensor id. With no frames at all, fall back to an
# empty table and an empty mapping: an empty DataFrame has no "date" column,
# so it must not be sorted or grouped.
if all_test_data:
    df = pd.concat(all_test_data, ignore_index=True)
    df = df.sort_values(by=["date"])
    df_by_sensor = {sid: g.copy() for sid, g in df.groupby("sensor_id")}
else:
    df = pd.DataFrame()
    df_by_sensor = {}

# Length, in months, of the hindcast window used by the following cells.
HINDCAST_MONTHS = 18

## 3.7. Visualization
Generate plots and hindcast data.

### 3.7.1. Insert Predictions to Feature Group

In [None]:
# Create or get monitoring feature group
# Rows are keyed by (sensor_id, date, days_before_forecast_day) with `date` as
# the event time; the rows prepared below set days_before_forecast_day to 0.0.
monitor_fg = fs.get_or_create_feature_group(
    name="aq_predictions",
    description="Air Quality prediction monitoring from training",
    version=1,
    primary_key=["sensor_id", "date", "days_before_forecast_day"],
    event_time="date",
)

monitoring_predictions = []

print(f"\nPreparing predictions with features for {len(all_test_data)} sensors...\n")

# Engineered PM2.5 columns copied next to each prediction, stored under a
# "predicted_" prefix; a view lacking a column contributes NaN for it.
engineered_columns = (
    "pm25_rolling_3d",
    "pm25_lag_1d",
    "pm25_lag_2d",
    "pm25_lag_3d",
    "pm25_nearby_avg",
)

for sensor_df in all_test_data:
    if sensor_df.empty:
        continue

    sensor_id = sensor_df["sensor_id"].iloc[0]
    best_feature = sensor_df["best_model"].iloc[0]

    # All rows of this sensor from its winning feature view, oldest first.
    full_df = (
        cached_feature_data[best_feature]
        .loc[lambda frame: frame["sensor_id"] == sensor_id]
        .copy()
        .sort_values("date")
        .reset_index(drop=True)
    )

    # Keep only the trailing HINDCAST_MONTHS window.
    cutoff_date = pd.Timestamp.now() - pd.DateOffset(months=HINDCAST_MONTHS)
    hindcast_df = full_df[full_df["date"] >= cutoff_date].copy()

    if hindcast_df.empty:
        continue

    # Left join: dates without a prediction keep a NaN predicted_pm25.
    predictions_merged = hindcast_df.merge(
        sensor_df[["date", "predicted_pm25"]],
        on="date",
        how="left"
    )

    pred_df = predictions_merged[["date", "predicted_pm25"]].copy()
    pred_df["sensor_id"] = sensor_id
    pred_df["days_before_forecast_day"] = 0.0  # 0 for training/hindcast

    for source_column in engineered_columns:
        nan_fallback = pd.Series([np.nan] * len(predictions_merged))
        pred_df[f"predicted_{source_column}"] = predictions_merged.get(source_column, nan_fallback)

    monitoring_predictions.append(pred_df)
print(f"Prepared {len(monitoring_predictions)} sensors' predictions.")

# Combine all predictions
# Everything below runs only when at least one sensor produced a frame;
# otherwise a single notice is printed (see the final else).
if monitoring_predictions:
    all_monitoring_df = pd.concat(monitoring_predictions, ignore_index=True)
    
    # Ensure correct data types to match feature group schema
    all_monitoring_df["date"] = pd.to_datetime(all_monitoring_df["date"])
    all_monitoring_df["sensor_id"] = all_monitoring_df["sensor_id"].astype(int)
    all_monitoring_df["predicted_pm25"] = all_monitoring_df["predicted_pm25"].astype(float)
    all_monitoring_df["days_before_forecast_day"] = all_monitoring_df["days_before_forecast_day"].astype(float)
    
    # Also convert the predicted feature columns to float
    for col in ["predicted_pm25_rolling_3d", "predicted_pm25_lag_1d", 
                "predicted_pm25_lag_2d", "predicted_pm25_lag_3d", "predicted_pm25_nearby_avg"]:
        if col in all_monitoring_df.columns:
            all_monitoring_df[col] = all_monitoring_df[col].astype(float)
    
    print(f"\nüìä Total predictions to insert: {len(all_monitoring_df)}")
    # print(f"\nSample data:")
    # print(all_monitoring_df.head())
    # print(f"\nData types:")
    # print(all_monitoring_df.dtypes)
    
    # Insert into feature group with retry logic
    print(f"\nInserting {len(all_monitoring_df)} predictions into aq_predictions feature group...")

    BATCH_SIZE = 2000     # rows per insert call
    MAX_RETRIES = 5       # attempts per batch
    INITIAL_BACKOFF = 2  # seconds

    # Ceiling division: number of BATCH_SIZE-row slices needed to cover the frame.
    num_batches = (len(all_monitoring_df) + BATCH_SIZE - 1) // BATCH_SIZE
    successful_batches = 0
    failed_batches = []  # (batch_idx, start, end) of batches that exhausted their retries

    for batch_idx, start in enumerate(range(0, len(all_monitoring_df), BATCH_SIZE), start=1):
        end = min(start + BATCH_SIZE, len(all_monitoring_df))
        batch = all_monitoring_df.iloc[start:end]
        
        success = False
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # wait_for_job=False: do not block on the job triggered by the insert.
                monitor_fg.insert(batch, write_options={"wait_for_job": False})
                success = True
                successful_batches += 1
                print(f"‚úÖ Batch {batch_idx}/{num_batches} [{start}:{end}] inserted successfully")
                break
            # Only connection-level errors are retried; anything else propagates.
            except (ProtocolError, ConnectionError, Timeout, RequestException) as e:
                if attempt < MAX_RETRIES:
                    backoff_time = INITIAL_BACKOFF * (2 ** (attempt - 1))  # Exponential backoff
                    print(f"‚ö†Ô∏è Batch {batch_idx}/{num_batches} attempt {attempt}/{MAX_RETRIES} failed: {type(e).__name__}")
                    print(f"   Retrying in {backoff_time}s...")
                    time.sleep(backoff_time)
                else:
                    # Out of attempts: record the batch and move on to the next one.
                    print(f"‚ùå Batch {batch_idx}/{num_batches} FAILED after {MAX_RETRIES} attempts")
                    failed_batches.append((batch_idx, start, end))
        
        # Small delay between batches to avoid overwhelming the server
        if success and batch_idx < num_batches:
            time.sleep(0.5)
    
    print(f"\n{'='*60}")
    print(f"‚úÖ Successfully inserted {successful_batches}/{num_batches} batches")
    
    if failed_batches:
        print(f"‚ùå Failed batches: {len(failed_batches)}")
        for batch_idx, start, end in failed_batches:
            print(f"   Batch {batch_idx}: rows {start}-{end}")
    
    # Verify insertion
    # NOTE(review): the inserts above do not wait for their jobs, so the row
    # count read here may not yet include every batch — confirm if it matters.
    print("\nüîç Verifying insertion...")
    time.sleep(5)  # Wait longer for materialization
    
    verification_df = monitor_fg.read()
    print(f"‚úÖ Feature group now contains {len(verification_df)} total rows")
    print(f"   Training predictions prepared: {len(all_monitoring_df)}")
    
else:
    print("‚ö†Ô∏è No predictions to insert")


In [None]:
all_test_data = []
total_sensors = len(best_models)

print(f"Generating visualizations for {total_sensors} sensors...\n")

for idx, (sensor_id, row) in enumerate(best_models.iterrows(), start=1):
    best_feature = row["feature_name"]
    status = []

    # This sensor's rows from its winning feature view, oldest first, limited
    # to rows that have a target value.
    df = (
        cached_feature_data[best_feature]
        .loc[lambda frame: frame["sensor_id"] == sensor_id]
        .copy()
        .sort_values("date")
        .reset_index(drop=True)
        .dropna(subset=[TARGET])
    )

    if len(df) < MIN_ROWS:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (too few rows)")
        continue

    # Predict over every remaining row with the sensor's best model.
    feature_cols = [col for col in df.columns if col not in EXCLUDE_COLS]
    model_obj = models[best_feature][sensor_id]
    df["predicted_pm25"] = model_obj.predict(df[feature_cols])
    df["best_model"] = best_feature
    status.append("predictions")

    # Restrict the plotted data to the trailing HINDCAST_MONTHS window.
    cutoff_date = pd.Timestamp.now() - pd.DateOffset(months=HINDCAST_MONTHS)
    df_hindcast = df[df["date"] >= cutoff_date].copy()

    if df_hindcast.empty:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (no recent data)")
        continue
    status.append("hindcast")

    # Location metadata is required for the plot title and the output columns.
    if sensor_id not in sensor_locations:
        print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: skipped (no metadata)")
        continue

    lat, lon, city, street, country = sensor_locations[sensor_id]
    df["latitude"] = lat
    df["longitude"] = lon
    df["city"] = city
    df["street"] = street
    status.append("metadata")

    # Output location for this sensor's images.
    sensor_dir = f"{model_dir}/{sensor_id}"
    images_dir = f"{sensor_dir}/images"
    os.makedirs(images_dir, exist_ok=True)

    # Feature importance: drawn on the current figure, saved, then closed.
    plot_importance(model_obj)
    plt.savefig(f"{images_dir}/feature_importance.png")
    plt.close()
    status.append("feature_importance")

    # Hindcast plot via the project helper, which receives the target path.
    fig = visualization.plot_air_quality_forecast(
        df["city"].iloc[0],
        df["street"].iloc[0],
        df_hindcast,
        f"{images_dir}/hindcast_training.png",
        hindcast=True
    )
    if fig is not None:
        fig.suptitle(f"{df['city'].iloc[0]} {df['street'].iloc[0]} (Best Model: {best_feature})")
        plt.close(fig)
    status.append("hindcast_plot")

    # Keep the columns needed downstream.
    all_test_data.append(
        df[[
            "date", "sensor_id", "pm25", "predicted_pm25",
            "latitude", "longitude", "city", "street", "best_model"
        ]]
    )
    status.append("saved")

    print(f"[{idx}/{total_sensors}] Sensor {sensor_id}: " + ", ".join(status))

print(f"\n‚úÖ Visualization complete: {len(all_test_data)} sensors processed.")

## 3.8. Model Registration

### 3.8.1. Create Training Dataset

In [None]:
# Create training datasets for each feature view
# training_datasets maps feature view name -> created training dataset version.
training_datasets = {}

MAX_RETRIES = 10  # Increased for quota issues
INITIAL_BACKOFF = 3
# Upper bound for one exponential-backoff sleep. Without it, ten attempts
# starting at 3 s grow to a single 768 s wait (about 25 minutes in total per
# feature view) before a persistent error is finally reported.
MAX_BACKOFF = 60
QUOTA_WAIT = 60  # Wait 60 seconds when hitting quota limits

print(f"Creating training datasets for {len(feature_views)} feature views...\n")

for idx, (feature_name, fv) in enumerate(feature_views.items(), start=1):
    success = False
    
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Create a training dataset version
            version, job = fv.create_training_data(
                description=f"Training data for {feature_name} features",
                data_format="csv",
                write_options={"wait_for_job": True}
            )
            training_datasets[feature_name] = version
            print(f"‚úÖ [{idx}/{len(feature_views)}] {feature_name}: created training dataset v{version}")
            success = True
            break
            
        except RestAPIError as e:
            # Handle quota limits and other API errors
            error_msg = str(e).lower()
            if "quota" in error_msg or "parallel executions" in error_msg:
                # Quota errors wait a fixed interval rather than backing off.
                if attempt < MAX_RETRIES:
                    print(f"‚ö†Ô∏è {feature_name}: attempt {attempt}/{MAX_RETRIES} - quota limit reached")
                    print(f"   Waiting {QUOTA_WAIT}s for jobs to complete...")
                    time.sleep(QUOTA_WAIT)
                else:
                    print(f"‚ùå {feature_name}: FAILED after {MAX_RETRIES} attempts (quota)")
                    raise
            else:
                if attempt < MAX_RETRIES:
                    backoff_time = min(MAX_BACKOFF, INITIAL_BACKOFF * (2 ** (attempt - 1)))
                    print(f"‚ö†Ô∏è {feature_name}: attempt {attempt}/{MAX_RETRIES} failed (RestAPIError)")
                    print(f"   Error: {str(e)[:150]}")
                    print(f"   Retrying in {backoff_time}s...")
                    time.sleep(backoff_time)
                else:
                    print(f"‚ùå {feature_name}: FAILED after {MAX_RETRIES} attempts")
                    raise
            
        except (ProtocolError, ConnectionError, Timeout, RequestException, OSError) as e:
            # Connection-level failures: capped exponential backoff.
            if attempt < MAX_RETRIES:
                backoff_time = min(MAX_BACKOFF, INITIAL_BACKOFF * (2 ** (attempt - 1)))
                print(f"‚ö†Ô∏è {feature_name}: attempt {attempt}/{MAX_RETRIES} failed ({type(e).__name__}: {str(e)[:100]})")
                print(f"   Retrying in {backoff_time}s...")
                time.sleep(backoff_time)
            else:
                print(f"‚ùå {feature_name}: FAILED after {MAX_RETRIES} attempts")
                raise
        
        except Exception as e:
            # Catch-all for unexpected errors
            if attempt < MAX_RETRIES:
                backoff_time = min(MAX_BACKOFF, INITIAL_BACKOFF * (2 ** (attempt - 1)))
                print(f"‚ö†Ô∏è {feature_name}: attempt {attempt}/{MAX_RETRIES} failed with unexpected error")
                print(f"   Error: {type(e).__name__}: {str(e)[:100]}")
                print(f"   Retrying in {backoff_time}s...")
                time.sleep(backoff_time)
            else:
                print(f"‚ùå {feature_name}: FAILED after {MAX_RETRIES} attempts with {type(e).__name__}")
                raise
    
    # Safety net: the last attempt either succeeds or re-raises, so this is
    # only reachable if MAX_RETRIES is set below 1.
    if not success:
        print(f"‚ùå Could not create training dataset for {feature_name}")
        raise RuntimeError(f"Failed to create training dataset for {feature_name}")
    
    # Small delay between feature views
    if idx < len(feature_views):
        time.sleep(2)

print(f"\n‚úÖ All {len(training_datasets)} training datasets created successfully")


### 3.8.2. Register Models

In [None]:
uploaded = 0
total_sensors = len(best_models)

print(f"Registering models for {total_sensors} sensors...\n")

MAX_RETRIES = 5
INITIAL_BACKOFF = 2

for sensor_id, row in best_models.iterrows():

    best_feature, best_r2, best_mse = row["feature_name"], row["R2"], row["MSE"]

    # Write the sensor's best model to its local directory; that directory is
    # what gets saved to the registry below.
    model_obj = models[best_feature][sensor_id]
    sensor_model_dir = f"{model_dir}/{sensor_id}"
    os.makedirs(sensor_model_dir, exist_ok=True)
    model_obj.save_model(f"{sensor_model_dir}/model.json")

    # Registry entry tied to the feature view and training dataset version
    # that produced the model.
    model = mr.python.create_model(
        name=f"air_quality_xgboost_model_{sensor_id}",
        metrics={"R2": best_r2, "MSE": best_mse},
        feature_view=feature_views[best_feature],
        training_dataset_version=training_datasets[best_feature],
        description=f"PM2.5 predictor for sensor {sensor_id} using {best_feature} features",
    )

    # Save with exponential backoff on connection-level errors. The loop's
    # else branch runs only when no attempt succeeded.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            model.save(sensor_model_dir)
        except (ProtocolError, ConnectionError, Timeout, RequestException) as e:
            if attempt == MAX_RETRIES:
                print(f"  ‚ùå Sensor {sensor_id}: FAILED after {MAX_RETRIES} attempts")
            else:
                backoff_time = INITIAL_BACKOFF * (2 ** (attempt - 1))
                print(f"  ‚ö†Ô∏è Sensor {sensor_id}: attempt {attempt}/{MAX_RETRIES} failed ({type(e).__name__})")
                print(f"     Retrying in {backoff_time}s...")
                time.sleep(backoff_time)
        else:
            uploaded += 1
            print(f"[{uploaded}/{total_sensors}] Sensor {sensor_id}: registered ({best_feature})")
            break
    else:
        print(f"[--/--] Sensor {sensor_id}: FAILED to register")

print(f"\n‚úÖ Done. {uploaded}/{total_sensors} models successfully registered.")


## 3.9. Upload Plots

In [None]:
# Upload each sensor's hindcast plot to Resources/plots/<sensor_id>/ in the
# project's dataset storage.
dataset_api = project.get_dataset_api()
base_dir = "Resources/plots"
# Directory creation is best-effort (it presumably fails when the directory
# already exists — verify against the dataset API), so errors are ignored.
# Exception is caught rather than a bare ``except`` so KeyboardInterrupt and
# SystemExit still propagate.
try:
    dataset_api.mkdir(base_dir)
except Exception:
    pass

uploaded_images = 0
total_sensors = len(sensor_locations)

print(f"Uploading plots for {total_sensors} sensors...\n")

for sensor_id in sensor_locations:
    sensor_dir = f"{base_dir}/{sensor_id}"
    try:
        dataset_api.mkdir(sensor_dir)
    except Exception:
        pass

    local_path = f"{model_dir}/{sensor_id}/images/hindcast_training.png"
    remote_path = f"{sensor_dir}/hindcast_training.png"

    # Sensors skipped by the visualization cell have no plot on disk; report
    # that directly instead of attempting an upload that cannot succeed.
    if not os.path.exists(local_path):
        print(f"Skipped sensor {sensor_id}: no local plot at {local_path}")
        continue

    ok = hopsworks_admin.safe_upload(dataset_api, local_path, remote_path)

    if ok:
        uploaded_images += 1
        print(f"Uploaded image for sensor {sensor_id} ({uploaded_images}/{total_sensors})")
    else:
        print(f"‚ùå [fail] Sensor {sensor_id}: upload failed after retries")

print(f"‚úÖ Done uploading {uploaded_images} images.")
