# 2. Feature Pipeline

## 2.1. Setup

### 2.1.1. Import Libraries

In [None]:
# Standard imports
import os
from pathlib import Path
import sys
import json
import time
from datetime import date, datetime, timedelta
from dotenv import load_dotenv
import warnings

# Suppress every warning raised from modules matching "IPython", and every
# DeprecationWarning regardless of which module raises it.
warnings.filterwarnings("ignore", module="IPython")
warnings.filterwarnings("ignore", category=DeprecationWarning)

#  Establish project root directory
def find_project_root(start: Path):
    """Return the nearest directory at or above *start* holding a pyproject.toml.

    The search begins at *start* itself and walks towards the filesystem
    root. When no directory on that path contains a ``pyproject.toml``,
    *start* is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "pyproject.toml").exists()),
        start,
    )

# Locate the project root starting from the current working directory.
root_dir = find_project_root(Path.cwd())
print("Project root dir:", root_dir)

# Register the root on sys.path (once) so the project-local imports resolve.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:
    sys.path.append(root_dir_str)

# Third-party imports
import requests
import pandas as pd
import numpy as np
import great_expectations as gx
import hopsworks
from urllib3.exceptions import ProtocolError
from requests.exceptions import ConnectionError, Timeout, RequestException
from confluent_kafka import KafkaException
from hsfs.client.exceptions import RestAPIError
from collections import defaultdict
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from scipy.spatial.distance import cdist

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata, visualization

# Current local calendar date, used as the reference day for this run.
today = date.today()

### 2.1.2. Load Settings and Initialize Hopsworks Connection

In [None]:

def detect_environment():
    """Classify the current runtime as ``"job"``, ``"jupyter"`` or ``"local"``.

    Returns ``"job"`` when any of the HOPSWORKS_JOB_ID, HOPSWORKS_PROJECT_ID
    or HOPSWORKS_JOB_NAME environment variables is set, ``"jupyter"`` when the
    working directory lies under ``/hopsfs/Jupyter``, and ``"local"`` otherwise.
    """
    job_markers = ("HOPSWORKS_JOB_ID", "HOPSWORKS_PROJECT_ID", "HOPSWORKS_JOB_NAME")
    if any(marker in os.environ for marker in job_markers):
        return "job"

    return "jupyter" if os.getcwd().startswith("/hopsfs/Jupyter") else "local"

env = detect_environment()
print(f"Detected environment: {env}")

# Secrets are copied from the Hopsworks secrets API into os.environ when
# running as a job or in Jupyter; otherwise load_dotenv() supplies them.
running_on_hopsworks = env in ("job", "jupyter")

if not running_on_hopsworks:
    load_dotenv()
else:
    project = hopsworks.login()
    secrets_api = hopsworks.get_secrets_api()

    for secret_name in ("HOPSWORKS_API_KEY", "AQICN_API_KEY", "GH_PAT", "GH_USERNAME"):
        os.environ[secret_name] = secrets_api.get_secret(secret_name).value

# Build the Pydantic settings object and unwrap the secret values used below.
settings = config.HopsworksSettings()

HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in with the explicit API key and fetch the feature store handle.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

print("Environment initialized and Hopsworks connected!")


### 2.1.3. Repository management

In [None]:
# Ask the project helper to clone or update the repository for this GitHub
# user, then switch the working directory to the path it returns.
repo_dir = hopsworks_admin.clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)

### 2.1.4. Configure API Keys and Secrets

In [None]:
secrets = hopsworks.get_secrets_api()

# Make sure the AQICN key is stored as a Hopsworks secret: look it up, and
# create it from the loaded settings when the lookup raises.
try:
    secrets.get_secret("AQICN_API_KEY")
except Exception:
    # `except Exception` (not a bare `except:`) keeps this best-effort while
    # still letting KeyboardInterrupt / SystemExit propagate.
    secrets.create_secret("AQICN_API_KEY", settings.AQICN_API_KEY.get_secret_value())

## 2.2. Get Feature Groups

In [None]:
# The project helper returns the air-quality and weather feature group
# handles for this feature store; both are used by every later cell.
air_quality_fg, weather_fg = hopsworks_admin.create_feature_groups(fs)

## 2.3. Load Sensor Locations from Feature Group

In [None]:
# Load data from the air_quality feature group once and reuse the same frame
# for the emptiness check and the sensor metadata below, instead of issuing a
# second identical read().
aq_data = air_quality_fg.read()

if aq_data.empty:
    print("‚ö†Ô∏è No air quality data found. Run pipeline 1 (backfill) first.")
    sys.exit(1)

existing_aq_data = aq_data
existing_sensors = set(existing_aq_data["sensor_id"].unique())
print(f"üìã Found {len(existing_sensors)} sensors in feature store")

# Build sensor location dictionary:
#   sensor_id -> {"latitude", "longitude", "city", "street", "country", "aqicn_url"}
# The first row encountered for each sensor_id supplies its metadata.
location_fields = ["latitude", "longitude", "city", "street", "country", "aqicn_url"]
sensor_locations = (
    existing_aq_data[["sensor_id", *location_fields]]
    .drop_duplicates(subset=["sensor_id"])
    .set_index("sensor_id")
    .to_dict(orient="index")
)
print(f"üìç Loaded locations for {len(sensor_locations)} existing sensors")

## 2.4. Data Collection
Fetch today's air quality data and weather forecasts, then format the data to match the feature group schemas.

### 2.4.1. Load Historical Air Quality Data (Last 4 Days)

In [None]:
print(f"üîç Processing {len(sensor_locations)} sensor locations.")
historical_start = today - timedelta(days=4)

# Read pm25 history between historical_start and today (inclusive) for the
# known sensors; any failure falls back to an empty frame.
try:
    historical_df = air_quality_fg.read()
    if historical_df.empty:
        historical_df = pd.DataFrame()
    else:
        historical_df["date"] = pd.to_datetime(historical_df["date"]).dt.tz_localize(None)
        today_dt = pd.to_datetime(today)
        historical_start_dt = pd.to_datetime(historical_start)

        in_window = (historical_df["date"] >= historical_start_dt) & (
            historical_df["date"] <= today_dt
        )
        recent = historical_df[in_window][["date", "sensor_id", "pm25"]]

        historical_df = recent[recent["sensor_id"].isin(sensor_locations.keys())]
except Exception as err:
    print(f"‚ö†Ô∏è Error reading historical data: {err}")
    historical_df = pd.DataFrame()

### 2.4.2. Identify Missing Dates for Backfill

In [None]:
# Calendar dates already present in the air quality feature group.
# pd.to_datetime guards against a non-datetime "date" column (as the other
# cells do), and a set makes the membership test below a hash lookup rather
# than an element-wise comparison against a NumPy object array.
existing_dates = set(pd.to_datetime(air_quality_fg.read()["date"]).dt.date)

today = datetime.today().date()
start_date = today - timedelta(days=7)  # Check last 7 days for missing data

# Every day in [start_date, today] that has no stored reading, in ascending order.
expected_dates = pd.date_range(start=start_date, end=today, freq="D").date
missing_dates = [d for d in expected_dates if d not in existing_dates]

formatted = ", ".join(d.isoformat() for d in missing_dates)
print(f"üìÖ Missing dates to backfill: {formatted}")

### 2.4.3. Preparations

In [None]:
# Prepare historical data window: keep rows from three days before the
# earliest missing date onward. `default=today` keeps min() from raising
# ValueError when nothing is missing (e.g. the pipeline already ran today);
# the later per-day loops then simply have nothing to do.
historical_cutoff = pd.to_datetime(min(missing_dates, default=today)) - pd.Timedelta(days=3)
historical = air_quality_fg.read()
historical["date"] = pd.to_datetime(historical["date"]).dt.tz_localize(None)
historical = historical[historical["date"] >= historical_cutoff]

# Track existing (sensor_id, calendar date) pairs so they are not fetched again
existing = historical[["sensor_id", "date"]].copy()
existing["date_only"] = existing["date"].dt.date
existing_keys = set(zip(existing["sensor_id"], existing["date_only"]))

# Initialize data containers; the air quality list starts with the history frame
all_aq_rows = [historical]
all_weather_rows = []

### 2.4.4. Fetch Missing Air Quality Data

In [None]:
total = len(sensor_locations)

for position, (sensor_id, meta) in enumerate(sensor_locations.items(), start=1):
    print(f"Fetching air quality for sensor {sensor_id}, {position}/{total}")

    # Only request days for which this sensor has no stored reading yet.
    pending_days = [day for day in missing_dates if (sensor_id, day) not in existing_keys]

    for day in pending_days:
        try:
            aq_df = fetchers.get_pm25(
                meta["aqicn_url"],
                meta["country"],
                meta["city"],
                meta["street"],
                day,
                AQICN_API_KEY,
            )

            # Nothing usable came back for this day.
            if aq_df.empty or aq_df["pm25"].isna().all():
                continue

            aq_df["sensor_id"] = int(sensor_id)
            aq_df["pm25"] = pd.to_numeric(aq_df["pm25"], errors="coerce")
            aq_df["date"] = pd.to_datetime(aq_df["date"]).dt.normalize()

            # Attach the sensor metadata to every fetched row.
            for field in ("city", "street", "country", "aqicn_url", "latitude", "longitude"):
                aq_df[field] = meta[field]

            all_aq_rows.append(aq_df.drop(columns=["url"], errors="ignore"))

        except Exception as err:
            # Best effort: report the failure type and move on to the next day.
            print(f"‚ùå Air quality for sensor {sensor_id} on {day}: {type(err).__name__}")

print(f"üìä Collected {len(all_aq_rows)} air quality dataframes")


### 2.4.5. Fetch Missing Weather Forecast Data

In [None]:
total = len(sensor_locations)

for position, (sensor_id, meta) in enumerate(sensor_locations.items(), start=1):
    print(f"Fetching weather for sensor {sensor_id}, {position}/{total}")

    for day in missing_dates:
        try:
            # Request the forecast window starting on the missing day and
            # ending six days later.
            weather_df = fetchers.get_weather_forecast(
                sensor_id=sensor_id,
                latitude=meta["latitude"],
                longitude=meta["longitude"],
                start_date=day,
                end_date=day + timedelta(days=6),
            )

            if not weather_df.empty:
                weather_df["sensor_id"] = int(sensor_id)
                weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.normalize()
                all_weather_rows.append(weather_df)

        except Exception as err:
            # Best effort: report the failure type and move on to the next day.
            print(f"‚ùå Weather for sensor {sensor_id} on {day}: {type(err).__name__}")

print(f"üìä Collected {len(all_weather_rows)} weather dataframes")


### 2.4.9. Clean and Align Data Structure

In [None]:
cleaned_aq_rows = []

# Columns whose name contains "lag", "rolling" or "nearby" are treated as
# engineered features; frames are aligned to the remaining base columns only.
engineered_cols = [c for c in historical.columns if "lag" in c or "rolling" in c or "nearby" in c]
base_cols = [c for c in historical.columns if c not in engineered_cols]
base_col_set = set(base_cols)

for i, df in enumerate(all_aq_rows):
    if df.empty or "pm25" not in df.columns or df["pm25"].isna().all():
        print(f"‚ö†Ô∏è Skipping empty or invalid df[{i}]")
        continue

    # Midnight-normalised, timezone-naive dates (assigned in place on df).
    df["date"] = pd.to_datetime(df["date"]).dt.normalize().dt.tz_localize(None)

    # Skip if too few expected columns are present
    if len(set(df.columns) & base_col_set) < 3:
        print(f"‚ö†Ô∏è Skipping malformed df[{i}] with columns: {list(df.columns)}")
        continue

    # Align to base columns only (no engineered features yet). reindex always
    # yields exactly base_cols, so no post-alignment shape check is needed.
    aligned = df.reindex(columns=base_cols, fill_value=np.nan)

    # Cast each column to the dtype used by `historical`. A failed cast is
    # reported and that column keeps its current dtype; the frame is kept.
    for col in base_cols:
        try:
            aligned[col] = aligned[col].astype(historical[col].dtype)
        except Exception as e:
            print(f"‚ö†Ô∏è Could not cast column '{col}' in df[{i}]: {e}")

    cleaned_aq_rows.append(aligned)

print(f"üìã Cleaned {len(cleaned_aq_rows)} air quality dataframes")
print(f"üìã Using base columns (excluding engineered features): {len(base_cols)} columns")

### 2.4.10. Combine and Clean Weather Data

In [None]:
if not all_weather_rows:
    all_weather = pd.DataFrame()
    print("‚ö†Ô∏è No weather data collected")
else:
    # Stack every fetched forecast frame and order by sensor, then date.
    all_weather = (
        pd.concat(all_weather_rows, ignore_index=True)
        .sort_values(["sensor_id", "date"])
        .reset_index(drop=True)
    )
    all_weather["date"] = pd.to_datetime(all_weather["date"]).dt.tz_localize(None)

    # Keep only the first row for each (sensor_id, date) pair.
    all_weather = all_weather.drop_duplicates(subset=["sensor_id", "date"], keep="first")

    print(f"üå§Ô∏è Total weather records: {len(all_weather)}")
    print(f"üìÖ Weather date range: {all_weather['date'].min()} to {all_weather['date'].max()}")

## 2.5. Combine Data and Add Engineered Features

In [None]:
# from utils import feature_engineering

# locations = feature_engineering.build_sensor_location_map(df, sensor_locations)
# print("DEBUG LOCATIONS:", locations)

In [None]:
# Strip engineered columns (names containing "lag", "rolling" or "nearby")
# from the historical frame before stacking it with the cleaned frames.
engineered_cols = [
    col
    for col in historical.columns
    if any(tag in col for tag in ("lag", "rolling", "nearby"))
]
historical_base = historical.drop(columns=engineered_cols, errors="ignore")

# Stack history with the cleaned frames, ordered by sensor and then date.
all_aq = (
    pd.concat([historical_base, *cleaned_aq_rows], ignore_index=True)
    .sort_values(["sensor_id", "date"])
    .reset_index(drop=True)
)
all_aq["date"] = pd.to_datetime(all_aq["date"]).dt.normalize().dt.tz_localize(None)

# Keep only the first row for each (sensor_id, date) pair.
all_aq = all_aq.drop_duplicates(subset=["sensor_id", "date"], keep="first").reset_index(drop=True)

print(f"üìä Total records after deduplication: {len(all_aq)}")
print(f"üìä Unique sensors: {all_aq['sensor_id'].nunique()}")
print(f"üìä Date range: {all_aq['date'].min()} to {all_aq['date'].max()}")

# Engineered features, applied in order: rolling window, lags, nearby sensors.
all_aq = feature_engineering.add_rolling_window_feature(all_aq, window_days=3)
all_aq = feature_engineering.add_lagged_features(all_aq, lags=[1, 2, 3])

# The nearby-sensor feature receives the sensor_locations dict.
all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, sensor_locations, n_closest=3)

## 2.6. Insert Data to Feature Groups

### 2.6.1. Batch Insert Air Quality Data by Date

In [None]:
# Everything below is the same for every day, so compute it once up front.
engineered_cols = [c for c in all_aq.columns if "lag" in c or "rolling" in c or "nearby" in c]
required_cols = ["pm25", *engineered_cols]

# Target dtypes to match the feature group schema
aq_schema_dtypes = {
    "sensor_id": "int32",
    "pm25": "float64",
    "pm25_lag_1d": "float64",
    "pm25_lag_2d": "float64",
    "pm25_lag_3d": "float64",
    "pm25_rolling_3d": "float64",
    "pm25_nearby_avg": "float64",
    "city": "string",
    "street": "string",
    "country": "string",
    "aqicn_url": "string",
    "latitude": "float64",
    "longitude": "float64",
}

# Column order as declared by the feature group
fg_columns = [f.name for f in air_quality_fg.features]
row_dates = all_aq["date"].dt.date

for day in missing_dates:
    # Keep only this day's rows that have pm25 and every engineered feature.
    day_rows = all_aq[row_dates == day].dropna(subset=required_cols, how="any")

    if day_rows.empty:
        print(f"‚ö†Ô∏è No valid rows for {day}")
        continue

    day_rows = day_rows.astype(aq_schema_dtypes)[fg_columns]

    air_quality_fg.insert(day_rows)
    print(f"‚úÖ Inserted {len(day_rows)} rows for {day}")

### 2.6.2. Verify Air Quality Insertion

In [None]:
# Show today's rows with pm25 and a selection of the engineered columns.
verify_cols = ["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_rolling_3d", "pm25_nearby_avg"]
is_today = all_aq["date"].dt.date == today
print(all_aq[is_today][verify_cols])

In [None]:
# Show every column for yesterday's rows.
yesterday = today - timedelta(days=1)
print(all_aq[all_aq["date"].dt.date == yesterday])

In [None]:
# Debug: compare sensor counts between the combined frame and the location map
print("üîç Debugging nearby sensor feature:")
print(f"Total sensors in all_aq: {all_aq['sensor_id'].nunique()}")
print(f"Total sensors in sensor_locations: {len(sensor_locations)}")

# Summary statistics of pm25_lag_1d over today's rows
todays_lag = all_aq.loc[all_aq["date"].dt.date == today, "pm25_lag_1d"]
print("\npm25_lag_1d stats for today:")
print(todays_lag.describe())

# Last five rows of one hard-coded sensor
test_sensor = 58666
print(f"\nüîç Checking sensor {test_sensor}:")
recent_rows = all_aq[all_aq["sensor_id"] == test_sensor].tail(5)
print(recent_rows[["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_nearby_avg"]])

# Report whether that sensor has an entry in the locations dict
if test_sensor not in sensor_locations:
    print(f"\n‚ö†Ô∏è Sensor {test_sensor} NOT in sensor_locations!")
else:
    test_location = sensor_locations[test_sensor]
    print(f"\nSensor {test_sensor} is in sensor_locations")
    print(f"Lat/Lon: {test_location['latitude']}, {test_location['longitude']}")


In [None]:
# # Debug: Manually trace the nearby sensor calculation for one sensor
# from utils.feature_engineering import build_sensor_location_map, compute_closest_sensors

# test_sensor = 58666
# locations = build_sensor_location_map(all_aq, sensor_locations)
# closest_map = compute_closest_sensors(locations, n_closest=3)

# print(f"üîç Closest sensors to {test_sensor}:")
# neighbors = closest_map.get(test_sensor, [])
# print(f"Neighbors: {neighbors}")

# if neighbors:
#     # Get neighbor data
#     neighbor_df = all_aq[all_aq['sensor_id'].isin(neighbors)][['date', 'pm25_lag_1d']]
#     print(f"\nüìä Neighbor data (showing last 10):")
#     print(neighbor_df.tail(10))
    
#     # Group by date
#     neighbor_avg = neighbor_df.groupby('date')['pm25_lag_1d'].mean().reset_index()
#     print(f"\nüìä Neighbor average by date:")
#     print(neighbor_avg.tail(10))
    
#     # Get sensor data and merge
#     sensor_data = all_aq[all_aq['sensor_id'] == test_sensor]
#     print(f"\nüìä Sensor {test_sensor} data:")
#     print(sensor_data[['sensor_id', 'date', 'pm25_lag_1d']].tail(5))
    
#     merged = sensor_data.merge(neighbor_avg, on='date', how='left')
#     print(f"\nüìä After merge:")
#     print(merged[['sensor_id', 'date', 'pm25_lag_1d_x', 'pm25_lag_1d_y']].tail(5))
#     print(f"\n‚úÖ Column 'pm25_lag_1d_y' exists: {'pm25_lag_1d_y' in merged.columns}")


In [None]:
# # Re-import the fixed module
# import importlib
# importlib.reload(feature_engineering)

# # Re-apply the nearby sensor feature with the fixed function
# print("üîÑ Re-calculating nearby sensor averages with fixed function...")
# all_aq = feature_engineering.add_nearby_sensor_feature(all_aq, sensor_locations, n_closest=3)

# print("\n‚úÖ Re-calculated nearby sensor averages")
# print("\nüìä Sample data for today:")
# sample = all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_nearby_avg"]].head(10)
# print(sample)

# print(f"\nüìä Nearby avg stats:")
# print(all_aq["pm25_nearby_avg"].describe())
# print(f"\nüìä Non-null nearby averages: {all_aq['pm25_nearby_avg'].notna().sum()} / {len(all_aq)}")


In [None]:
# # Check today's data more thoroughly
# today_data = all_aq[all_aq["date"].dt.date == today][["sensor_id", "date", "pm25", "pm25_lag_1d", "pm25_nearby_avg"]]
# print(f"üìä Today's data ({today}):")
# print(f"Total records: {len(today_data)}")
# print(f"Records with nearby_avg: {today_data['pm25_nearby_avg'].notna().sum()}")
# print(f"Records without nearby_avg: {today_data['pm25_nearby_avg'].isna().sum()}")

# print("\nüîç Sensors still missing nearby_avg on today:")
# missing_nearby = today_data[today_data['pm25_nearby_avg'].isna()]
# if len(missing_nearby) > 0:
#     print(missing_nearby.head(10))
#     print(f"\n...and {max(0, len(missing_nearby) - 10)} more")
# else:
#     print("None! All sensors have nearby averages ‚úÖ")


### 2.6.3. Batch Insert Weather Forecast Data

In [None]:
if not all_weather.empty:
    # Convert types to match feature group schema
    all_weather = all_weather.astype({
        "sensor_id": "int32",
        "temperature_2m_mean": "float64",
        "precipitation_sum": "float64",
        "wind_speed_10m_max": "float64",
        "wind_direction_10m_dominant": "float64",
    })

    # Ensure correct column order
    weather_fg_columns = [f.name for f in weather_fg.features]
    all_weather = all_weather[weather_fg_columns]

    # Insert in smaller batches to avoid connection issues
    batch_size = 100
    max_retries = 3
    total_inserted = 0

    # Errors treated as transient. ConnectionError here is the requests
    # exception imported at the top of the file. requests' Timeout is listed
    # next to the builtin TimeoutError because it does not derive from it, so
    # without it a read timeout would escape the retry loop.
    retryable_errors = (ProtocolError, ConnectionError, Timeout, TimeoutError, KafkaException)

    for i in range(0, len(all_weather), batch_size):
        batch = all_weather.iloc[i:i+batch_size]
        batch_number = i // batch_size + 1

        for attempt in range(max_retries):
            try:
                weather_fg.insert(batch)
            except retryable_errors:
                if attempt < max_retries - 1:
                    # Exponential backoff: 1s, then 2s.
                    wait_time = 2 ** attempt
                    print(f"‚ö†Ô∏è Connection error on weather batch {batch_number}, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    # Out of retries: save the batch to CSV and continue with
                    # the next batch.
                    print(f"‚ùå Failed weather batch {batch_number}")
                    failed_file = f"{root_dir}/failed_weather_batch_{today}_{i}.csv"
                    batch.to_csv(failed_file, index=False)
                    print(f"üíæ Saved to {failed_file}")
            else:
                total_inserted += len(batch)
                print(f"‚úÖ Weather batch {batch_number}: {len(batch)} records (total: {total_inserted}/{len(all_weather)})")
                break

    print(f"üå§Ô∏è Total weather inserted: {total_inserted}/{len(all_weather)} records")
else:
    print("‚ö†Ô∏è No weather data to insert")

### 2.6.4. Print Processing Summary

In [None]:
# print(f"\nüìä Summary: ‚úÖ {successful} successful, ‚è≠Ô∏è {skipped} skipped, ‚ùå {failed} failed")

## 2.7. Inspect Inserted Data

In [None]:
# Summarise the combined air quality frame, if this session built a non-empty one.
if "all_aq" in locals() and not all_aq.empty:
    print(
        f"‚úÖ Air quality records inserted: {len(all_aq)}",
        "\nüìã Sample air quality data:",
        all_aq.head(),
        "\nüîß Air quality data types:",
        all_aq.dtypes,
        "\nüìÖ Date range:",
        f"From {all_aq['date'].min()} to {all_aq['date'].max()}",
        sep="\n",
    )

# Same summary for the weather frame.
if "all_weather" in locals() and not all_weather.empty:
    print(
        f"\nüå§Ô∏è Weather records inserted: {len(all_weather)}",
        "\nüìã Sample weather data:",
        all_weather.head(),
        "\nüîß Weather data types:",
        all_weather.dtypes,
        "\nüìÖ Unique weather dates:",
        all_weather["date"].unique(),
        sep="\n",
    )