In [None]:
# Import libraries for multi-horizon model training
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
import sys
import json
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Any

# Machine Learning
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import optuna

# Deep Learning
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, LSTM, Dropout
    HAS_TENSORFLOW = True
except ImportError:
    HAS_TENSORFLOW = False
    print("⚠️ TensorFlow not available - will skip neural network models")

# NOTE(review): this blanket filter silences every warning category, including the
# DeprecationWarning / FutureWarning messages libraries use to announce API removals.
# Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Put the project's source folders on the import path.
# The membership check keeps the cell idempotent: re-running it no longer appends
# the same directory to sys.path again and again.
for _src_dir in ('../src/hourly', '../src/shared'):
    if _src_dir not in sys.path:
        sys.path.append(_src_dir)

# Input / output locations (relative to the notebook's working directory)
FEATURES_PATH = '../data/processed/hanoi_weather_hourly_features.csv'
METADATA_PATH = '../data/processed/hourly_feature_metadata.json'
MODELS_OUTPUT_PATH = '../models/hourly_trained/'

# Startup banner
print(
    "🤖 MULTI-HORIZON WEATHER FORECASTING",
    "=" * 40,
    "✅ Libraries imported successfully!",
    sep="\n",
)

# Echo the configured locations so the run log records them
print(
    f"📂 Features: {FEATURES_PATH}",
    f"📋 Metadata: {METADATA_PATH}",
    f"💾 Models output: {MODELS_OUTPUT_PATH}",
    sep="\n",
)

# Make sure the model output folder exists (no error if it is already there)
os.makedirs(MODELS_OUTPUT_PATH, exist_ok=True)

## 1️⃣ Load Features & Set Up Multi-Horizon Framework

In [None]:
# Read the engineered feature table and the accompanying metadata file
print("📊 LOADING FEATURE DATA & MULTI-HORIZON SETUP")
print("=" * 47)

# Parse the timestamp column and promote it to the index in a single chain,
# producing a fresh frame instead of mutating one in place
df_features = (
    pd.read_csv(FEATURES_PATH)
    .assign(datetime_processed=lambda frame: pd.to_datetime(frame['datetime_processed']))
    .set_index('datetime_processed')
)

print(f"✅ Features loaded: {df_features.shape}")

# Metadata is a JSON document; only its number of top-level entries is reported here
with open(METADATA_PATH, 'r') as metadata_file:
    metadata = json.load(metadata_file)

print(f"📋 Metadata loaded: {len(metadata)} keys")

# Forecast horizons: label -> number of hours ahead
#   1h   immediate forecasting
#   6h   short-term planning
#   24h  daily planning
#   72h  3-day forecast
#   168h weekly forecast
FORECAST_HORIZONS = {f'{hours}h': hours for hours in (1, 6, 24, 72, 168)}

# Variables forecast by the multi-variate setup
TARGET_VARIABLES = [
    'temp',
    'humidity',
    'sealevelpressure',
    'windspeed',
    'cloudcover',
]

print(f"🎯 Forecast horizons: {list(FORECAST_HORIZONS)}")
print(f"📊 Target variables: {TARGET_VARIABLES}")

# Feature selection: keep every numeric column that is neither a listed
# non-predictive column nor one of the raw target variables.
# (No correlation- or importance-based filtering happens in this cell.)
print(f"\n🔧 FEATURE PREPROCESSING")
print("=" * 25)

# Remove non-predictive columns
exclude_cols = ['name', 'address', 'resolvedAddress', 'latitude', 'longitude', 
               'preciptype', 'conditions', 'icon', 'source', 'date']

# Build the exclusion set once; the original rebuilt the concatenated list for
# every column inspected by the comprehension below.
_excluded = set(exclude_cols) | set(TARGET_VARIABLES)

# Get numeric features only
numeric_cols = df_features.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col not in _excluded]

print(f"Available numeric features: {len(numeric_cols)}")
print(f"Selected feature columns: {len(feature_cols)}")

# Handle remaining missing values
missing_count = df_features[feature_cols].isnull().sum().sum()
if missing_count > 0:
    print(f"⚠️ Handling {missing_count} missing values...")
    # DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated in
    # pandas 2.x.  Forward-fill first; back-fill then covers gaps at the very start.
    # NOTE(review): back-filling copies later observations into earlier rows, and a
    # column that is entirely NaN stays NaN after both passes.
    df_features[feature_cols] = df_features[feature_cols].ffill().bfill()
    print("✅ Missing values filled")

print(f"✅ Feature preprocessing completed!")
print(f"📊 Final feature set: {len(feature_cols)} features")
print(f"📅 Date range: {df_features.index.min()} to {df_features.index.max()}")

## 2️⃣ Multi-Horizon Data Preparation

In [None]:
# Prepare datasets for different forecasting horizons
print("⏰ MULTI-HORIZON DATA PREPARATION")
print("=" * 35)

def create_multi_horizon_datasets(df, target_var, feature_cols, horizons, test_size=0.2):
    """Build chronological train/test splits of one target for several horizons.

    For every horizon the target is ``df[target_var]`` shifted ``horizon_hours``
    rows towards the past, so the features of row *i* are paired with the target
    value found ``horizon_hours`` rows later.  Rows whose shifted target is
    missing are dropped.  ``df`` itself is never modified.

    Args:
        df: DataFrame containing ``target_var`` and every column in
            ``feature_cols``.  The shift is positional (row based), so rows are
            presumably sorted by time at a regular one-hour step -- TODO confirm
            against the feature-engineering step.
        target_var: Name of the column to forecast.
        feature_cols: List of column names used as model inputs.
        horizons: Mapping of horizon label -> number of rows to look ahead.
        test_size: Fraction of the remaining rows assigned to the test split,
            taken from the end so that temporal order is preserved.

    Returns:
        Dict keyed by horizon label.  Each value is a dict with the keys
        ``X_train``, ``X_test``, ``y_train``, ``y_test`` (NumPy arrays),
        ``train_dates``, ``test_dates`` (slices of ``df.index``),
        ``feature_names``, ``target_name`` and ``horizon_hours``.

    Raises:
        KeyError: If ``target_var`` or a name in ``feature_cols`` is not a
            column of ``df``.
    """

    datasets = {}

    # Loop-invariant pieces are extracted once.  The previous implementation
    # deep-copied the whole frame for every horizon just to add one column.
    feature_values = df[feature_cols].values
    target_series = df[target_var]
    row_index = df.index
    target_name = f'{target_var}_target'

    for horizon_name, horizon_hours in horizons.items():
        print(f"📊 Preparing {horizon_name} dataset (target: {target_var})...")

        # Future value of the target, aligned with the row it is predicted from
        shifted_target = target_series.shift(-horizon_hours)

        # Keep only rows that have a target (the shift leaves NaN at the end,
        # and gaps in the original target column propagate as well)
        has_target = shifted_target.notna().values

        X = feature_values[has_target]
        y = shifted_target.values[has_target]
        kept_dates = row_index[has_target]

        # Time series split (preserve temporal order)
        # NOTE(review): the targets of the last `horizon_hours` training rows are
        # taken from rows that belong to the test span; add a gap at the boundary
        # if a strictly leak-free evaluation is required.
        split_idx = int(len(X) * (1 - test_size))

        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        # Timestamps of each split, kept for reporting and plotting
        train_dates = kept_dates[:split_idx]
        test_dates = kept_dates[split_idx:]

        datasets[horizon_name] = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'train_dates': train_dates,
            'test_dates': test_dates,
            'feature_names': feature_cols,
            'target_name': target_name,
            'horizon_hours': horizon_hours
        }

        print(f"   Train shape: {X_train.shape}, Test shape: {X_test.shape}")
        print(f"   Train period: {train_dates.min()} to {train_dates.max()}")
        print(f"   Test period: {test_dates.min()} to {test_dates.max()}")

    return datasets

# Build the per-horizon train/test datasets for every target variable
print("🎯 Creating multi-horizon datasets for all targets...")

all_datasets = dict()
for target in TARGET_VARIABLES:
    print(f"\n🌡️ Processing target: {target}")
    # Arguments are passed by keyword so the call documents itself
    target_datasets = create_multi_horizon_datasets(
        df=df_features,
        target_var=target,
        feature_cols=feature_cols,
        horizons=FORECAST_HORIZONS,
    )
    all_datasets[target] = target_datasets

print(f"\n✅ Multi-horizon datasets created!")
print(f"📊 Total datasets: {len(TARGET_VARIABLES)} targets × {len(FORECAST_HORIZONS)} horizons = {len(TARGET_VARIABLES) * len(FORECAST_HORIZONS)}")

# Scale the features of every (target, horizon) dataset
print(f"\n⚖️ FEATURE SCALING SETUP")
print("=" * 25)

# One RobustScaler per dataset (chosen because it handles outliers in weather
# data better).  It is fitted on the training rows only, then applied to both splits.
scalers = {target: {} for target in TARGET_VARIABLES}
for target in TARGET_VARIABLES:
    for horizon in FORECAST_HORIZONS:
        dataset = all_datasets[target][horizon]

        # Fit on training data
        X_train = dataset['X_train']
        scaler = RobustScaler().fit(X_train)

        # Store the scaled copies next to the raw arrays
        dataset['X_train_scaled'] = scaler.transform(X_train)
        dataset['X_test_scaled'] = scaler.transform(dataset['X_test'])

        # Keep the fitted scaler so the same transform can be reapplied later
        scalers[target][horizon] = scaler

print("✅ Feature scaling completed for all datasets!")

# 🤖 Hanoi Hourly Weather Model Training - Step 5

This notebook implements multi-horizon forecasting models for hourly weather prediction, comparing different algorithms and prediction windows.

**Multi-Horizon Strategy:**
- **Short-term (1-6h)**: High accuracy for immediate forecasting
- **Medium-term (12-24h)**: Daily planning and pattern recognition
- **Long-term (48-168h)**: Weekly trends and seasonal patterns
- **Model Comparison**: XGBoost, LightGBM, CatBoost, Neural Networks
- **Evaluation**: MAE, RMSE, MAPE for different horizons

---