# Gweizy Model Training Notebook

Train all gas prediction models for Gweizy.

## Instructions:
1. Upload your `gas_data.db` file (from `backend/gas_data.db`)
2. Run all cells
3. Download the trained models zip file
4. Extract to `backend/models/saved_models/` and push to GitHub

In [None]:
# Install the third-party packages named below; "-q" keeps pip's output quiet.
# The leading "!" makes the notebook run the line as a shell command.
!pip install -q scikit-learn pandas numpy joblib lightgbm xgboost matplotlib seaborn optuna

In [None]:
# Upload your gas_data.db file
from google.colab import files
import os

# Ask the user for the SQLite file and report whether the expected name arrived.
print("Upload your gas_data.db file from backend/gas_data.db")
uploaded = files.upload()

# `uploaded` is keyed by file name; anything other than gas_data.db is ignored.
if 'gas_data.db' not in uploaded:
    print("❌ Please upload gas_data.db")
else:
    print(f"\n✅ Uploaded gas_data.db ({len(uploaded['gas_data.db']) / 1024 / 1024:.1f} MB)")

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load data from database.
# The connection is released in a `finally` block so that a failing query
# (for example a missing `gas_prices` table) does not leave the handle open.
conn = sqlite3.connect('gas_data.db')
try:
    df = pd.read_sql("""
        SELECT timestamp, current_gas as gas, base_fee, priority_fee,
               block_number, gas_used, gas_limit, utilization
        FROM gas_prices ORDER BY timestamp ASC
    """, conn)
finally:
    conn.close()

# Index by time so the resampling and gap detection below can work on the index.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp').sort_index()

print(f"Total records: {len(df):,}")
print(f"Date range: {df.index.min()} to {df.index.max()}")

# === IMPROVED: Resample to 30-second intervals (was 1-min, losing too much data) ===
# Buckets without any gas reading are dropped, which is what creates the gaps
# that the segment detection below looks for.
print("\nResampling to 30-second intervals (preserves more data)...")
df = df.resample('30s').mean().dropna(subset=['gas'])
print(f"After resample: {len(df):,} records")

# Find segments (gap > 30 min = new segment)
# The cumulative sum of "gap too large" flags gives every contiguous run its own id.
df['time_diff'] = df.index.to_series().diff()
df['segment'] = (df['time_diff'] > pd.Timedelta(minutes=30)).cumsum()

segment_sizes = df.groupby('segment').size()
print(f"\nSegments found: {len(segment_sizes)}")
print(f"Segment sizes: {segment_sizes.sort_values(ascending=False).head(10).tolist()}")

# === IMPROVED: Lower threshold from 120 to 30 minutes (keeps more segments) ===
MIN_SEGMENT_SIZE = 60  # 30 minutes at 30-sec intervals = 60 records
good_segments = segment_sizes[segment_sizes >= MIN_SEGMENT_SIZE].index.tolist()
df = df[df['segment'].isin(good_segments)]
print(f"\nKeeping {len(good_segments)} segments with >= 30 minutes of data")
print(f"Total usable records: {len(df):,}")

# === DATA SUFFICIENCY CHECK ===
# Advisory only: the notebook continues even when the sample count is low.
MIN_REQUIRED_SAMPLES = 10000
if len(df) < MIN_REQUIRED_SAMPLES:
    print(f"\n⚠️  WARNING: Only {len(df):,} samples. Recommend at least {MIN_REQUIRED_SAMPLES:,}")
    print("   Models may underperform. Consider collecting more data.")
else:
    print(f"\n✓ Data sufficiency check passed: {len(df):,} samples")

RECORDS_PER_HOUR = 120  # 30-sec intervals = 120 records per hour

In [None]:
# Fetch ETH Price Data - IMPROVED with Binance (1-minute data)
import requests

# Three-line section banner for the external-data cell.
print("=" * 60, "FETCHING EXTERNAL DATA", "=" * 60, sep="\n")

def fetch_eth_price_binance(start_date, end_date):
    """Download 1-minute ETHUSDT candles from the Binance klines endpoint.

    Requests are issued page by page (up to 1000 candles each) starting at
    `start_date` until `end_date` is reached, an empty page comes back, or a
    non-200 status is returned. Candles collected before a non-200 status are
    still returned.

    Args:
        start_date: Start of the range; must provide `.timestamp()`.
        end_date: End of the range; must provide `.timestamp()`.

    Returns:
        A DataFrame indexed by candle open time with columns `eth_price`
        (close), `eth_volume`, `eth_high` and `eth_low`, or None when no
        candle was collected or any exception was raised.
    """
    try:
        first_ms = int(start_date.timestamp() * 1000)
        last_ms = int(end_date.timestamp() * 1000)

        candles = []
        cursor_ms = first_ms

        print(f"Fetching ETH prices from Binance (1-min candles)...")

        while cursor_ms < last_ms:
            response = requests.get(
                "https://api.binance.com/api/v3/klines",
                params={
                    'symbol': 'ETHUSDT',
                    'interval': '1m',
                    'startTime': cursor_ms,
                    'endTime': min(cursor_ms + 1000 * 60 * 1000, last_ms),  # Max 1000 candles
                    'limit': 1000
                },
                timeout=30,
            )

            # Stop paging on any HTTP error; what was fetched so far is kept.
            if response.status_code != 200:
                print(f"  Binance API error: {response.status_code}")
                break

            page = response.json()
            if not page:
                break

            # Kline layout used here: [0]=open time (ms), [2]=high, [3]=low,
            # [4]=close, [5]=volume.
            candles.extend(
                {
                    'timestamp': pd.to_datetime(kline[0], unit='ms'),
                    'eth_price': float(kline[4]),  # Close price
                    'eth_volume': float(kline[5]),  # Volume
                    'eth_high': float(kline[2]),
                    'eth_low': float(kline[3])
                }
                for kline in page
            )

            # Resume one minute after the last candle received.
            cursor_ms = page[-1][0] + 60000

            if len(candles) % 5000 == 0:
                print(f"  Fetched {len(candles):,} candles...")

        if not candles:
            return None

        eth_df = pd.DataFrame(candles).set_index('timestamp')
        print(f"  Total: {len(eth_df):,} 1-minute ETH candles")
        return eth_df

    except Exception as e:
        print(f"  Failed to fetch from Binance: {e}")
        return None

def fetch_eth_price_coingecko(start_date, end_date):
    """Fallback ETH/USD price download from CoinGecko's market_chart/range endpoint.

    Only a price series is read from the response, so `eth_high` and
    `eth_low` are set equal to the price and `eth_volume` is left as NaN.

    Args:
        start_date: Start of the range; must provide `.timestamp()`.
        end_date: End of the range; must provide `.timestamp()`.

    Returns:
        A DataFrame indexed by timestamp with columns `eth_price`,
        `eth_volume`, `eth_high` and `eth_low`, or None on a non-200 response
        or any exception.
    """
    try:
        query = {
            'vs_currency': 'usd',
            'from': int(start_date.timestamp()),
            'to': int(end_date.timestamp()),
        }
        endpoint = "https://api.coingecko.com/api/v3/coins/ethereum/market_chart/range"

        print(f"Fallback: Fetching from CoinGecko (hourly)...")
        response = requests.get(endpoint, params=query, timeout=30)

        if response.status_code != 200:
            return None

        # Each entry of "prices" is read as a [timestamp_ms, price] pair.
        price_points = response.json().get('prices', [])

        eth_df = pd.DataFrame(price_points, columns=['timestamp', 'eth_price'])
        eth_df['timestamp'] = pd.to_datetime(eth_df['timestamp'], unit='ms')
        eth_df = eth_df.set_index('timestamp')

        # Pad the columns the Binance path provides so both sources share a schema.
        eth_df['eth_volume'] = np.nan
        eth_df['eth_high'] = eth_df['eth_price']
        eth_df['eth_low'] = eth_df['eth_price']

        print(f"  Fetched {len(eth_df)} hourly ETH prices")
        return eth_df
    except Exception as e:
        print(f"  CoinGecko failed: {e}")
        return None

# Try Binance first, fallback to CoinGecko.
# `eth_source` records which provider supplied the data so the status message
# below does not claim 1-minute resolution for the CoinGecko fallback.
eth_data = fetch_eth_price_binance(df.index.min(), df.index.max())
eth_source = "Binance, 1-min resolution"
if eth_data is None or len(eth_data) < 100:
    eth_data = fetch_eth_price_coingecko(df.index.min(), df.index.max())
    eth_source = "CoinGecko fallback"

eth_cols = ['eth_price', 'eth_volume', 'eth_high', 'eth_low']

# Re-running this cell would otherwise make `join` fail on overlapping column
# names, so ETH columns left over from a previous run are dropped first.
df = df.drop(columns=[c for c in eth_cols if c in df.columns])

has_eth_data = False
if eth_data is not None and len(eth_data) > 0:
    # Resample to 30-second intervals
    eth_data = eth_data.resample('30s').ffill()

    # Merge with gas data
    df = df.join(eth_data, how='left')

    # Coverage must be measured BEFORE gaps are filled: after ffill/bfill the
    # column is fully populated as soon as a single price matched, which would
    # make the 50% threshold below meaningless.
    eth_coverage = df['eth_price'].notna().mean()
    print(f"  ETH price coverage: {eth_coverage:.1%}")

    # Fill gaps in every ETH column (forward first, then backward for the head).
    for col in eth_cols:
        if col in df.columns:
            df[col] = df[col].ffill().bfill()

    if eth_coverage > 0.5:
        has_eth_data = True
        print(f"  ✓ ETH price data integrated ({eth_source})")
    else:
        print("  ⚠️ ETH price coverage too low; ETH features will be skipped")
else:
    print("  ⚠️ No ETH price data available")
    df['eth_price'] = np.nan
    df['eth_volume'] = np.nan
    df['eth_high'] = np.nan
    df['eth_low'] = np.nan

HAS_ETH_PRICE = has_eth_data

In [None]:
# Feature Engineering - SIMPLIFIED v3 + SPIKE-ADJUSTED TARGETS
# Focus: 15-20 high-value features to prevent overfitting
# NEW: Option for log-transformed or winsorized targets

print("Engineering SIMPLIFIED feature set (15-20 features)...")

# === CONFIGURATION ===
# Both values are passed to create_targets_for_segment further down in this cell.
TARGET_TRANSFORM = "log"  # Options: "none", "log", "winsorize"
WINSORIZE_PERCENTILE = 0.95  # For winsorize: cap at this percentile

def engineer_features_for_segment(seg_df, has_eth=False, horizon='all', rph=120):
    """Engineer focused feature set - quality over quantity.

    All lags and rolling windows are computed on the rows of `seg_df` only, so
    the caller should pass one contiguous segment at a time.

    Args:
        seg_df: DataFrame with a DatetimeIndex and a `gas` column. The
            optional columns `eth_price` and `utilization` enable the ETH and
            utilization features.
        has_eth: When True (and `eth_price` has at least one value) the ETH
            features are added.
        horizon: Accepted for compatibility with existing callers; it is not
            used by this function.
        rph: Records per hour of the input. The default of 120 matches
            30-second sampling; every lag and window length is derived from it.

    Returns:
        A copy of `seg_df` with the feature columns appended. Rows at the
        start of the segment hold NaN for lags and rolling statistics that do
        not have enough history yet.
    """
    df = seg_df.copy()
    half_hour = rph // 2

    # === TIME FEATURES (3 features) ===
    df['hour'] = df.index.hour
    hour_of_day = df.index.hour + df.index.minute / 60
    df['hour_sin'] = np.sin(2 * np.pi * hour_of_day / 24)
    df['hour_cos'] = np.cos(2 * np.pi * hour_of_day / 24)

    # === ETH FEATURES (3 features) ===
    if has_eth and 'eth_price' in df.columns and df['eth_price'].notna().any():
        df['eth_log'] = np.log1p(df['eth_price'])
        eth_mean = df['eth_price'].rolling(4 * rph, min_periods=rph).mean()
        eth_std = df['eth_price'].rolling(4 * rph, min_periods=rph).std()
        # NaN or near-zero std compares False, so those rows get a z-score of 0.
        df['eth_zscore_4h'] = np.where(eth_std > 0.01, (df['eth_price'] - eth_mean) / eth_std, 0)
        df['gas_eth_corr_1h'] = df['gas'].rolling(rph, min_periods=half_hour).corr(df['eth_price']).fillna(0)

    # === NETWORK UTILIZATION (2 features) ===
    if 'utilization' in df.columns:
        df['util_mean_1h'] = df['utilization'].rolling(rph, min_periods=half_hour).mean()
        df['util_mean_2h'] = df['utilization'].rolling(2 * rph, min_periods=rph).mean()

    # === GAS LAG FEATURES (5 features) ===
    # Sub-hour lags are fractions of an hour; with rph=120 they are 10/30/60 rows.
    df['gas_lag_5min'] = df['gas'].shift(max(1, rph // 12))
    df['gas_lag_15min'] = df['gas'].shift(max(1, rph // 4))
    df['gas_lag_30min'] = df['gas'].shift(max(1, rph // 2))
    df['gas_lag_1h'] = df['gas'].shift(rph)
    df['gas_lag_4h'] = df['gas'].shift(4 * rph)

    # === ROLLING STATS (5 features) ===
    df['gas_mean_1h'] = df['gas'].rolling(rph, min_periods=half_hour).mean()
    df['gas_std_1h'] = df['gas'].rolling(rph, min_periods=half_hour).std()
    df['gas_cv_1h'] = np.where(df['gas_mean_1h'] > 0.01,
                               df['gas_std_1h'] / df['gas_mean_1h'], 0)
    df['gas_mean_2h'] = df['gas'].rolling(2 * rph, min_periods=rph).mean()
    df['gas_mean_4h'] = df['gas'].rolling(4 * rph, min_periods=rph).mean()

    # === MOMENTUM (3 features) ===
    df['momentum_1h'] = df['gas'] - df['gas'].shift(rph)
    shift_2h = df['gas'].shift(2 * rph)
    df['momentum_pct_2h'] = np.where(shift_2h > 0.01, (df['gas'] - shift_2h) / shift_2h, 0)
    df['trend_1h_4h'] = np.where(df['gas_mean_4h'] > 0.01, df['gas_mean_1h'] / df['gas_mean_4h'], 1.0)

    # === Z-SCORE AND REGIME (3 features) ===
    df['gas_zscore_1h'] = np.where(df['gas_std_1h'] > 0.001,
                                   (df['gas'] - df['gas_mean_1h']) / df['gas_std_1h'], 0)
    # Comparisons against NaN thresholds are False, so the flags default to 0.
    df['is_spike'] = (df['gas'] > df['gas_mean_1h'] + 2 * df['gas_std_1h']).astype(int)
    df['is_high_gas'] = (df['gas'] > df['gas'].rolling(4 * rph, min_periods=rph).quantile(0.9)).astype(int)

    return df

# Run feature engineering one segment at a time so that lags and rolling
# windows never reach across a gap in the data.
print("\nProcessing segments...")
segments = df['segment'].unique()
processed_segments = [
    engineer_features_for_segment(
        df.loc[df['segment'] == seg_id].copy(),
        has_eth=has_eth_data,
        horizon='all',
    )
    for seg_id in segments
]

# Stack the per-segment frames back into a single table.
df_features = pd.concat(processed_segments, axis=0)
print(f"After feature engineering: {len(df_features):,} records")

# Create targets
print("\nCreating prediction targets...")

def create_targets_for_segment(seg_df, transform="none", winsorize_pct=0.95, rph=120):
    """Create target variables with optional transformation.

    Targets are the `gas` value 1, 4 and 24 hours ahead within the segment.
    Rows too close to the end of the segment have no future value and keep
    NaN targets (and a missing direction class).

    Args:
        seg_df: DataFrame for one contiguous segment with a `gas` column.
        transform: "log" applies log1p to the targets, "winsorize" caps them
            at the `winsorize_pct` quantile of this segment, anything else
            leaves them untouched.
        winsorize_pct: Quantile used as the upper cap for "winsorize".
        rph: Records per hour of the input (120 for 30-second sampling).

    Returns:
        A copy of `seg_df` with `target_{1h,4h,24h}`, the untransformed
        `target_{1h,4h,24h}_raw` (now present for every transform so later
        code can rely on them) and `direction_class_{1h,4h}`.
    """
    df = seg_df.copy()

    # Raw future prices
    raw = {
        '1h': df['gas'].shift(-rph),
        '4h': df['gas'].shift(-4 * rph),
        '24h': df['gas'].shift(-24 * rph),
    }

    # Apply transformation
    if transform == "log":
        # Log transform - better for multiplicative changes
        for label, future in raw.items():
            df[f'target_{label}'] = np.log1p(future)
    elif transform == "winsorize":
        # Winsorize - cap extreme values
        cap_1h = raw['1h'].quantile(winsorize_pct)
        cap_4h = raw['4h'].quantile(winsorize_pct)
        # With too few 24h samples the quantile is unreliable; reuse the 4h cap.
        cap_24h = raw['24h'].quantile(winsorize_pct) if raw['24h'].notna().sum() > 100 else cap_4h
        df['target_1h'] = raw['1h'].clip(upper=cap_1h)
        df['target_4h'] = raw['4h'].clip(upper=cap_4h)
        df['target_24h'] = raw['24h'].clip(upper=cap_24h)
        print(f"  Winsorized caps: 1h={cap_1h:.2f}, 4h={cap_4h:.2f}")
    else:
        # No transform
        for label, future in raw.items():
            df[f'target_{label}'] = future

    # Always store the raw values for evaluation, whatever the transform.
    for label, future in raw.items():
        df[f'target_{label}_raw'] = future

    # Direction classification (always on raw)
    threshold = 0.02
    for horizon in ['1h', '4h']:
        raw_target = raw[horizon]
        pct_change = np.where(df['gas'] > 0.001,
                              (raw_target - df['gas']) / df['gas'], 0.0)
        # Without a future price there is no direction: keep the label missing
        # instead of letting the near-zero-gas fallback mark it as "stable".
        pct_change = np.where(raw_target.isna(), np.nan, pct_change)
        df[f'direction_class_{horizon}'] = pd.cut(
            pct_change,
            bins=[-float('inf'), -threshold, threshold, float('inf')],
            labels=['down', 'stable', 'up']
        )

    return df

print(f"Target transform: {TARGET_TRANSFORM}")
# Targets are built per segment so a future value is never taken from across a gap.
processed_with_targets = []
for seg_id in df_features['segment'].unique():
    seg_df = df_features[df_features['segment'] == seg_id].copy()
    processed = create_targets_for_segment(seg_df, transform=TARGET_TRANSFORM, winsorize_pct=WINSORIZE_PERCENTILE)
    processed_with_targets.append(processed)

df_features = pd.concat(processed_with_targets, axis=0)

# Store transform info for later use
TARGET_TRANSFORM_USED = TARGET_TRANSFORM

# === CLEAN INF/NAN VALUES ===
print("\nCleaning inf/nan values...")

# Labels and the segment id must not be clipped or filled. A missing 1h/4h
# target means "the future is unknown" (tail of a segment); forward-filling it
# would invent a label and the later dropna on the targets would drop nothing.
# NOTE(review): target_24h / target_24h_raw are still cleaned like features
# here because later cells assert that no float column contains NaN - confirm
# how the 24h model should treat rows without a 24h future value.
label_cols = [
    c for c in [
        'segment',
        'target_1h', 'target_4h', 'target_1h_raw', 'target_4h_raw',
        'direction_class_1h', 'direction_class_4h',
    ]
    if c in df_features.columns
]
numeric_cols = [
    c for c in df_features.select_dtypes(include=[np.number]).columns
    if c not in label_cols
]

# Replace infinities and clip each column to its 0.1% - 99.9% range.
for col in numeric_cols:
    df_features[col] = df_features[col].replace([np.inf, -np.inf], np.nan)
    if df_features[col].notna().sum() > 0:
        q_low = df_features[col].quantile(0.001)
        q_high = df_features[col].quantile(0.999)
        df_features[col] = df_features[col].clip(q_low, q_high)

# Fill gaps in everything except the protected label columns.
fill_cols = [c for c in df_features.columns if c not in label_cols]
df_features[fill_cols] = df_features[fill_cols].ffill().bfill()

# Columns that are still empty after filling fall back to the median, then 0.
for col in numeric_cols:
    if df_features[col].isna().any():
        median_val = df_features[col].median()
        if pd.isna(median_val):
            median_val = 0
        df_features[col] = df_features[col].fillna(median_val)

inf_count = np.isinf(df_features[numeric_cols]).sum().sum()
nan_count = df_features[numeric_cols].isna().sum().sum()
print(f"  After cleaning: {inf_count} inf, {nan_count} nan values")

rows_with_targets = df_features[['target_1h', 'target_4h']].notna().all(axis=1).sum()
print(f"  Rows with known 1h/4h targets: {rows_with_targets:,}")

# === DEFINE FOCUSED FEATURE SET ===
CORE_FEATURES = [
    'hour', 'hour_sin', 'hour_cos',
    'eth_log', 'eth_zscore_4h', 'gas_eth_corr_1h',
    'util_mean_1h', 'util_mean_2h',
    'gas_lag_5min', 'gas_lag_15min', 'gas_lag_30min', 'gas_lag_1h', 'gas_lag_4h',
    'gas_mean_1h', 'gas_std_1h', 'gas_cv_1h', 'gas_mean_2h', 'gas_mean_4h',
    'momentum_1h', 'momentum_pct_2h', 'trend_1h_4h',
    'gas_zscore_1h', 'is_spike', 'is_high_gas'
]

# Features that were not produced (e.g. ETH columns without ETH data) are skipped.
available_features = [f for f in CORE_FEATURES if f in df_features.columns]
features_1h = available_features
features_4h = available_features
features_24h = available_features

print(f"\n✓ Focused feature set: {len(available_features)} features")
print(f"  Features: {', '.join(available_features)}")


In [None]:
# Prepare training data - with AUTO-ADAPT to distribution shift
from sklearn.preprocessing import RobustScaler
from scipy import stats

# === CONFIGURATION ===
USE_ROLLING_WINDOW = False  # Set True to use only recent data (AUTO-ENABLED if shift detected)
ROLLING_WINDOW_DAYS = 7     # Days of data to use if rolling window enabled
HOLDOUT_HOURS = 48          # Hours to reserve for holdout
AUTO_ADAPT_ON_SHIFT = True  # Automatically adapt when distribution shift detected


def _numeric_subset(columns):
    """Return the names in `columns` whose dtype in df_features is numeric."""
    return df_features[columns].select_dtypes(include=[np.number]).columns.tolist()


# One numeric feature list per prediction horizon.
numeric_features_1h = _numeric_subset(features_1h)
numeric_features_4h = _numeric_subset(features_4h)
numeric_features_24h = _numeric_subset(features_24h)

print(f"Numeric features: 1h={len(numeric_features_1h)}, 4h={len(numeric_features_4h)}, 24h={len(numeric_features_24h)}")

# Rows are dropped only when the 1h or 4h target is missing; the 24h target
# is merely counted.
target_cols = ['target_1h', 'target_4h']
df_clean = df_features.dropna(subset=target_cols)
print(f"Clean samples (with valid 1h/4h targets): {len(df_clean):,}")

valid_24h = df_features['target_24h'].notna().sum()
print(f"Samples with valid 24h target: {valid_24h:,}")

# === OUT-OF-TIME HOLDOUT (do this FIRST to detect shift) ===
rph = 120  # records per hour
holdout_size = HOLDOUT_HOURS * rph

# A holdout is carved off only when more than 5000 rows would remain for training.
HAS_HOLDOUT = len(df_clean) > holdout_size + 5000
if not HAS_HOLDOUT:
    df_train_val_initial = df_clean
    df_holdout = None
    print(f"\n⚠️ Not enough data for holdout, using all for training")
else:
    df_train_val_initial = df_clean.iloc[:-holdout_size]
    df_holdout = df_clean.iloc[-holdout_size:]
    print(f"\n✓ Out-of-time holdout: {len(df_holdout):,} samples (last {HOLDOUT_HOURS}h)")

# === DISTRIBUTION SHIFT DETECTION ===
def _evenly_spaced_sample(values, limit):
    """Return at most `limit` items of `values`, picked at evenly spaced positions."""
    if len(values) <= limit:
        return values
    positions = np.linspace(0, len(values) - 1, limit).astype(int)
    return values[positions]


def detect_distribution_shift(train_data, holdout_data, name="", max_ks_samples=5000):
    """Detect distribution shift between train and holdout.

    Four checks are run: mean shift (in training standard deviations),
    variance ratio, a two-sample Kolmogorov-Smirnov test, and how often the
    holdout exceeds the training 95th percentile.

    Args:
        train_data: pandas Series of training values.
        holdout_data: pandas Series of holdout values.
        name: Label stored in the result.
        max_ks_samples: Upper bound on the sample size per side for the KS
            test. Larger inputs are thinned at evenly spaced positions so the
            whole period is represented, not only its earliest rows.

    Returns:
        Dict with `name`, `warnings` (list of str), `passed` (bool),
        `shift_magnitude`, `mean_shift_std`, `var_ratio`, `ks_statistic`,
        `ks_pvalue` and `spike_ratio`. When either input is empty the
        statistics are NaN, no warning is raised and `passed` stays True.
    """
    results = {'name': name, 'warnings': [], 'passed': True, 'shift_magnitude': 0}

    # Nothing can be compared without data on both sides (ks_2samp would raise).
    if len(train_data) == 0 or len(holdout_data) == 0:
        for key in ('mean_shift_std', 'var_ratio', 'ks_statistic', 'ks_pvalue', 'spike_ratio'):
            results[key] = np.nan
        return results

    # --- Mean shift, expressed in training standard deviations ---
    train_mean, train_std = train_data.mean(), train_data.std()
    holdout_mean = holdout_data.mean()
    mean_shift = abs(holdout_mean - train_mean) / (train_std + 1e-8)
    results['mean_shift_std'] = mean_shift

    if mean_shift > 1.0:
        results['warnings'].append(f"Large mean shift: {mean_shift:.2f} std devs")
        results['passed'] = False
        results['shift_magnitude'] = max(results['shift_magnitude'], mean_shift)
    elif mean_shift > 0.5:
        # Moderate shifts are reported but do not fail the check.
        results['warnings'].append(f"Moderate mean shift: {mean_shift:.2f} std devs")

    # --- Variance ratio ---
    var_ratio = holdout_data.var() / (train_data.var() + 1e-8)
    results['var_ratio'] = var_ratio

    if var_ratio > 4 or var_ratio < 0.25:
        results['warnings'].append(f"Large variance change: {var_ratio:.2f}x")
        results['passed'] = False
        results['shift_magnitude'] = max(results['shift_magnitude'], abs(np.log(var_ratio)))

    # --- Kolmogorov-Smirnov test on representative subsamples ---
    ks_stat, ks_pval = stats.ks_2samp(
        _evenly_spaced_sample(train_data.values, max_ks_samples),
        _evenly_spaced_sample(holdout_data.values, max_ks_samples),
    )
    results['ks_statistic'] = ks_stat
    results['ks_pvalue'] = ks_pval

    if ks_pval < 0.001 and ks_stat > 0.3:
        results['warnings'].append(f"KS test: distributions differ significantly")
        results['passed'] = False
        results['shift_magnitude'] = max(results['shift_magnitude'], ks_stat * 3)

    # --- Spike frequency relative to the training 95th percentile ---
    spike_level = train_data.quantile(0.95)
    train_spikes = (train_data > spike_level).mean()
    holdout_spikes = (holdout_data > spike_level).mean()
    spike_ratio = holdout_spikes / (train_spikes + 1e-8)
    results['spike_ratio'] = spike_ratio

    if spike_ratio > 3:
        results['warnings'].append(f"Spike frequency {spike_ratio:.1f}x higher in holdout")
        results['passed'] = False
        results['shift_magnitude'] = max(results['shift_magnitude'], spike_ratio / 2)

    return results

# Notebook-level shift flags; they are only updated when a holdout set exists.
DISTRIBUTION_SHIFT_DETECTED = False
SHIFT_MAGNITUDE = 0

if HAS_HOLDOUT:
    print(f"\n{'='*60}")
    print("DISTRIBUTION SHIFT DETECTION")
    print(f"{'='*60}")

    for horizon in ['1h', '4h']:
        target_col = f'target_{horizon}'
        train_targets = df_train_val_initial[target_col].dropna()
        holdout_targets = df_holdout[target_col].dropna()

        shift_result = detect_distribution_shift(train_targets, holdout_targets, f"{horizon} target")

        if shift_result['passed']:
            status = "✓ OK"
        else:
            status = "⚠️ SHIFT DETECTED"
        print(f"\n{horizon}: {status}")
        print(f"  Train:   mean={train_targets.mean():.4f}, std={train_targets.std():.4f}")
        print(f"  Holdout: mean={holdout_targets.mean():.4f}, std={holdout_targets.std():.4f}")
        print(f"  Mean shift: {shift_result['mean_shift_std']:.2f} std, Var ratio: {shift_result['var_ratio']:.2f}x")

        if not shift_result['warnings']:
            continue

        # Any warning - including a "moderate" one that still passed - marks
        # the run as shifted.
        for w in shift_result['warnings']:
            print(f"  ⚠️ {w}")
        DISTRIBUTION_SHIFT_DETECTED = True
        SHIFT_MAGNITUDE = max(SHIFT_MAGNITUDE, shift_result['shift_magnitude'])

# === AUTO-ADAPT TO DISTRIBUTION SHIFT ===
# Start from all pre-holdout rows; the branches below may narrow this to the
# most recent window.
df_train_val = df_train_val_initial

if DISTRIBUTION_SHIFT_DETECTED and AUTO_ADAPT_ON_SHIFT:
    print(f"\n{'='*60}")
    print("AUTO-ADAPTING TO DISTRIBUTION SHIFT")
    print(f"{'='*60}")

    # Stronger shift -> shorter window: 3 days (severe), 5 (moderate), 7 (mild).
    adaptive_days = 3 if SHIFT_MAGNITUDE > 2 else (5 if SHIFT_MAGNITUDE > 1 else 7)
    window_samples = adaptive_days * 24 * rph

    if len(df_train_val_initial) <= window_samples:
        print(f"⚠️ Not enough data for adaptive window, using all training data")
    else:
        df_train_val = df_train_val_initial.iloc[-window_samples:]
        print(f"✓ Auto-enabled rolling window: {adaptive_days} days ({len(df_train_val):,} samples)")
        print(f"  Shift magnitude: {SHIFT_MAGNITUDE:.2f} → window: {adaptive_days} days")
        USE_ROLLING_WINDOW = True
        ROLLING_WINDOW_DAYS = adaptive_days
elif USE_ROLLING_WINDOW:
    # Manual rolling window
    window_samples = ROLLING_WINDOW_DAYS * 24 * rph
    if len(df_train_val_initial) > window_samples:
        df_train_val = df_train_val_initial.iloc[-window_samples:]
        print(f"\n✓ Rolling window: Using last {ROLLING_WINDOW_DAYS} days ({len(df_train_val):,} samples)")

print(f"\nFinal training set: {len(df_train_val):,} samples")

# Final safety check
# df_train_val may be an .iloc slice of df_clean; take an explicit copy so the
# column assignments below neither warn about chained assignment nor depend on
# view/copy semantics.
df_train_val = df_train_val.copy()

float_cols = df_train_val.select_dtypes(include=[np.float64, np.float32, float]).columns
for col in float_cols:
    cleaned = df_train_val[col].replace([np.inf, -np.inf], np.nan)
    if cleaned.isna().any():
        fill_value = cleaned.median()
        # An entirely empty column has a NaN median; fall back to 0 (the same
        # fallback the earlier cleaning step uses) so the check below can pass.
        if pd.isna(fill_value):
            fill_value = 0
        cleaned = cleaned.fillna(fill_value)
    df_train_val[col] = cleaned

has_inf = any(np.isinf(df_train_val[col]).any() for col in float_cols)
has_nan = any(np.isnan(df_train_val[col]).any() for col in float_cols)
assert not has_inf, "Data still contains inf!"
assert not has_nan, "Data still contains nan!"
print("✓ Data validated: no inf/nan values")

# Prepare feature matrices
X_1h = df_train_val[numeric_features_1h]
X_4h = df_train_val[numeric_features_4h]
X_24h = df_train_val[numeric_features_24h]

# Regression targets per horizon
y_1h = df_train_val['target_1h']
y_4h = df_train_val['target_4h']
y_24h = df_train_val['target_24h']

# Direction labels (down / stable / up)
y_dir_1h = df_train_val['direction_class_1h']
y_dir_4h = df_train_val['direction_class_4h']

# Gas price at prediction time, used for the naive baseline
current_gas = df_train_val['gas']

# === BASELINE MODELS (on both train AND holdout) ===
print(f"\n{'='*60}")
print("BASELINE COMPARISONS")
print(f"{'='*60}")


def _to_target_space(gas_values):
    """Put raw gas prices on the same scale as the (possibly transformed) targets."""
    if TARGET_TRANSFORM_USED == "log":
        return np.log1p(gas_values)
    return gas_values


# Naive baseline: "the price stays where it is now". The current price has to
# be expressed in target space, otherwise a log-transformed target would be
# compared against a raw price and the baseline error would be meaningless.
naive_pred = _to_target_space(current_gas.values)
naive_mae_1h = np.mean(np.abs(y_1h.values - naive_pred))
naive_mae_4h = np.mean(np.abs(y_4h.values - naive_pred))

# Mean baseline: each horizon is scored against the mean of ITS OWN target.
mean_pred = np.full_like(y_1h.values, y_1h.mean())
mean_pred_4h = np.full_like(y_4h.values, y_4h.mean())
mean_mae_1h = np.mean(np.abs(y_1h.values - mean_pred))
mean_mae_4h = np.mean(np.abs(y_4h.values - mean_pred_4h))

print(f"\nTRAINING SET Baseline MAEs:")
print(f"  Naive (current price):     MAE_1h={naive_mae_1h:.6f}, MAE_4h={naive_mae_4h:.6f}")
print(f"  Mean (historical average): MAE_1h={mean_mae_1h:.6f}, MAE_4h={mean_mae_4h:.6f}")

best_baseline_1h = min(naive_mae_1h, mean_mae_1h)
best_baseline_4h = min(naive_mae_4h, mean_mae_4h)

BASELINES = {
    '1h': {'naive_mae': naive_mae_1h, 'mean_mae': mean_mae_1h, 'best': best_baseline_1h},
    '4h': {'naive_mae': naive_mae_4h, 'mean_mae': mean_mae_4h, 'best': best_baseline_4h}
}

# === HOLDOUT BASELINES ===
if HAS_HOLDOUT:
    print(f"\nHOLDOUT SET Baseline MAEs:")
    holdout_gas = df_holdout['gas']

    for horizon in ['1h', '4h']:
        holdout_target = df_holdout[f'target_{horizon}'].dropna()
        holdout_current = holdout_gas.loc[holdout_target.index]

        # Same target-space conversion as for the training baseline.
        holdout_naive_mae = np.mean(np.abs(holdout_target.values - _to_target_space(holdout_current.values)))
        # The mean baseline may only use information from the training period.
        train_mean = df_train_val[f'target_{horizon}'].mean()
        holdout_mean_mae = np.mean(np.abs(holdout_target.values - train_mean))

        holdout_best = min(holdout_naive_mae, holdout_mean_mae)
        BASELINES[horizon]['holdout_naive_mae'] = holdout_naive_mae
        BASELINES[horizon]['holdout_mean_mae'] = holdout_mean_mae
        BASELINES[horizon]['holdout_best'] = holdout_best

        print(f"  {horizon}: Naive={holdout_naive_mae:.6f}, Mean={holdout_mean_mae:.6f}, Best={holdout_best:.6f}")

    # A holdout baseline more than twice as bad as the training one is reported
    # as a regime change.
    for horizon in ['1h', '4h']:
        train_best = BASELINES[horizon]['best']
        holdout_best = BASELINES[horizon]['holdout_best']
        ratio = holdout_best / (train_best + 1e-8)
        if ratio > 2:
            print(f"\n  ⚠️ {horizon}: Holdout baseline {ratio:.1f}x worse than train - regime change!")

FEATURE_IMPORTANCE = {}

print(f"\n{'='*60}")
print("TRAINING DATA SUMMARY")
print(f"{'='*60}")
print(f"Training samples: {len(df_train_val):,}")
print(f"Holdout samples: {len(df_holdout) if df_holdout is not None else 0:,}")
print(f"Features: {len(numeric_features_1h)}")
print(f"Distribution shift: {DISTRIBUTION_SHIFT_DETECTED} (magnitude: {SHIFT_MAGNITUDE:.2f})")
print(f"Auto-adapt enabled: {AUTO_ADAPT_ON_SHIFT}")
if USE_ROLLING_WINDOW:
    print(f"Rolling window: {ROLLING_WINDOW_DAYS} days")


In [None]:
# Model Training - WITH PERMUTATION IMPORTANCE
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import joblib
import warnings
# Silences every Python warning from here on (library deprecation and
# convergence warnings included).
warnings.filterwarnings('ignore')

# === CONFIGURATION ===
TRAIN_REGIME_MODELS = True
MINIMUM_IMPROVEMENT = 0.05  # Fraction by which a model must beat the baseline MAE (see check_baseline_gate)
HOLDOUT_DEGRADATION_LIMIT = 0.30  # Largest accepted relative MAE increase from CV to holdout (see check_holdout_gate)
COMPUTE_PERMUTATION_IMPORTANCE = True  # Compute for all models

def check_baseline_gate(model_mae, baseline_mae, model_name):
    """Check if model beats baseline by minimum threshold.

    Args:
        model_mae: MAE of the candidate model.
        baseline_mae: MAE of the baseline it is compared against.
        model_name: Accepted for symmetry with check_holdout_gate; not used.

    Returns:
        Tuple `(passed, improvement)` where `improvement` is the relative MAE
        reduction versus the baseline. A baseline that is zero, negative or
        NaN cannot be used as a denominator and yields `(False, 0)`, matching
        how check_holdout_gate treats an unusable CV MAE.
    """
    # `not x > 0` also catches NaN, which `x <= 0` would let through.
    if not baseline_mae > 0:
        print(f"  ✗ FAILED baseline gate: baseline MAE is {baseline_mae}, improvement undefined")
        return False, 0

    improvement = (baseline_mae - model_mae) / baseline_mae
    passed = improvement >= MINIMUM_IMPROVEMENT
    if passed:
        print(f"  ✓ PASSED baseline gate: {improvement*100:.1f}% improvement")
    else:
        print(f"  ✗ FAILED baseline gate: {improvement*100:.1f}% (need {MINIMUM_IMPROVEMENT*100:.0f}%+)")
    return passed, improvement

def check_holdout_gate(cv_mae, holdout_mae, model_name, holdout_baseline=None):
    """Check if holdout performance is acceptable.

    A model passes when it is at least as good as the holdout baseline, or,
    failing that, when its MAE degrades from cross-validation to holdout by
    less than HOLDOUT_DEGRADATION_LIMIT.

    Args:
        cv_mae: Cross-validated MAE on the training period.
        holdout_mae: MAE on the holdout period.
        model_name: Not used by the check itself.
        holdout_baseline: Optional baseline MAE on the holdout period. A value
            of None or one that is not positive is ignored, because it cannot
            serve as a denominator.

    Returns:
        Tuple `(passed, degradation)`; `(False, 0)` when `cv_mae` is not positive.
    """
    if cv_mae <= 0:
        return False, 0
    degradation = (holdout_mae - cv_mae) / cv_mae

    # Beating the holdout baseline is sufficient on its own.
    if holdout_baseline is not None and holdout_baseline > 0:
        holdout_improvement = (holdout_baseline - holdout_mae) / holdout_baseline
        if holdout_improvement >= 0:
            print(f"  ✓ Beats holdout baseline by {holdout_improvement*100:.1f}%")
            return True, degradation

    passed = degradation < HOLDOUT_DEGRADATION_LIMIT
    if passed:
        print(f"  ✓ PASSED holdout gate: {degradation*100:+.1f}% degradation")
    else:
        print(f"  ✗ FAILED holdout gate: {degradation*100:+.1f}% degradation (limit: {HOLDOUT_DEGRADATION_LIMIT*100}%)")
    return passed, degradation

def walk_forward_validate(model_class, model_params, X, y, baseline_mae, n_splits=5, purge_gap=120):
    """Walk-forward validation with purge gap.

    The data is cut into expanding training windows, each followed by
    `purge_gap` skipped rows and a test window of `fold_size` rows. The fold
    size accounts for the purge gap, so all `n_splits` folds fit inside the
    data (previously the final fold always ran past the end and was skipped).

    Args:
        model_class: Estimator class with `fit` / `predict`.
        model_params: Keyword arguments for `model_class`.
        X: Feature DataFrame in time order.
        y: Target Series aligned with `X`.
        baseline_mae: Baseline MAE used for the reported improvement.
        n_splits: Number of train/test folds.
        purge_gap: Rows skipped between the end of training and the start of
            testing in every fold.

    Returns:
        Dict with `avg_mae`, `std_mae` and `improvement` (relative to
        `baseline_mae`, 0.0 when the baseline is not positive), or None when
        the data is too short to form a single fold.
    """
    n = len(X)
    # Reserve room for the purge gap before splitting into n_splits + 1 chunks.
    fold_size = (n - purge_gap) // (n_splits + 1)
    if fold_size <= 0:
        return None

    fold_results = []

    for fold in range(n_splits):
        train_end = fold_size * (fold + 1)
        test_start = train_end + purge_gap
        test_end = test_start + fold_size

        if test_end > n:
            break

        X_train = X.iloc[:train_end]
        X_test = X.iloc[test_start:test_end]
        y_train = y.iloc[:train_end]
        y_test = y.iloc[test_start:test_end]

        # The scaler is fitted on the training window only.
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = model_class(**model_params)
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_test_scaled)
        fold_results.append(mean_absolute_error(y_test, y_pred))

    if not fold_results:
        return None

    avg_mae = np.mean(fold_results)
    return {
        'avg_mae': avg_mae,
        'std_mae': np.std(fold_results),
        'improvement': (baseline_mae - avg_mae) / baseline_mae if baseline_mae > 0 else 0.0
    }

def get_models_to_try():
    """Return the candidate regressors as `(label, estimator_class, params)` tuples.

    The list holds two linear models (Ridge, Huber) and two small tree
    ensembles (random forest, gradient boosting).
    """
    ridge_params = {'alpha': 1.0, 'random_state': 42}
    huber_params = {'epsilon': 1.35, 'alpha': 0.1, 'max_iter': 1000}
    forest_params = {
        'n_estimators': 30, 'max_depth': 4, 'min_samples_leaf': 20,
        'random_state': 42, 'n_jobs': -1
    }
    boosting_params = {
        'n_estimators': 30, 'max_depth': 3, 'learning_rate': 0.1,
        'min_samples_leaf': 20, 'random_state': 42
    }

    candidates = []
    candidates.append(('Ridge', Ridge, ridge_params))
    candidates.append(('Huber', HuberRegressor, huber_params))
    candidates.append(('RF', RandomForestRegressor, forest_params))
    candidates.append(('GBM', GradientBoostingRegressor, boosting_params))
    return candidates

def compute_permutation_importance(model, X, y, scaler, feature_names, n_repeats=5):
    """Compute permutation importance for any model type.

    Args:
        model: Fitted estimator.
        X: Unscaled feature matrix; it is passed through `scaler.transform`.
        y: Targets aligned with `X`.
        scaler: Fitted scaler matching the one the model was trained with.
        feature_names: Names of the columns of `X`, in column order.
        n_repeats: Number of shuffles per feature.

    Returns:
        Dict mapping feature name to importance (higher = more important).
        Negative importances (shuffling the feature did not hurt) are set to
        0, and the values are normalised to sum to 1 whenever at least one
        feature has a positive importance.
    """
    X_scaled = scaler.transform(X)

    # Use sklearn's permutation_importance
    result = permutation_importance(
        model, X_scaled, y,
        n_repeats=n_repeats,
        random_state=42,
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )

    # importances_mean is "baseline score - shuffled score". With neg-MAE as
    # the score this equals the MAE increase caused by shuffling, so it is
    # already positive for useful features and must NOT be negated.
    importance_dict = {
        feat: max(float(mean_drop), 0.0)
        for feat, mean_drop in zip(feature_names, result.importances_mean)
    }

    # Normalize to sum to 1
    total = sum(importance_dict.values())
    if total > 0:
        importance_dict = {k: v / total for k, v in importance_dict.items()}

    return importance_dict

def train_model_with_holdout(X_train, y_train, X_holdout, y_holdout, baseline_mae, 
                             horizon_name, feature_names, holdout_baseline=None):
    """Train every candidate model and pick the one with the lowest HOLDOUT MAE.

    For each candidate from ``get_models_to_try()`` the function:
      1. runs ``walk_forward_validate`` on the training data (4 splits,
         purge gap 120) to get a cross-validated MAE,
      2. refits a fresh ``RobustScaler`` + model on the full training data,
      3. scores the model on the holdout set,
      4. keeps it if it clears the baseline gate, or clears the holdout gate
         while also beating ``baseline_mae`` on the holdout.

    If no candidate is accepted, a Huber regressor is fitted as a fallback.

    Args:
        X_train, y_train: Training features / targets.
        X_holdout, y_holdout: Holdout features / targets used for selection.
        baseline_mae: Baseline MAE measured on the training data.
        horizon_name: Label used only in printed output.
        feature_names: Column names, used for the importance dictionary.
        holdout_baseline: Optional baseline MAE measured on the holdout. A
            falsy value (``None`` or ``0``) is treated as "not provided".

    Returns:
        ``(model, scaler, metrics, importance)`` where ``metrics`` is a dict
        with at least ``name``, ``mae``, ``improvement``,
        ``vs_holdout_baseline``, ``passed_baseline`` and ``is_fallback``, and
        ``importance`` maps feature name -> importance (possibly empty).

    NOTE(review): the holdout set is used both to select the best candidate
    and to report its MAE, so the reported MAE is optimistic by construction.
    """
    print(f"\n{'='*60}")
    print(f"Training {horizon_name} model")
    print(f"{'='*60}")
    print(f"Train: {len(X_train):,}, Holdout: {len(X_holdout):,}, Features: {X_train.shape[1]}")
    print(f"Train baseline: {baseline_mae:.6f}", end="")
    if holdout_baseline:
        print(f", Holdout baseline: {holdout_baseline:.6f}")
    else:
        # Terminate the line started by the end="" print above.
        print()
    
    models_to_try = get_models_to_try()
    results = []
    
    for name, model_class, params in models_to_try:
        print(f"\n[{name}]")
        try:
            wf_result = walk_forward_validate(model_class, params, X_train, y_train, baseline_mae, n_splits=4, purge_gap=120)
            if not wf_result:
                # walk_forward_validate returned nothing usable; skip candidate.
                continue
                
            cv_mae = wf_result['avg_mae']
            print(f"  CV MAE: {cv_mae:.6f} ± {wf_result['std_mae']:.6f}")
            
            # Refit on the full training set; the scaler is fitted on training
            # data only and merely applied to the holdout.
            scaler = RobustScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_holdout_scaled = scaler.transform(X_holdout)
            
            model = model_class(**params)
            model.fit(X_train_scaled, y_train)
            
            y_holdout_pred = model.predict(X_holdout_scaled)
            holdout_mae = mean_absolute_error(y_holdout, y_holdout_pred)
            # Relative improvement over the TRAIN baseline (positive = better).
            holdout_improvement = (baseline_mae - holdout_mae) / baseline_mae
            
            # Calculate vs holdout baseline
            if holdout_baseline:
                vs_holdout = (holdout_baseline - holdout_mae) / holdout_baseline
                print(f"  HOLDOUT MAE: {holdout_mae:.6f} ({vs_holdout*100:+.1f}% vs holdout baseline)")
            else:
                print(f"  HOLDOUT MAE: {holdout_mae:.6f} ({holdout_improvement*100:+.1f}% vs train baseline)")
            
            # Gate against the holdout baseline when available, else the train one.
            # check_baseline_gate / check_holdout_gate are defined elsewhere in the file.
            use_baseline = holdout_baseline if holdout_baseline else baseline_mae
            passed_baseline, _ = check_baseline_gate(holdout_mae, use_baseline, name)
            passed_holdout, degradation = check_holdout_gate(cv_mae, holdout_mae, name, holdout_baseline)
            
            # Second acceptance path compares against the TRAIN baseline even
            # when a holdout baseline was supplied.
            if passed_baseline or (passed_holdout and holdout_improvement > 0):
                results.append({
                    'name': name, 'model_class': model_class, 'params': params,
                    'cv_mae': cv_mae, 'holdout_mae': holdout_mae,
                    'holdout_improvement': holdout_improvement,
                    'vs_holdout_baseline': (holdout_baseline - holdout_mae) / holdout_baseline if holdout_baseline else None,
                    'model': model, 'scaler': scaler
                })
                print(f"  → Accepted")
            else:
                print(f"  → Rejected")
                
        except Exception as e:
            # Best effort: a failing candidate is reported and skipped so the
            # remaining candidates still get a chance.
            print(f"  Failed: {e}")
    
    if not results:
        # No candidate was accepted (rejected by the gates or raised).
        print("\n⚠️ All models failed! Using Huber fallback...")
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        model = HuberRegressor(epsilon=1.35, alpha=0.1, max_iter=1000)
        model.fit(X_train_scaled, y_train)
        
        y_holdout_pred = model.predict(scaler.transform(X_holdout))
        holdout_mae = mean_absolute_error(y_holdout, y_holdout_pred)
        
        importance = {}
        if COMPUTE_PERMUTATION_IMPORTANCE:
            print("  Computing permutation importance...")
            importance = compute_permutation_importance(model, X_holdout, y_holdout, scaler, feature_names)
        
        return model, scaler, {
            'name': 'Huber (fallback)',
            'mae': holdout_mae,
            'improvement': (baseline_mae - holdout_mae) / baseline_mae,
            'vs_holdout_baseline': (holdout_baseline - holdout_mae) / holdout_baseline if holdout_baseline else None,
            'passed_baseline': False,
            'is_fallback': True
        }, importance
    
    best = min(results, key=lambda x: x['holdout_mae'])
    print(f"\n>>> Best: {best['name']} (Holdout MAE: {best['holdout_mae']:.6f})")
    
    # Compute permutation importance for the best model
    importance = {}
    if COMPUTE_PERMUTATION_IMPORTANCE:
        print("  Computing permutation importance...")
        importance = compute_permutation_importance(best['model'], X_holdout, y_holdout, best['scaler'], feature_names)
        top_3 = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:3]
        print(f"  Top features: {', '.join([f'{f[0]}({f[1]:.2f})' for f in top_3])}")
    elif hasattr(best['model'], 'feature_importances_'):
        # Tree models: use the built-in impurity-based importances.
        importance = dict(zip(feature_names, best['model'].feature_importances_))
    elif hasattr(best['model'], 'coef_'):
        # Linear models: absolute coefficient on the scaled features.
        importance = dict(zip(feature_names, np.abs(best['model'].coef_)))
    
    # NOTE(review): 'passed_baseline' is hard-coded to True here, although a
    # model may have been accepted through the holdout-gate path with
    # passed_baseline == False. Confirm whether this flag is meant to be
    # "accepted" or literally "cleared check_baseline_gate".
    return best['model'], best['scaler'], {
        'name': best['name'],
        'mae': best['holdout_mae'],
        'cv_mae': best['cv_mae'],
        'improvement': best['holdout_improvement'],
        'vs_holdout_baseline': best['vs_holdout_baseline'],
        'passed_baseline': True,
        'is_fallback': False
    }, importance

def train_regime_models(X_train, y_train, X_holdout, y_holdout, regime_train, regime_holdout,
                        baseline_mae, horizon_name, feature_names, holdout_baseline=None):
    """Train a separate model for each market regime.

    Regimes are encoded as 0 = normal, 1 = elevated, 2 = spike. For every
    regime the rows whose label equals that value are selected from the
    training and holdout sets and handed to ``train_model_with_holdout``.
    A regime is skipped when it has fewer than 500 training rows or fewer
    than 100 holdout rows.

    Args:
        X_train, y_train: Training features / targets.
        X_holdout, y_holdout: Holdout features / targets.
        regime_train, regime_holdout: Regime label per training / holdout row;
            compared with ``==`` and used as boolean masks, so they must be
            alignable with the corresponding X / y objects.
        baseline_mae: Train baseline MAE, forwarded unchanged for every regime.
        horizon_name: Horizon label, used to build the per-regime model name.
        feature_names: Feature column names.
        holdout_baseline: Optional holdout baseline MAE, forwarded unchanged.

    Returns:
        dict mapping regime value -> ``{'model', 'scaler', 'metrics',
        'regime_name', 'n_samples'}`` for every regime that was trained.

    NOTE(review): the same global baselines are used for every regime; a
    regime-specific baseline may be more appropriate — confirm.
    """
    print(f"\n{'='*60}")
    print(f"Training REGIME-SPECIFIC {horizon_name} models")
    print(f"{'='*60}")

    regime_models = {}

    for regime_val, regime_name in [(0, 'normal'), (1, 'elevated'), (2, 'spike')]:
        train_mask = regime_train == regime_val
        holdout_mask = regime_holdout == regime_val

        n_train = train_mask.sum()
        n_holdout = holdout_mask.sum()

        print(f"\n[{regime_name.upper()}] Train: {n_train}, Holdout: {n_holdout}")

        if n_train < 500 or n_holdout < 100:
            print("  Insufficient data, skipping")
            continue

        # The per-regime feature importance is not used, hence discarded.
        model, scaler, metrics, _ = train_model_with_holdout(
            X_train[train_mask], y_train[train_mask],
            X_holdout[holdout_mask], y_holdout[holdout_mask],
            baseline_mae, f"{horizon_name}_{regime_name}", feature_names, holdout_baseline
        )

        # Explicit None check: truthiness of an estimator is not a reliable
        # "was a model returned" test (ensemble estimators define __len__).
        if model is not None:
            regime_models[regime_val] = {
                'model': model, 'scaler': scaler, 'metrics': metrics,
                'regime_name': regime_name, 'n_samples': n_train
            }

    return regime_models

def print_distribution_diagnostics(y_train, y_holdout, name=""):
    """Print mean and standard deviation of the train and holdout targets.

    Intended as a quick visual check for a shift between the two
    distributions; ``name`` only labels the output.
    """
    train_mean, train_std = y_train.mean(), y_train.std()
    holdout_mean, holdout_std = y_holdout.mean(), y_holdout.std()

    report = [
        f"\n[Distribution - {name}]",
        f"  Train:   mean={train_mean:.4f}, std={train_std:.4f}",
        f"  Holdout: mean={holdout_mean:.4f}, std={holdout_std:.4f}",
    ]
    for line in report:
        print(line)


In [None]:
# Train all models with ENSEMBLE REGIME SWITCHING
# This part of the cell resets the result containers and derives a regime
# label (0 = normal, 1 = elevated, 2 = spike) for every train / holdout row.
print("="*70)
print("TRAINING ALL PREDICTION MODELS")
print("="*70)

# horizon -> {'model', 'scaler', 'metrics', 'features', ...}
trained_models = {}
# horizon -> {regime value -> per-regime model bundle}
regime_specific_models = {}
# horizon -> {feature name -> importance}
all_feature_importance = {}

if not HAS_HOLDOUT or df_holdout is None or len(df_holdout) < 1000:
    print("\n⚠️ WARNING: Limited holdout data")

print(f"\nTraining set: {len(df_train_val):,} samples")
print(f"Holdout set:  {len(df_holdout) if df_holdout is not None else 0:,} samples")
if DISTRIBUTION_SHIFT_DETECTED:
    print(f"⚠️ Distribution shift detected (magnitude: {SHIFT_MAGNITUDE:.2f})")
    if USE_ROLLING_WINDOW:
        print(f"   Auto-adapted to {ROLLING_WINDOW_DAYS}-day rolling window")

# === CREATE REGIME LABELS ===
# Labels are assigned in order, so a later rule overrides an earlier one:
# everything starts as 0, rows with gas_zscore_1h > 1 become 1, and rows
# flagged is_spike == 1 become 2 regardless of their z-score.
print("\nCreating regime labels...")
regime_train = pd.Series(0, index=df_train_val.index)
if 'gas_zscore_1h' in df_train_val.columns:
    regime_train[df_train_val['gas_zscore_1h'] > 1] = 1
if 'is_spike' in df_train_val.columns:
    regime_train[df_train_val['is_spike'] == 1] = 2

# Holdout labels follow the same rules; left as None when there is no holdout.
# NOTE(review): this branch dereferences df_holdout whenever HAS_HOLDOUT is
# true — it assumes HAS_HOLDOUT implies df_holdout is not None; confirm.
regime_holdout = None
if HAS_HOLDOUT:
    regime_holdout = pd.Series(0, index=df_holdout.index)
    if 'gas_zscore_1h' in df_holdout.columns:
        regime_holdout[df_holdout['gas_zscore_1h'] > 1] = 1
    if 'is_spike' in df_holdout.columns:
        regime_holdout[df_holdout['is_spike'] == 1] = 2

print(f"Regime distribution (train): {dict(regime_train.value_counts().sort_index())}")
if regime_holdout is not None:
    print(f"Regime distribution (holdout): {dict(regime_holdout.value_counts().sort_index())}")

# === ENSEMBLE PREDICTION FUNCTION ===
def create_ensemble_predictor(global_model, global_scaler, regime_models, features):
    """Build a prediction closure that blends regime-specific and global models.

    The returned ``predict(X, current_regime=None)`` always evaluates the
    global model. When ``current_regime`` names a regime that has its own
    model in ``regime_models``, the result is a 70% / 30% blend of the
    regime-specific and global predictions; otherwise the global prediction
    is returned as-is. ``features`` is accepted but not used by the closure.
    """
    def predict(X, current_regime=None):
        base_pred = global_model.predict(global_scaler.transform(X))

        has_specialist = (
            current_regime is not None
            and bool(regime_models)
            and current_regime in regime_models
        )
        if not has_specialist:
            return base_pred

        specialist = regime_models[current_regime]
        specialist_pred = specialist['model'].predict(
            specialist['scaler'].transform(X)
        )
        # Weighted average: 70% regime, 30% global
        return 0.7 * specialist_pred + 0.3 * base_pred

    return predict

# === 1H MODEL ===
# Trains the global 1h model, then (optionally) per-regime models and an
# ensemble predictor that blends them with the global model.
print("\n" + "="*70)
print("1-HOUR MODEL")
print("="*70)

# Without a dedicated holdout the last 1000 rows of the training data are
# reused as "holdout"; those rows are also part of X_1h / y_1h passed to
# training below, so the holdout score is in-sample in that case.
X_holdout_1h = df_holdout[numeric_features_1h] if HAS_HOLDOUT else X_1h.iloc[-1000:]
y_holdout_1h = df_holdout['target_1h'] if HAS_HOLDOUT else y_1h.iloc[-1000:]
# Drop holdout rows whose target is missing.
mask_1h = y_holdout_1h.notna()
X_holdout_1h = X_holdout_1h[mask_1h]
y_holdout_1h = y_holdout_1h[mask_1h]

print_distribution_diagnostics(y_1h, y_holdout_1h, "1h targets")

# Optional baseline measured on the holdout; None when not available.
holdout_baseline_1h = BASELINES['1h'].get('holdout_best', None)

model_1h, scaler_1h, metrics_1h, importance_1h = train_model_with_holdout(
    X_1h, y_1h, X_holdout_1h, y_holdout_1h,
    BASELINES['1h']['best'], '1h', numeric_features_1h, holdout_baseline_1h
)
if model_1h:
    trained_models['1h'] = {
        'model': model_1h, 'scaler': scaler_1h, 
        'metrics': metrics_1h, 'features': numeric_features_1h
    }
    if importance_1h:
        all_feature_importance['1h'] = importance_1h

# Train regime-specific models
if TRAIN_REGIME_MODELS and regime_holdout is not None:
    # Apply the same missing-target filter to the holdout regime labels so
    # they stay aligned with X_holdout_1h / y_holdout_1h.
    regime_holdout_1h = regime_holdout[mask_1h]
    regime_models_1h = train_regime_models(
        X_1h, y_1h, X_holdout_1h, y_holdout_1h,
        regime_train, regime_holdout_1h,
        BASELINES['1h']['best'], '1h', numeric_features_1h, holdout_baseline_1h
    )
    if regime_models_1h:
        regime_specific_models['1h'] = regime_models_1h
        # Create ensemble predictor
        # NOTE(review): assumes trained_models['1h'] was created above
        # (i.e. model_1h was truthy); otherwise this raises KeyError.
        trained_models['1h']['ensemble_predict'] = create_ensemble_predictor(
            model_1h, scaler_1h, regime_models_1h, numeric_features_1h
        )

# === 4H MODEL ===
# Same procedure as the 1h section, applied to the 4h target and features.
print("\n" + "="*70)
print("4-HOUR MODEL")
print("="*70)

# Without a dedicated holdout the last 1000 training rows double as
# "holdout" (and are still part of the training data passed below).
X_holdout_4h = df_holdout[numeric_features_4h] if HAS_HOLDOUT else X_4h.iloc[-1000:]
y_holdout_4h = df_holdout['target_4h'] if HAS_HOLDOUT else y_4h.iloc[-1000:]
# Drop holdout rows whose target is missing.
mask_4h = y_holdout_4h.notna()
X_holdout_4h = X_holdout_4h[mask_4h]
y_holdout_4h = y_holdout_4h[mask_4h]

print_distribution_diagnostics(y_4h, y_holdout_4h, "4h targets")

# Optional baseline measured on the holdout; None when not available.
holdout_baseline_4h = BASELINES['4h'].get('holdout_best', None)

model_4h, scaler_4h, metrics_4h, importance_4h = train_model_with_holdout(
    X_4h, y_4h, X_holdout_4h, y_holdout_4h,
    BASELINES['4h']['best'], '4h', numeric_features_4h, holdout_baseline_4h
)
if model_4h:
    trained_models['4h'] = {
        'model': model_4h, 'scaler': scaler_4h,
        'metrics': metrics_4h, 'features': numeric_features_4h
    }
    if importance_4h:
        all_feature_importance['4h'] = importance_4h

if TRAIN_REGIME_MODELS and regime_holdout is not None:
    # Keep the holdout regime labels aligned with the filtered holdout rows.
    regime_holdout_4h = regime_holdout[mask_4h]
    regime_models_4h = train_regime_models(
        X_4h, y_4h, X_holdout_4h, y_holdout_4h,
        regime_train, regime_holdout_4h,
        BASELINES['4h']['best'], '4h', numeric_features_4h, holdout_baseline_4h
    )
    if regime_models_4h:
        regime_specific_models['4h'] = regime_models_4h
        # NOTE(review): assumes trained_models['4h'] was created above
        # (i.e. model_4h was truthy); otherwise this raises KeyError.
        trained_models['4h']['ensemble_predict'] = create_ensemble_predictor(
            model_4h, scaler_4h, regime_models_4h, numeric_features_4h
        )

# === 24H MODEL ===
# A dedicated 24h model is trained only when there are at least 30 days of
# data AND more than 100 usable holdout targets; otherwise the 4h model is
# registered under the '24h' key as a fallback.
print("\n" + "="*70)
print("24-HOUR MODEL")
print("="*70)

rph = 120  # records per hour (presumably 30-second rows — confirm against df_clean)
total_hours = len(df_clean) / rph
total_days = total_hours / 24
print(f"Total data: {total_days:.1f} days")

# Set to False only when a dedicated 24h training run is attempted.
use_4h_fallback = True

if total_days >= 30:
    mask_24h_train = y_24h.notna()
    X_24h_valid = X_24h[mask_24h_train]
    y_24h_valid = y_24h[mask_24h_train]

    if HAS_HOLDOUT:
        y_holdout_24h = df_holdout['target_24h']
        mask_24h_holdout = y_holdout_24h.notna()
        X_holdout_24h = df_holdout[numeric_features_24h][mask_24h_holdout]
        y_holdout_24h = y_holdout_24h[mask_24h_holdout]
    else:
        # No dedicated holdout: reuse the last 500 valid training rows.
        X_holdout_24h = X_24h_valid.iloc[-500:]
        y_holdout_24h = y_24h_valid.iloc[-500:]

    if len(y_holdout_24h) > 100:
        use_4h_fallback = False
        # Gate the 24h model against a 24h baseline when one has been
        # computed; the 4h baseline is kept only as a fallback so behaviour
        # is unchanged when BASELINES has no usable '24h' entry.
        baseline_24h = (BASELINES.get('24h') or {}).get('best') or BASELINES['4h']['best']
        model_24h, scaler_24h, metrics_24h, _ = train_model_with_holdout(
            X_24h_valid, y_24h_valid, X_holdout_24h, y_holdout_24h,
            baseline_24h, '24h', numeric_features_24h
        )
        if model_24h:
            trained_models['24h'] = {
                'model': model_24h, 'scaler': scaler_24h,
                'metrics': metrics_24h, 'features': numeric_features_24h,
                'is_fallback': False
            }
    else:
        print("⚠️ Using 4h model as 24h fallback")
else:
    print(f"⚠️ Using 4h model as 24h fallback ({total_days:.1f} days < 30)")

# Single place that registers the 4h model under the 24h key (previously
# duplicated in both fallback branches).
if use_4h_fallback and model_4h:
    trained_models['24h'] = {
        'model': model_4h, 'scaler': scaler_4h,
        'metrics': {'name': metrics_4h['name'] + ' (4h fallback)', 'mae': metrics_4h['mae'],
                    'improvement': metrics_4h['improvement'],
                    'vs_holdout_baseline': metrics_4h.get('vs_holdout_baseline'),
                    'passed_baseline': metrics_4h.get('passed_baseline', False)},
        'features': numeric_features_4h,
        'is_fallback': True
    }

# === SUMMARY ===
# One line per trained horizon, followed by the per-regime models (if any).
print(f"\n{'='*70}")
print("TRAINING SUMMARY")
print(f"{'='*70}")

for horizon, data in trained_models.items():
    m = data['metrics']
    # Check mark only when the metrics report a passed baseline gate.
    status = "✓" if m.get('passed_baseline', False) else "⚠"
    # 'is_fallback' here is the key on the horizon entry, not the one inside
    # the metrics dict.
    fallback = " (fallback)" if data.get('is_fallback') else ""
    
    # Show vs holdout baseline if available
    if m.get('vs_holdout_baseline') is not None:
        vs_baseline = f"{m['vs_holdout_baseline']*100:+.1f}% vs holdout baseline"
    else:
        vs_baseline = f"{m['improvement']*100:+.1f}% vs train baseline"
    
    has_ensemble = " [+ensemble]" if 'ensemble_predict' in data else ""
    print(f"{status} {horizon}: {m['name']}{fallback} | MAE: {m['mae']:.4f} | {vs_baseline}{has_ensemble}")

if regime_specific_models:
    print(f"\nRegime-specific models:")
    for horizon, regime_dict in regime_specific_models.items():
        for regime_val, regime_data in regime_dict.items():
            print(f"  {horizon}_{regime_data['regime_name']}: MAE={regime_data['metrics']['mae']:.4f}")

# Prefer the 4h importances, fall back to 1h, else an empty dict.
FEATURE_IMPORTANCE = all_feature_importance.get('4h', all_feature_importance.get('1h', {}))


In [None]:
# PREDICTION INTERVALS - WITH UNCERTAINTY SCALING
from sklearn.ensemble import GradientBoostingRegressor, IsolationForest

print("\n" + "="*60)
print("TRAINING PREDICTION INTERVALS (Conformal + Uncertainty Scaling)")
print("="*60)

# Result containers, all keyed by horizon ('1h', '4h', '24h'):
# horizon -> ({quantile level -> fitted quantile regressor}, scaler)
quantile_models = {}
# horizon -> dict returned by train_conformal_intervals
conformal_residuals = {}
# horizon -> dict returned by train_uncertainty_scaler
uncertainty_scalers = {}

def train_conformal_intervals(X, y, model, scaler, horizon, alpha=0.2):
    """Derive a symmetric prediction-interval half-width from calibration residuals.

    The last 20% of the rows (at least one row) form the calibration set. The
    already-fitted ``model`` is evaluated on it and the ``1 - alpha`` quantile
    of the absolute residuals is used as the interval half-width.

    Args:
        X: Feature DataFrame in time order (positional slicing is used).
        y: Target Series aligned positionally with ``X``.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``; applied before predicting.
        horizon: Horizon label; accepted for interface compatibility, unused.
        alpha: Miscoverage level; ``alpha=0.2`` targets an 80% interval.

    Returns:
        dict with ``quantile`` (half-width), ``residuals`` (absolute
        calibration residuals) and ``coverage_target`` (``1 - alpha``).

    NOTE(review): ``model`` is not refitted here. If it was trained on rows
    that include the calibration tail, the residuals are in-sample and the
    resulting interval will be too narrow — confirm against the caller.
    """
    # Guard against a calibration size of 0: `iloc[-0:]` is `iloc[0:]`, which
    # would silently select the WHOLE frame instead of an empty tail.
    cal_size = max(1, int(len(X) * 0.2))
    X_cal = X.iloc[-cal_size:]
    y_cal = y.iloc[-cal_size:]

    y_pred_cal = model.predict(scaler.transform(X_cal))

    residuals = np.abs(y_cal.values - y_pred_cal)
    q = np.quantile(residuals, 1 - alpha)

    return {
        'quantile': q,
        'residuals': residuals,
        'coverage_target': 1 - alpha
    }

def train_uncertainty_scaler(X_train, scaler, residuals, features):
    """Fit the artefacts used to widen prediction intervals at inference time.

    Produces:
      * an Isolation Forest fitted on the scaled training features, used to
        flag out-of-distribution samples,
      * per-feature mean / std of the scaled training features, used for a
        z-score distance check,
      * the 80th percentile of ``residuals`` as the baseline interval width.

    Returns a dict with keys ``iso_forest``, ``feature_means``,
    ``feature_stds``, ``base_interval`` and ``features``.
    """
    scaled = scaler.transform(X_train)

    # Out-of-distribution detector
    detector = IsolationForest(
        n_estimators=50,
        contamination=0.1,
        random_state=42,
        n_jobs=-1,
    )
    detector.fit(scaled)

    # Small epsilon keeps later divisions by the std finite for constant columns.
    column_means = scaled.mean(axis=0)
    column_stds = scaled.std(axis=0) + 1e-8

    artefacts = {'iso_forest': detector}
    artefacts['feature_means'] = column_means
    artefacts['feature_stds'] = column_stds
    artefacts['base_interval'] = np.quantile(residuals, 0.8)
    artefacts['features'] = features
    return artefacts

def calculate_uncertainty_multiplier(X_sample, uncertainty_scaler, current_volatility=None):
    """Return the factor by which a prediction interval should be widened.

    Three independent factors are multiplied together:
      1. Isolation-Forest score below -0.1: ``1 + |score|``, capped at 2.
      2. Largest per-feature |z-score| above 3: +20% per unit beyond 3,
         capped at 2.
      3. Regime: x1.5 for ``current_volatility == 2``, x1.2 for ``== 1``.

    The product is capped at 3.0; with no factor triggered it is 1.0.
    ``X_sample`` is a single, already-scaled feature vector.
    """
    # 1. Out-of-distribution factor (negative decision score = anomaly)
    detector = uncertainty_scaler['iso_forest']
    anomaly_score = detector.decision_function(X_sample.reshape(1, -1))[0]
    ood_factor = min(1 + abs(anomaly_score), 2.0) if anomaly_score < -0.1 else 1.0

    # 2. Distance-from-training-distribution factor
    deviation = X_sample - uncertainty_scaler['feature_means']
    worst_z = np.max(np.abs(deviation / uncertainty_scaler['feature_stds']))
    distance_factor = min(1 + (worst_z - 3) * 0.2, 2.0) if worst_z > 3 else 1.0

    # 3. Volatility-regime factor (only when a regime is supplied)
    regime_factor = 1.0
    if current_volatility is not None:
        if current_volatility == 2:
            regime_factor = 1.5
        elif current_volatility == 1:
            regime_factor = 1.2

    combined = 1.0
    combined *= ood_factor
    combined *= distance_factor
    combined *= regime_factor
    return min(combined, 3.0)

# For each horizon with a trained point model: fit quantile regressors,
# derive a conformal half-width, fit the uncertainty scaler, and report the
# empirical coverage of each interval type.
for horizon in ['1h', '4h']:
    if horizon not in trained_models:
        continue

    print(f"\n{horizon} prediction intervals...")

    data = trained_models[horizon]
    features = data['features']

    X_h = df_train_val[features]
    y_h = df_train_val[f'target_{horizon}']

    # Keep only rows with a known target.
    mask = y_h.notna()
    X_h = X_h[mask]
    y_h = y_h[mask]

    if len(X_h) < 1000:
        print(f"  ⚠️ Insufficient data for {horizon} intervals, skipping")
        continue

    # Chronological 80/20 split (no shuffling).
    split_idx = int(len(X_h) * 0.8)
    X_train, X_test = X_h.iloc[:split_idx], X_h.iloc[split_idx:]
    y_train, y_test = y_h.iloc[:split_idx], y_h.iloc[split_idx:]

    # This scaler belongs to the quantile models / uncertainty scaler; the
    # point model keeps using its own data['scaler'].
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # === Quantile Regression ===
    q_models = {}
    for q in [0.1, 0.5, 0.9]:
        model = GradientBoostingRegressor(
            loss='quantile', alpha=q,
            n_estimators=50, max_depth=4,  # Reduced complexity
            learning_rate=0.1, random_state=42
        )
        model.fit(X_train_scaled, y_train)
        q_models[q] = model

    quantile_models[horizon] = (q_models, scaler)
    print(f"  ✓ Quantile models trained (10th, 50th, 90th percentiles)")

    # === Conformal Prediction ===
    conformal = train_conformal_intervals(X_h, y_h, data['model'], data['scaler'], horizon, alpha=0.2)
    conformal_residuals[horizon] = conformal
    print(f"  ✓ Conformal interval: ±{conformal['quantile']:.4f} gwei (80% coverage)")

    # === Uncertainty Scaler ===
    unc_scaler = train_uncertainty_scaler(X_train, scaler, conformal['residuals'], features)
    uncertainty_scalers[horizon] = unc_scaler
    print(f"  ✓ Uncertainty scaler trained (OOD detection + volatility scaling)")

    # === Calibration Check with Uncertainty Scaling ===
    print(f"  Calibration check...")

    y_pred_test = data['model'].predict(data['scaler'].transform(X_test))
    y_test_values = y_test.values

    # Standard quantile interval coverage
    q_low = q_models[0.1].predict(X_test_scaled)
    q_high = q_models[0.9].predict(X_test_scaled)
    q_coverage = np.mean((y_test_values >= q_low) & (y_test_values <= q_high))

    # Standard conformal interval coverage
    # NOTE(review): X_test is the last ~20% of X_h, i.e. essentially the same
    # rows train_conformal_intervals used for calibration, so this coverage is
    # close to 80% by construction rather than an independent check.
    conf_low = y_pred_test - conformal['quantile']
    conf_high = y_pred_test + conformal['quantile']
    conf_coverage = np.mean((y_test_values >= conf_low) & (y_test_values <= conf_high))

    # Scaled conformal interval coverage (with uncertainty multipliers).
    # Iterate the scaled array directly: the multiplier only needs the scaled
    # row, so building a pandas Series per row via iterrows() was wasted work.
    scaled_coverages = []
    for i, x_scaled in enumerate(X_test_scaled):
        multiplier = calculate_uncertainty_multiplier(x_scaled, unc_scaler)
        scaled_interval = conformal['quantile'] * multiplier
        in_interval = (y_test_values[i] >= y_pred_test[i] - scaled_interval) and \
                      (y_test_values[i] <= y_pred_test[i] + scaled_interval)
        scaled_coverages.append(in_interval)
    scaled_coverage = np.mean(scaled_coverages)

    print(f"    Quantile 80% interval: actual coverage = {q_coverage:.1%}")
    print(f"    Conformal 80% interval: actual coverage = {conf_coverage:.1%}")
    print(f"    Scaled conformal interval: actual coverage = {scaled_coverage:.1%}")

    # Store calibration results
    trained_models[horizon]['calibration'] = {
        'quantile_coverage': q_coverage,
        'conformal_coverage': conf_coverage,
        'scaled_coverage': scaled_coverage,
        'conformal_width': conformal['quantile']
    }

    # Flag coverage that is more than 10 percentage points off the 80% target.
    if abs(q_coverage - 0.8) > 0.1:
        print(f"    ⚠️ Quantile intervals may be miscalibrated")
    if abs(conf_coverage - 0.8) > 0.1:
        print(f"    ⚠️ Conformal intervals may need recalibration")

# Copy 4h to 24h if available (no dedicated 24h interval models are trained)
if '4h' in quantile_models:
    quantile_models['24h'] = quantile_models['4h']
    print("\n24h: Using 4h quantile models")

if '4h' in conformal_residuals:
    conformal_residuals['24h'] = conformal_residuals['4h']

if '4h' in uncertainty_scalers:
    uncertainty_scalers['24h'] = uncertainty_scalers['4h']

print(f"\n✓ Prediction intervals with uncertainty scaling ready for: {list(quantile_models.keys())}")


In [None]:
# Direction Prediction - IMPROVED
# Changes: Binary up/down, class weights, holdout evaluation, adaptive threshold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

print("\n" + "="*60)
print("TRAINING DIRECTION MODELS (IMPROVED)")
print("="*60)

# horizon -> {'model', 'scaler', accuracy metrics, label configuration}
direction_models = {}

# Configuration
USE_BINARY = True  # Binary (up/down) vs 3-class (down/stable/up)
DIRECTION_THRESHOLD = 0.01  # 1% threshold for direction change

def create_binary_direction(target, current, threshold=0.01):
    """Label each row 1 when the relative move exceeds ``threshold``, else 0.

    The relative move is ``(target - current) / current``; a tiny epsilon is
    added to the denominator to avoid division by zero. Rows that fall, stay
    flat, or have a missing value all get label 0.
    """
    denominator = current + 1e-8
    relative_move = (target - current) / denominator
    moved_up = relative_move > threshold
    return moved_up.astype(int)

def create_ternary_direction(target, current, threshold=0.02):
    """Label each row 'up', 'down' or 'stable' from its relative move.

    A move above ``+threshold`` is 'up', below ``-threshold`` is 'down', and
    everything else (including missing values) stays 'stable'. The result is
    a Series indexed like ``target``.
    """
    relative_move = (target - current) / (current + 1e-8)
    rising = relative_move > threshold
    falling = relative_move < -threshold

    labels = pd.Series('stable', index=target.index)
    labels.loc[rising] = 'up'
    labels.loc[falling] = 'down'
    return labels

# For each horizon: build direction labels, fit several classifiers, keep
# the most accurate one on the evaluation set and compare it with a
# majority-class baseline.
for horizon in ['1h', '4h']:
    print(f"\n{'='*50}")
    print(f"{horizon.upper()} DIRECTION MODEL")
    print(f"{'='*50}")

    # Get features and targets
    X_h = X_1h if horizon == '1h' else X_4h
    features = numeric_features_1h if horizon == '1h' else numeric_features_4h

    # Get raw target for direction calculation. The check is made for THIS
    # horizon's raw column: testing 'target_1h_raw' for every horizon would
    # raise KeyError whenever only the 1h raw column exists.
    raw_col = f'target_{horizon}_raw'
    if raw_col in df_train_val.columns:
        target_raw = df_train_val[raw_col]
    else:
        target_raw = df_train_val[f'target_{horizon}']

    current = df_train_val['gas']

    # Create direction labels. DIRECTION_THRESHOLD is passed to both
    # labellers so the printed / stored threshold is the one actually used.
    if USE_BINARY:
        y_dir = create_binary_direction(target_raw, current, DIRECTION_THRESHOLD)
        print(f"Binary classification (threshold: {DIRECTION_THRESHOLD*100}%)")
    else:
        y_dir = create_ternary_direction(target_raw, current, DIRECTION_THRESHOLD)
        print(f"3-class classification (threshold: {DIRECTION_THRESHOLD*100}%)")

    # Rows with a missing target get a default label from the labellers, so
    # they must be excluded explicitly.
    mask = y_dir.notna() & target_raw.notna()
    X_d = X_h[mask]
    y_d = y_dir[mask]

    if len(X_d) < 1000:
        print(f"  ⚠️ Insufficient data, skipping")
        continue

    # Class distribution
    class_counts = y_d.value_counts()
    print(f"Class distribution: {dict(class_counts)}")

    # Compute class weights (informational only: the classifiers below use
    # class_weight='balanced' themselves, and the GBM takes no class weights)
    classes = np.unique(y_d)
    weights = compute_class_weight('balanced', classes=classes, y=y_d)
    class_weight_dict = dict(zip(classes, weights))
    print(f"Class weights: {class_weight_dict}")

    # Split - use holdout if available
    if HAS_HOLDOUT:
        X_train, X_test = X_d, df_holdout[features]
        y_train = y_d

        # Create holdout labels (same per-horizon raw-column rule as above)
        if raw_col in df_holdout.columns:
            holdout_target = df_holdout[raw_col]
        else:
            holdout_target = df_holdout[f'target_{horizon}']
        holdout_current = df_holdout['gas']

        if USE_BINARY:
            y_test = create_binary_direction(holdout_target, holdout_current, DIRECTION_THRESHOLD)
        else:
            y_test = create_ternary_direction(holdout_target, holdout_current, DIRECTION_THRESHOLD)

        test_mask = y_test.notna() & holdout_target.notna()
        X_test = X_test[test_mask]
        y_test = y_test[test_mask]
        print(f"Using holdout for evaluation ({len(y_test)} samples)")
    else:
        # Chronological 80/20 split of the training data.
        split_idx = int(len(X_d) * 0.8)
        X_train, X_test = X_d.iloc[:split_idx], X_d.iloc[split_idx:]
        y_train, y_test = y_d.iloc[:split_idx], y_d.iloc[split_idx:]

    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Try multiple classifiers
    classifiers = [
        ('LogReg', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
        ('RF', RandomForestClassifier(n_estimators=30, max_depth=4, class_weight='balanced', random_state=42, n_jobs=-1)),
        ('GBM', GradientBoostingClassifier(n_estimators=30, max_depth=3, learning_rate=0.1, random_state=42)),
    ]

    best_clf = None
    best_acc = 0
    best_name = None
    best_f1 = 0.0  # initialised so it never leaks over from a previous horizon

    for name, clf in classifiers:
        try:
            clf.fit(X_train_scaled, y_train)
            y_pred = clf.predict(X_test_scaled)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            print(f"  {name}: Acc={acc:.1%}, F1={f1:.3f}")

            if acc > best_acc:
                best_acc = acc
                best_clf = clf
                best_name = name
                best_f1 = f1
        except Exception as e:
            # Best effort: report the failure and try the next classifier.
            print(f"  {name}: Failed - {e}")

    if best_clf is None:
        print(f"  ⚠️ All classifiers failed")
        continue

    # Baseline: always predict majority class (of the training labels)
    majority_class = y_train.mode()[0]
    baseline_acc = (y_test == majority_class).mean()
    # Relative improvement in percent.
    improvement = (best_acc - baseline_acc) / baseline_acc * 100

    print(f"\n  >>> Best: {best_name} (Acc: {best_acc:.1%}, vs baseline {baseline_acc:.1%}: {improvement:+.1f}%)")

    direction_models[horizon] = {
        'model': best_clf,
        'scaler': scaler,
        'accuracy': float(best_acc),
        'f1_score': float(best_f1),
        'baseline_accuracy': float(baseline_acc),
        'improvement_vs_baseline': float(improvement),
        'model_name': best_name,
        'is_binary': USE_BINARY,
        'threshold': DIRECTION_THRESHOLD
    }

# Summary
# One line per trained direction model. Status symbol: check mark for more
# than +5% relative improvement over the majority-class baseline, warning
# sign for any positive improvement, cross otherwise.
print(f"\n{'='*60}")
print("DIRECTION MODEL SUMMARY")
print(f"{'='*60}")
for horizon, data in direction_models.items():
    imp = data['improvement_vs_baseline']
    status = "✓" if imp > 5 else "⚠" if imp > 0 else "✗"
    print(f"{status} {horizon}: {data['model_name']} | Acc: {data['accuracy']:.1%} | vs baseline: {imp:+.1f}%")


In [None]:
# REGIME DETECTION
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("\n" + "="*60)
print("TRAINING REGIME DETECTION MODEL")
print("="*60)

# Create regime labels from gas statistics (instead of volatility_regime)
# 0 = Normal, 1 = Elevated, 2 = Spike
# Outcome of this cell: regime_clf / regime_scaler / regime_accuracy are
# always defined afterwards (None / None / 0 when training was not possible).
if 'gas_zscore_1h' in df_train_val.columns and 'is_spike' in df_train_val.columns:
    # Labels are assigned in order, later rules overriding earlier ones:
    # default Normal (0); z-score > 1 -> Elevated (1); is_spike == 1 -> Spike (2).
    zscore = df_train_val['gas_zscore_1h']
    is_spike = df_train_val['is_spike']
    
    regime_labels = pd.Series(0, index=df_train_val.index)  # Default: Normal
    regime_labels[zscore > 1] = 1  # Elevated
    regime_labels[is_spike == 1] = 2  # Spike
    
    # NOTE(review): X_4h and regime_labels are split by position below, which
    # assumes X_4h has exactly the rows of df_train_val in the same order.
    # NOTE(review): if X_4h contains gas_zscore_1h / is_spike themselves, the
    # classifier can read the label directly from its inputs — confirm.
    X_r = X_4h.copy()
    y_r = regime_labels
    
    if len(X_r) < 500:
        print("⚠️ Insufficient data for regime detection")
        regime_clf = None
        regime_scaler = None
        regime_accuracy = 0
    else:
        # Train/test split (chronological 80/20, no shuffling)
        split_idx = int(len(X_r) * 0.8)
        X_train, X_test = X_r.iloc[:split_idx], X_r.iloc[split_idx:]
        y_train, y_test = y_r.iloc[:split_idx], y_r.iloc[split_idx:]
        
        regime_scaler = RobustScaler()
        X_train_scaled = regime_scaler.fit_transform(X_train)
        X_test_scaled = regime_scaler.transform(X_test)
        
        # Train classifier (simple, reduced complexity)
        regime_clf = RandomForestClassifier(
            n_estimators=30, max_depth=4,
            min_samples_leaf=20,
            random_state=42, n_jobs=-1
        )
        regime_clf.fit(X_train_scaled, y_train)
        
        # Evaluate
        y_pred = regime_clf.predict(X_test_scaled)
        regime_accuracy = accuracy_score(y_test, y_pred)
        
        print(f"Regime classes: Normal (0), Elevated (1), Spike (2)")
        print(f"Class distribution: {dict(y_r.value_counts().sort_index())}")
        print(f"Accuracy: {regime_accuracy:.1%}")
        
        if regime_accuracy > 0.95:
            print("⚠️ Warning: Very high accuracy may indicate class imbalance or overfitting")
else:
    regime_clf = None
    regime_scaler = None
    regime_accuracy = 0
    print("⚠️ Missing gas_zscore_1h or is_spike, skipping regime detection")


In [None]:
# Train Spike Detectors
#
# For each horizon a binary classifier is trained to flag upcoming upward price
# moves. Reads X_1h/X_4h, y_1h/y_4h and current_gas from earlier cells and
# fills spike_models with {horizon: (classifier, scaler)}.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight

print("\n" + "="*60)
print("TRAINING SPIKE DETECTORS")
print("="*60)

spike_models = {}

for horizon, X_h, y_target in [('1h', X_1h, y_1h), ('4h', X_4h, y_4h)]:
    print(f"\n{horizon} spike detector...")

    # Keep only rows that have a target for this horizon
    mask = y_target.notna()
    X_s = X_h[mask]
    y_s = y_target[mask]
    current = current_gas[mask]

    # A spike is an upward move (target minus current price) larger than
    # two standard deviations of that move
    price_change = y_s - current
    threshold = price_change.std() * 2
    spike_labels = (price_change > threshold).astype(int)

    spike_rate = spike_labels.mean()
    print(f"  Spike rate: {spike_rate:.1%}")

    if spike_rate < 0.01 or spike_rate > 0.5:
        print(f"  ⚠️ Unusual spike rate, skipping")
        continue

    if len(X_s) < 1000:
        print(f"  ⚠️ Insufficient data, skipping")
        continue

    # Positional 80/20 split: first 80% of rows train, last 20% evaluate
    split_idx = int(len(X_s) * 0.8)
    X_train, X_test = X_s.iloc[:split_idx], X_s.iloc[split_idx:]
    y_train, y_test = spike_labels.iloc[:split_idx], spike_labels.iloc[split_idx:]

    # The overall spike rate can be fine while all spikes sit in the test
    # tail; a classifier cannot be fitted on a single class
    if y_train.nunique() < 2:
        print(f"  ⚠️ Training split contains a single class, skipping")
        continue

    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train with class weights: GradientBoostingClassifier has no class_weight
    # option, so balanced per-sample weights are passed to fit() instead
    clf = GradientBoostingClassifier(
        n_estimators=50, max_depth=4,
        learning_rate=0.1, random_state=42
    )
    sample_weight = compute_sample_weight('balanced', y_train)
    clf.fit(X_train_scaled, y_train, sample_weight=sample_weight)

    # Evaluate
    y_pred = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)

    spike_models[horizon] = (clf, scaler)
    print(f"  Accuracy: {acc:.1%}")

    # Accuracy is dominated by the majority class; also report how many of
    # the actual spikes in the test tail were caught
    spike_mask = (y_test == 1).values
    if spike_mask.any():
        spike_recall = (y_pred[spike_mask] == 1).mean()
        print(f"  Spike recall: {spike_recall:.1%}")

# Copy 4h to 24h if available
if '4h' in spike_models:
    spike_models['24h'] = spike_models['4h']
    print("\n24h: Using 4h spike detector (fallback)")

print(f"\n✓ Spike detectors trained for: {list(spike_models.keys())}")

In [None]:
# DQN AGENT TRAINING (OPTIONAL)
# Switch for the reinforcement-learning (transaction timing) agent.
# Leave it off when only the prediction models are needed.

TRAIN_DQN = False  # Set to True to train DQN agent

if not TRAIN_DQN:
    # Record the skip right away and announce it between two rules
    DQN_TRAINED = False
    _dqn_rule = "=" * 60
    print(
        _dqn_rule,
        "DQN TRAINING SKIPPED (set TRAIN_DQN = True to enable)",
        _dqn_rule,
        sep="\n",
    )

In [None]:
# DQN Training Implementation (runs only if TRAIN_DQN = True)
#
# Trains a small Deep Q-Network that chooses between "wait" and "execute".
# Reads TRAIN_DQN, X and current_gas from earlier cells. DQN_TRAINED is always
# set; DQN_AGENT and DQN_METRICS are set only when training succeeds.

if TRAIN_DQN:
    print("\n" + "="*60)
    print("TRAINING DQN AGENT")
    print("="*60)

    try:
        import torch
        import torch.nn as nn
        import torch.optim as optim
        from collections import deque
        import random

        class DQNNetwork(nn.Module):
            """Three-layer MLP mapping a state vector to one Q-value per action."""

            def __init__(self, state_dim, action_dim):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(state_dim, 64),
                    nn.ReLU(),
                    nn.Linear(64, 32),
                    nn.ReLU(),
                    nn.Linear(32, action_dim)
                )

            def forward(self, x):
                return self.net(x)

        class DQNAgent:
            """Epsilon-greedy DQN agent with a replay buffer and a target network."""

            def __init__(self, state_dim, action_dim):
                self.state_dim = state_dim
                self.action_dim = action_dim
                self.epsilon = 1.0
                self.epsilon_min = 0.05
                self.epsilon_decay = 0.995
                self.gamma = 0.99
                self.lr = 0.001
                self.memory = deque(maxlen=10000)
                self.batch_size = 32
                self.training_steps = 0

                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                self.model = DQNNetwork(state_dim, action_dim).to(self.device)
                self.target_model = DQNNetwork(state_dim, action_dim).to(self.device)
                self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
                self.loss_fn = nn.MSELoss()
                self.update_target()

            def update_target(self):
                """Copy the online network's weights into the target network."""
                self.target_model.load_state_dict(self.model.state_dict())

            def act(self, state):
                """Return a random action with probability epsilon, else the greedy one."""
                if random.random() < self.epsilon:
                    return random.randint(0, self.action_dim - 1)
                state_t = torch.as_tensor(
                    np.asarray(state, dtype=np.float32), device=self.device
                ).unsqueeze(0)
                with torch.no_grad():
                    q_values = self.model(state_t)
                return q_values.argmax().item()

            def remember(self, state, action, reward, next_state, done):
                """Append one transition to the replay buffer."""
                self.memory.append((state, action, reward, next_state, done))

            def replay(self):
                """Run one gradient step on a random minibatch; no-op until the buffer holds a batch."""
                if len(self.memory) < self.batch_size:
                    return

                batch = random.sample(self.memory, self.batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Stack into single ndarrays first: building a tensor straight
                # from a tuple of arrays is very slow
                states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
                actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
                rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
                next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=self.device)
                dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

                # Q(s, a) for the actions actually taken
                current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                # Bootstrapped target from the frozen target network
                with torch.no_grad():
                    next_q = self.target_model(next_states).max(1)[0]
                    target_q = rewards + (1 - dones) * self.gamma * next_q

                loss = self.loss_fn(current_q, target_q)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Refresh the target network every 100 gradient steps
                self.training_steps += 1
                if self.training_steps % 100 == 0:
                    self.update_target()

                self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            def save(self, path):
                """Write the online network's state dict to *path*."""
                torch.save(self.model.state_dict(), path)

        # Create simple environment
        state_dim = min(30, len(X.columns))  # Limit state size
        action_dim = 2  # 0 = wait, 1 = execute

        DQN_AGENT = DQNAgent(state_dim, action_dim)

        # Train for a few episodes
        n_episodes = 500
        print(f"Training DQN for {n_episodes} episodes...")

        # Each episode walks a window of up to 100 consecutive rows. The window
        # start is drawn at random so the agent sees the whole dataset rather
        # than replaying the same first 100 rows in every episode.
        n_rows = min(len(X), len(current_gas))
        steps_per_episode = max(0, min(100, n_rows - 1))
        max_start = max(0, n_rows - 1 - steps_per_episode)

        for episode in range(n_episodes):
            start = random.randint(0, max_start)
            for t in range(steps_per_episode):
                i = start + t  # i + 1 <= n_rows - 1 by construction
                state = X.iloc[i, :state_dim].values
                action = DQN_AGENT.act(state)

                # Simple reward: negative gas price change if executing
                # NOTE(review): this penalises executing right before a price
                # rise, which looks inverted for a cost-minimising agent -
                # confirm the intended sign before relying on the policy.
                gas_now = current_gas.iloc[i]
                next_gas = current_gas.iloc[i + 1]
                reward = float(-(next_gas - gas_now)) if action == 1 else -0.001  # Small wait penalty

                next_state = X.iloc[i + 1, :state_dim].values
                done = (t == steps_per_episode - 1)

                DQN_AGENT.remember(state, action, reward, next_state, done)
                DQN_AGENT.replay()

            if (episode + 1) % 100 == 0:
                print(f"  Episode {episode + 1}/{n_episodes}, Epsilon: {DQN_AGENT.epsilon:.3f}")

        DQN_TRAINED = True
        DQN_METRICS = {
            'episodes': n_episodes,
            'training_steps': DQN_AGENT.training_steps,
            'final_epsilon': float(DQN_AGENT.epsilon)
        }
        print(f"\n✓ DQN training complete ({DQN_AGENT.training_steps} steps)")

    except ImportError:
        print("⚠️ PyTorch not available, skipping DQN training")
        DQN_TRAINED = False
    except Exception as e:
        # Best-effort: the DQN agent is optional, so a failure here must not
        # stop the notebook from saving the prediction models
        print(f"⚠️ DQN training failed: {e}")
        DQN_TRAINED = False
else:
    DQN_TRAINED = False

In [None]:
# Save all models + MODEL COMPARISON REPORT
import os
from datetime import datetime
import json as json_lib

os.makedirs('saved_models', exist_ok=True)

print("\n" + "="*60)
print("SAVING MODELS")
print("="*60)

# === Save prediction models ===
# For every trained horizon, write a bundle (model, scaler, metrics, feature
# names) to model_<horizon>.pkl and the bare scaler to scaler_<horizon>.pkl.
for horizon in ['1h', '4h', '24h']:
    if horizon not in trained_models:
        print(f"⚠️ No {horizon} model to save")
        continue

    data = trained_models[horizon]
    model = data['model']
    scaler = data['scaler']
    metrics = data['metrics']
    features = data.get('features', [])

    # Compare against None explicitly: a delta of exactly 0.0 is a real result
    # and must not be stored or printed as "missing"
    vs_holdout = metrics.get('vs_holdout_baseline')

    model_data = {
        'model': model,
        'model_name': metrics['name'],
        'metrics': {
            'mae': float(metrics['mae']),
            'improvement': float(metrics['improvement']),
            'vs_holdout_baseline': float(vs_holdout) if vs_holdout is not None else None,
            'passed_baseline': bool(metrics.get('passed_baseline', False)),
            'is_fallback': data.get('is_fallback', False)
        },
        'trained_at': datetime.now().isoformat(),
        'feature_names': list(features),
        'feature_scaler': scaler,
        'scaler_type': 'RobustScaler',
        'target_transform': TARGET_TRANSFORM_USED if 'TARGET_TRANSFORM_USED' in dir() else 'none'
    }

    # Optional extras, present only when the corresponding cells were run
    if 'cv_mae' in metrics:
        model_data['cv_mae'] = float(metrics['cv_mae'])

    if 'conformal_residuals' in dir() and horizon in conformal_residuals:
        model_data['conformal_interval'] = float(conformal_residuals[horizon]['quantile'])

    if 'uncertainty_scalers' in dir() and horizon in uncertainty_scalers:
        model_data['uncertainty_scaler'] = uncertainty_scalers[horizon]

    joblib.dump(model_data, f'saved_models/model_{horizon}.pkl')

    vs_info = f"vs holdout: {vs_holdout*100:+.1f}%" if vs_holdout is not None else ""
    print(f"✓ model_{horizon}.pkl ({metrics['name']}, MAE={metrics['mae']:.4f} {vs_info})")

    # The scaler is also written on its own, next to the bundle
    joblib.dump(scaler, f'saved_models/scaler_{horizon}.pkl')

# === Save regime-specific models ===
# One pickle per (horizon, regime) pair, written under saved_models/regime_models/
if 'regime_specific_models' in globals() and regime_specific_models:
    regime_dir = 'saved_models/regime_models'
    os.makedirs(regime_dir, exist_ok=True)
    for horizon, regime_dict in regime_specific_models.items():
        for regime_val, regime_data in regime_dict.items():
            regime_name = regime_data['regime_name']
            regime_model_data = {
                'model': regime_data['model'],
                'scaler': regime_data['scaler'],
                'metrics': regime_data['metrics'],
                'regime_name': regime_name,
                'regime_val': regime_val,
                'n_samples': regime_data['n_samples'],
                'trained_at': datetime.now().isoformat()
            }
            filename = f'{regime_dir}/model_{horizon}_{regime_name}.pkl'
            joblib.dump(regime_model_data, filename)
            print(f"  → {horizon}_{regime_name} regime model")

# === Save feature names ===
# The 4h model's feature list is the default; the 1h list is the fallback
_features_fallback = trained_models.get('1h', {})
default_features = trained_models.get('4h', _features_fallback).get('features', [])
joblib.dump(list(default_features), 'saved_models/feature_names.pkl')
print(f"\n✓ feature_names.pkl ({len(default_features)} features)")

# === Save other models ===
# Spike detectors: one {'model', 'scaler'} pickle per horizon
if 'spike_models' in globals() and spike_models:
    for horizon, (clf, scaler) in spike_models.items():
        spike_payload = {'model': clf, 'scaler': scaler}
        joblib.dump(spike_payload, f'saved_models/spike_detector_{horizon}.pkl')
        print(f"✓ spike_detector_{horizon}.pkl")

# Regime detector: classifier, its scaler and the accuracy measured at training
if 'regime_clf' in globals() and regime_clf is not None:
    regime_payload = {
        'model': regime_clf,
        'scaler': regime_scaler,
        'accuracy': regime_accuracy,
    }
    joblib.dump(regime_payload, 'saved_models/regime_detector.pkl')
    print("✓ regime_detector.pkl")

# Quantile models, plus conformal width / uncertainty scaler when available
if 'quantile_models' in globals() and quantile_models:
    for horizon, (q_models, q_scaler) in quantile_models.items():
        quantile_data = {'models': q_models, 'scaler': q_scaler, 'quantiles': [0.1, 0.5, 0.9]}
        if 'conformal_residuals' in globals() and horizon in conformal_residuals:
            interval_width = float(conformal_residuals[horizon]['quantile'])
            quantile_data['conformal'] = {'interval_width': interval_width}
        if 'uncertainty_scalers' in globals() and horizon in uncertainty_scalers:
            quantile_data['uncertainty_scaler'] = uncertainty_scalers[horizon]
        joblib.dump(quantile_data, f'saved_models/quantile_{horizon}.pkl')
        print(f"✓ quantile_{horizon}.pkl")

# === Save training metadata ===
def convert_to_python_types(obj):
    """Recursively replace NumPy values with plain Python ones for JSON output.

    Dicts, lists and tuples are walked recursively; tuples come back as lists,
    which is how JSON would serialise them anyway. NumPy scalar dict keys are
    converted as well, because ``json.dump`` rejects them.

    Args:
        obj: Any value, possibly nested.

    Returns:
        An equivalent structure in which NumPy booleans are ``bool``, NumPy
        integers ``int``, NumPy floats ``float`` and arrays nested lists.
        Other objects exposing ``.item()`` are replaced by its result;
        everything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_python_types(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_python_types(v) for v in obj]
    # Booleans first: they must stay true/false instead of becoming 1/0
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Catch-all for other scalar-like wrappers that expose .item()
    if hasattr(obj, 'item'):
        return obj.item()
    return obj

# Run-level summary written to training_metadata.json: data sizes, the
# configuration flags that were in effect, and per-model metrics.
metadata = {
    'training_timestamp': datetime.now().isoformat(),
    'total_samples': len(df_clean),
    'training_samples': len(df_train_val),
    'holdout_samples': len(df_holdout) if df_holdout is not None else 0,
    'date_range': f"{df_clean.index.min()} to {df_clean.index.max()}",
    'resampling': '30-second intervals',
    'selection_method': 'holdout-based',
    # Flags are optional: each falls back to a default when its cell was not run
    'configuration': {
        'target_transform': TARGET_TRANSFORM_USED if 'TARGET_TRANSFORM_USED' in dir() else 'none',
        'use_rolling_window': USE_ROLLING_WINDOW if 'USE_ROLLING_WINDOW' in dir() else False,
        'rolling_window_days': ROLLING_WINDOW_DAYS if 'ROLLING_WINDOW_DAYS' in dir() else None,
        'auto_adapt_on_shift': AUTO_ADAPT_ON_SHIFT if 'AUTO_ADAPT_ON_SHIFT' in dir() else False,
        'distribution_shift_detected': DISTRIBUTION_SHIFT_DETECTED if 'DISTRIBUTION_SHIFT_DETECTED' in dir() else False,
        'shift_magnitude': SHIFT_MAGNITUDE if 'SHIFT_MAGNITUDE' in dir() else 0
    },
    'features': {'count': len(default_features), 'list': list(default_features)},
    'baselines': BASELINES,
    'models': {},
    'regime_models': {},
    'direction_models': {}
}

for horizon, data in trained_models.items():
    m = data['metrics']
    # Compare against None explicitly: a delta of exactly 0.0 is a real result
    # and must not be written as null
    vs_holdout = m.get('vs_holdout_baseline')
    metadata['models'][horizon] = {
        'name': m['name'],
        'mae': float(m['mae']),
        'improvement_pct': float(m['improvement'] * 100),
        'vs_holdout_baseline_pct': float(vs_holdout * 100) if vs_holdout is not None else None,
        'passed_baseline': bool(m.get('passed_baseline', False)),
        'is_fallback': data.get('is_fallback', False)
    }
    if 'cv_mae' in m:
        metadata['models'][horizon]['cv_mae'] = float(m['cv_mae'])
    if 'calibration' in data:
        metadata['models'][horizon]['calibration'] = data['calibration']

if 'regime_specific_models' in dir() and regime_specific_models:
    for horizon, regime_dict in regime_specific_models.items():
        metadata['regime_models'][horizon] = {}
        for regime_val, regime_data in regime_dict.items():
            metadata['regime_models'][horizon][regime_data['regime_name']] = {
                'mae': float(regime_data['metrics']['mae']),
                'n_samples': int(regime_data['n_samples'])
            }

if 'direction_models' in dir() and direction_models:
    for horizon, data in direction_models.items():
        metadata['direction_models'][horizon] = {
            'accuracy': float(data['accuracy']),
            'f1_score': float(data['f1_score']),
            'baseline_accuracy': float(data.get('baseline_accuracy', 0)),
            'improvement_vs_baseline': float(data.get('improvement_vs_baseline', 0)),
            'model_name': data.get('model_name', 'GBM'),
            'is_binary': data.get('is_binary', False)
        }

# Replace NumPy values so json can serialise the structure
metadata = convert_to_python_types(metadata)

with open('saved_models/training_metadata.json', 'w') as f:
    json_lib.dump(metadata, f, indent=2)
print(f"\n✓ training_metadata.json")

# === Save feature importance ===
# Measured importances are written highest first. When none were computed,
# every default feature gets the same share instead.
if FEATURE_IMPORTANCE:
    sorted_importance = dict(
        sorted(FEATURE_IMPORTANCE.items(), key=lambda pair: pair[1], reverse=True)
    )
    _importance, _serialize, _suffix = sorted_importance, convert_to_python_types, ""
else:
    uniform = {name: 1.0/len(default_features) for name in default_features}
    _importance, _serialize, _suffix = uniform, (lambda payload: payload), " (uniform)"

with open('saved_models/feature_importance.json', 'w') as f:
    json_lib.dump(_serialize(_importance), f, indent=2)
print(f"✓ feature_importance.json{_suffix}")

# === MODEL COMPARISON REPORT ===
# Appends this run to saved_models/training_history.json (last 10 runs kept)
# and compares each horizon's MAE with the previous run, if there is one.
print(f"\n{'='*60}")
print("MODEL COMPARISON REPORT")
print(f"{'='*60}")

history_file = 'saved_models/training_history.json'
history = []
if os.path.exists(history_file):
    # The models are already on disk at this point, so an unreadable history
    # file must not abort the cell; start over with an empty history instead
    try:
        with open(history_file) as f:
            history = json_lib.load(f)
    except (OSError, ValueError) as exc:
        print(f"⚠️ Could not read {history_file} ({exc}); starting a new history")
        history = []
    if not isinstance(history, list):
        history = []

# Add current run to history
current_run = {
    'timestamp': datetime.now().isoformat(),
    'models': {},
    'config': metadata['configuration']
}
for horizon, data in trained_models.items():
    m = data['metrics']
    # Compare against None explicitly so that a delta of 0.0 is kept
    vs_holdout = m.get('vs_holdout_baseline')
    current_run['models'][horizon] = {
        'name': m['name'],
        'mae': float(m['mae']),
        'vs_holdout_baseline': float(vs_holdout) if vs_holdout is not None else None
    }

history.append(current_run)
# Keep last 10 runs
history = history[-10:]

with open(history_file, 'w') as f:
    json_lib.dump(history, f, indent=2)

# Compare with previous run
if len(history) >= 2:
    prev_run = history[-2]
    print(f"\nComparing with previous run ({prev_run['timestamp'][:16]}):")
    print(f"{'Horizon':<8} {'Prev MAE':<12} {'Curr MAE':<12} {'Change':<12} {'Recommendation'}")
    print("-" * 60)

    recommendations = []
    for horizon in ['1h', '4h', '24h']:
        if horizon in current_run['models'] and horizon in prev_run['models']:
            prev_mae = prev_run['models'][horizon]['mae']
            curr_mae = current_run['models'][horizon]['mae']
            # A previous MAE of 0 has no meaningful relative change; treat any
            # increase from it as unbounded degradation instead of dividing by zero
            if prev_mae:
                change = (curr_mae - prev_mae) / prev_mae * 100
            else:
                change = 0.0 if curr_mae == prev_mae else float('inf')

            # Better by more than 5% -> deploy; worse by more than 10% -> keep old
            if change < -5:
                rec = "✓ DEPLOY (improved)"
                recommendations.append(('deploy', horizon))
            elif change > 10:
                rec = "✗ KEEP OLD (degraded)"
                recommendations.append(('keep_old', horizon))
            else:
                rec = "~ SIMILAR"
                recommendations.append(('similar', horizon))

            print(f"{horizon:<8} {prev_mae:<12.4f} {curr_mae:<12.4f} {change:+.1f}%{'':<6} {rec}")

    # Overall recommendation
    deploy_count = sum(1 for r, _ in recommendations if r == 'deploy')
    keep_old_count = sum(1 for r, _ in recommendations if r == 'keep_old')

    print(f"\nOVERALL: ", end="")
    if deploy_count > keep_old_count:
        print("✓ RECOMMEND DEPLOYING new models")
    elif keep_old_count > 0:
        print("⚠️ Consider keeping old models (some degradation)")
    else:
        print("~ Models similar to previous run")
else:
    print("\nFirst run - no previous models to compare")

print(f"\n{'='*60}")
print("ALL MODELS SAVED")
print(f"{'='*60}")


In [None]:
# Print final report
# Console summary of the run: data volume, per-horizon metrics, interval
# calibration, direction models, 24h status and a deployment recommendation.
print("\n" + "="*70)
print("TRAINING COMPLETE - FINAL REPORT")
print("="*70)

# 30-second samples: 120 per hour, 24 hours per day
total_days = len(df_clean) / (120 * 24)

print(f"\nDATA SUMMARY")
print(f"   Total samples: {len(df_clean):,} ({total_days:.1f} days)")
print(f"   Training: {len(df_train_val):,} | Holdout: {len(df_holdout) if df_holdout is not None else 0:,}")
print(f"   Date range: {df_clean.index.min()} to {df_clean.index.max()}")
print(f"   ETH price: {'Binance 1-min ✓' if HAS_ETH_PRICE else 'Not available'}")
print(f"   Features: 1h={len(numeric_features_1h)}, 4h={len(numeric_features_4h)}, 24h={len(numeric_features_24h)}")

print(f"\n" + "-"*70)
print(f"{'MODEL PERFORMANCE':^70}")
print("-"*70)
print(f"{'Horizon':<8} {'Model':<15} {'CV MAE':>10} {'Holdout':>10} {'vs Base':>10} {'Status':>12}")
print("-"*70)

for horizon in ['1h', '4h', '24h']:
    if horizon in trained_models:
        data = trained_models[horizon]
        m = data['metrics']
        name = m['name'][:14]
        if data.get('is_fallback'):
            name = name[:10] + '(fb)'

        cv_mae = f"{m['mae']:.4f}"
        holdout_mae = f"{data.get('holdout_mae', 0):.4f}" if 'holdout_mae' in data else "N/A"
        improvement = f"{m['improvement']*100:+.1f}%"
        # A missing flag counts as a failure, as in the save cell
        status = "✓ PASS" if m.get('passed_baseline', False) else "✗ FAIL"

        print(f"{horizon:<8} {name:<15} {cv_mae:>10} {holdout_mae:>10} {improvement:>10} {status:>12}")

print("-"*70)

# Calibration report
if any('calibration' in trained_models.get(h, {}) for h in ['1h', '4h']):
    print(f"\n" + "-"*70)
    print(f"{'PREDICTION INTERVAL CALIBRATION':^70}")
    print("-"*70)
    print(f"{'Horizon':<10} {'Quantile 80%':>15} {'Conformal 80%':>15} {'Width (gwei)':>15}")
    print("-"*70)

    for horizon in ['1h', '4h']:
        if horizon in trained_models and 'calibration' in trained_models[horizon]:
            cal = trained_models[horizon]['calibration']
            q_cov = f"{cal['quantile_coverage']:.1%}"
            c_cov = f"{cal['conformal_coverage']:.1%}"
            width = f"±{cal['conformal_width']:.4f}"
            print(f"{horizon:<10} {q_cov:>15} {c_cov:>15} {width:>15}")

    print("-"*70)

# Direction models
if 'direction_models' in dir() and direction_models:
    print(f"\n" + "-"*70)
    print(f"{'DIRECTION PREDICTION':^70}")
    print("-"*70)
    for horizon, data in direction_models.items():
        print(f"  {horizon}: Accuracy={data['accuracy']:.1%}, F1={data['f1_score']:.3f}")

# 24h model status
print(f"\n" + "-"*70)
print(f"{'24H MODEL STATUS':^70}")
print("-"*70)
if '24h' in trained_models:
    if trained_models['24h'].get('is_fallback'):
        print(f"  ⚠️ Using 4h model as fallback (need 30+ days of data)")
        print(f"     Current data: {total_days:.1f} days")
        # Never suggest collecting a negative number of days
        print(f"     Recommendation: Collect {max(0.0, 30 - total_days):.0f} more days before training true 24h model")
    else:
        print(f"  ✓ True 24h model trained with {total_days:.1f} days of data")
else:
    print("  ⚠️ No 24h model available")

# Final recommendation
print(f"\n" + "="*70)
print("RECOMMENDATION")
print("="*70)

passed_by_horizon = {
    h: bool(d.get('metrics', {}).get('passed_baseline', False))
    for h, d in trained_models.items()
}
# all() is True for an empty collection, so "no models at all" is checked first
all_passed = bool(passed_by_horizon) and all(passed_by_horizon.values())

if not passed_by_horizon:
    print("⚠️ No models were trained - nothing to deploy")
    print("\nRecommendations:")
    print("  - Collect more data")
    print("  - Review feature engineering")
elif all_passed:
    print("✓ All models beat baseline - READY FOR DEPLOYMENT")
    print("\nNext steps:")
    print("  1. Download saved_models/ folder")
    print("  2. Copy to backend/models/saved_models/")
    print("  3. Restart backend")
else:
    failed = [h for h, passed in passed_by_horizon.items() if not passed]
    print(f"⚠️ Some models did not pass baseline: {failed}")
    print("\nRecommendations:")
    print("  - Collect more data")
    print("  - Review feature engineering")
    print("  - Only deploy passing models")

In [None]:
# Visualizations - IMPROVED with holdout baseline comparison
import matplotlib.pyplot as plt

print("\n" + "="*60)
print("GENERATING VISUALIZATIONS")
print("="*60)

# 2x2 dashboard: price distribution, model vs baseline MAE,
# recent price series, feature importance
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# --- Panel 1: training (and, when present, holdout) gas price distribution ---
ax1 = axes[0, 0]
train_gas = current_gas.values
if HAS_HOLDOUT:
    holdout_gas = df_holdout['gas'].values
    overlay_style = dict(bins=50, alpha=0.6, density=True)
    ax1.hist(train_gas, color='blue', label=f'Train (mean={train_gas.mean():.2f})', **overlay_style)
    ax1.hist(holdout_gas, color='red', label=f'Holdout (mean={holdout_gas.mean():.2f})', **overlay_style)
    panel1_title, panel1_ylabel = 'Train vs Holdout Distribution', 'Density'
else:
    ax1.hist(train_gas, bins=50, alpha=0.7, color='blue', edgecolor='black')
    ax1.axvline(train_gas.mean(), color='red', linestyle='--', label=f'Mean: {train_gas.mean():.2f}')
    panel1_title, panel1_ylabel = 'Gas Price Distribution', 'Frequency'
ax1.legend()
ax1.set_title(panel1_title)
ax1.set_xlabel('Gas Price (gwei)')
ax1.set_ylabel(panel1_ylabel)

# --- Panel 2: model MAE next to the baseline MAE, per horizon ---
ax2 = axes[0, 1]
horizons = list(trained_models.keys())
maes = [trained_models[h]['metrics']['mae'] for h in horizons]

# Prefer the holdout baseline; otherwise use the stored 'best' baseline.
# The 24h horizon is looked up under the 4h key.
baselines = []
for h in horizons:
    h_key = h.replace('24h', '4h')
    baseline_entry = BASELINES.get(h_key, {})
    if 'holdout_best' in baseline_entry:
        baselines.append(baseline_entry['holdout_best'])
    else:
        baselines.append(BASELINES.get(h_key, BASELINES['4h'])['best'])

x = np.arange(len(horizons))
width = 0.35
bars1 = ax2.bar(x - width/2, maes, width, label='Model MAE', color='steelblue')
bars2 = ax2.bar(x + width/2, baselines, width, label='Holdout Baseline', color='coral')
ax2.set_xlabel('Horizon')
ax2.set_ylabel('MAE (gwei)')
ax2.set_title('Model vs Holdout Baseline Performance')
ax2.set_xticks(x)
ax2.set_xticklabels(horizons)
ax2.legend()

# Label each bar pair with the relative improvement over its baseline
for i, (h, m, b) in enumerate(zip(horizons, maes, baselines)):
    imp = (b - m) / b * 100
    if imp > 0:
        color = 'green'
    else:
        color = 'red'
    y_pos = max(m, b) + 0.02 * max(max(maes), max(baselines))
    ax2.annotate(
        f'{imp:+.1f}%',
        xy=(i, y_pos),
        ha='center',
        fontsize=10,
        fontweight='bold',
        color=color,
    )

# --- Panel 3: most recent gas prices, with the holdout boundary marked ---
ax3 = axes[1, 0]
sample_size = min(2000, len(df_clean))
sample_df = df_clean.iloc[-sample_size:]
sample_gas = sample_df['gas']

ax3.plot(sample_gas.index, sample_gas.values, linewidth=0.5, alpha=0.8, color='blue')

if HAS_HOLDOUT:
    holdout_start = df_holdout.index[0]
    ax3.axvline(holdout_start, color='red', linestyle='--', linewidth=2, label='Holdout start')
    ax3.legend()

ax3.set_xlabel('Time')
ax3.set_ylabel('Gas Price (gwei)')
ax3.set_title(f'Recent Gas Prices (last {sample_size} samples)')
ax3.tick_params(axis='x', rotation=45)

# --- Panel 4: top-10 feature importances (placeholder text if all values are equal) ---
ax4 = axes[1, 1]
importance_values = list(FEATURE_IMPORTANCE.values()) if FEATURE_IMPORTANCE else []
has_varied_importance = any(v != importance_values[0] for v in importance_values)
if FEATURE_IMPORTANCE and has_varied_importance:
    sorted_imp = sorted(FEATURE_IMPORTANCE.items(), key=lambda pair: pair[1], reverse=True)[:10]
    features_plot = [feat_name[:20] for feat_name, _ in sorted_imp]
    importances = [feat_score for _, feat_score in sorted_imp]

    y_pos = np.arange(len(features_plot))
    ax4.barh(y_pos, importances, color='teal')
    ax4.set_yticks(y_pos)
    ax4.set_yticklabels(features_plot)
    ax4.invert_yaxis()
    ax4.set_xlabel('Importance')
    ax4.set_title('Top 10 Feature Importance (Permutation)')
else:
    ax4.text(0.5, 0.5, 'Feature importance uniform\n(Huber model)', ha='center', va='center', fontsize=12)
    ax4.set_title('Feature Importance')
    ax4.set_xlim(0, 1)
    ax4.set_ylim(0, 1)

plt.tight_layout()
plt.savefig('saved_models/training_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Saved training_results.png")

# === ADDITIONAL: Distribution shift visualization ===
# Drawn only when a holdout set exists and a shift was flagged: one panel per
# horizon, overlaying the train and holdout target distributions.
if HAS_HOLDOUT and DISTRIBUTION_SHIFT_DETECTED:
    fig2, axes2 = plt.subplots(1, 2, figsize=(12, 4))

    for i, horizon in enumerate(['1h', '4h']):
        ax = axes2[i]
        target_col = f'target_{horizon}'
        train_target = df_train_val[target_col].dropna()
        holdout_target = df_holdout[target_col].dropna()

        overlays = (
            (train_target, 'blue', 'Train'),
            (holdout_target, 'red', 'Holdout'),
        )
        for series, series_color, series_label in overlays:
            ax.hist(series, bins=50, alpha=0.6, color=series_color, label=series_label, density=True)

        ax.set_xlabel(f'{horizon} Target (gwei)')
        ax.set_ylabel('Density')
        ax.set_title(f'{horizon} Target Distribution')
        ax.legend()

    plt.tight_layout()
    plt.savefig('saved_models/distribution_shift.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved distribution_shift.png")


In [None]:
# Create zip file for download
import shutil

# Pack the contents of saved_models/ into a single zip next to the notebook
archive_base = 'gweizy_models'
archive_file = f'{archive_base}.zip'

shutil.make_archive(archive_base, 'zip', 'saved_models')
print(f"\n✅ Created {archive_file}")
print("\nDownload this file and extract to: backend/models/saved_models/")

# Auto-download
files.download(archive_file)