# Gweizy Model Training Notebook

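Train all gas prediction models for Gweizy: 1h and 4h price regressors, direction classifiers (Down/Stable/Up), and spike detectors. The "24h" model is currently an alias of the 4h model because the data does not contain long enough continuous segments.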

## Instructions:
1. Upload your `gas_data.db` file (from `backend/gas_data.db`)
2. Run all cells
3. Download the trained models zip file
4. Extract to `backend/models/saved_models/` and push to GitHub

In [None]:
# Install dependencies
!pip install -q scikit-learn pandas numpy joblib lightgbm xgboost matplotlib seaborn optuna

In [None]:
# Upload your gas_data.db file
from google.colab import files
import os

# Ask for the SQLite database; `uploaded` is keyed by the uploaded file names.
print("Upload your gas_data.db file from backend/gas_data.db")
uploaded = files.upload()

# Report the outcome: complain when the expected file name is absent,
# otherwise show the size of the received payload in MB.
if 'gas_data.db' not in uploaded:
    print("❌ Please upload gas_data.db")
else:
    print(f"\n✅ Uploaded gas_data.db ({len(uploaded['gas_data.db']) / 1024 / 1024:.1f} MB)")

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load data from database
import os  # imported here as well so this cell runs even if the upload cell was skipped

DB_PATH = 'gas_data.db'

# sqlite3.connect() creates an empty database when the file is missing, which
# would only surface later as a confusing "no such table" error - fail early.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"{DB_PATH} not found in the working directory - upload it in the previous cell first"
    )

conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql("""
    SELECT timestamp, current_gas as gas, base_fee, priority_fee, 
           block_number, gas_used, gas_limit, utilization
    FROM gas_prices ORDER BY timestamp ASC
""", conn)
finally:
    # Release the database handle even when the query itself fails.
    conn.close()

# Index by time so resampling and gap detection can work on the index.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp').sort_index()

print(f"Total records: {len(df):,}")
print(f"Date range: {df.index.min()} to {df.index.max()}")

# Resample to 1-minute (reduces noise, easier to work with).
# Minutes without any observation have a NaN `gas` mean and are dropped, so the
# resulting rows are NOT guaranteed to be exactly one minute apart.
print("\nResampling to 1-minute intervals...")
df = df.resample('1min').mean().dropna(subset=['gas'])
print(f"After resample: {len(df):,} records")

# Find segments (gap > 30 min = new segment).
# The cumulative sum of "gap detected" flags yields a running segment id.
df['time_diff'] = df.index.to_series().diff()
df['segment'] = (df['time_diff'] > pd.Timedelta(minutes=30)).cumsum()

segment_sizes = df.groupby('segment').size()
print(f"\nSegments found: {len(segment_sizes)}")
print(f"Segment sizes: {segment_sizes.sort_values(ascending=False).head(10).tolist()}")

# Keep segments with at least 120 rows (2 hours of 1-minute data)
MIN_SEGMENT_SIZE = 120
good_segments = segment_sizes[segment_sizes >= MIN_SEGMENT_SIZE].index.tolist()
df = df[df['segment'].isin(good_segments)]
print(f"\nKeeping {len(good_segments)} segments with >= {MIN_SEGMENT_SIZE} minutes")
print(f"Total usable records: {len(df):,}")

RECORDS_PER_HOUR = 60

In [None]:
# Feature Engineering - IMPROVED with micro-features for 1h model
# Key: Use SHORT windows (max 4h) + MICRO windows (5min, 15min, 30min) for 1h

print("Engineering features with MICRO + SHORT windows...")

def engineer_features_for_segment(seg_df):
    """Build model features and prediction targets for one continuous segment.

    Parameters
    ----------
    seg_df : pandas.DataFrame
        Rows of a single segment, indexed by a ``DatetimeIndex`` and containing
        at least a ``gas`` column. The input frame is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        The copied frame extended with time, lag, rolling, momentum, z-score,
        trend and price-position features, a ``volatility_regime`` code and the
        regression / direction targets. Rows whose look-back or look-ahead
        window runs off the segment hold NaN in the affected columns.

    Notes
    -----
    * Every lag and window is expressed in *rows*; the code assumes one row per
      minute (``rph = 60``).
      NOTE(review): if the caller dropped empty minutes, N rows can span more
      than N minutes - confirm this is acceptable.
    * ``target_24h`` intentionally uses the same 4h shift as ``target_4h``.
    * Direction labels are NaN (not "Stable") where the future price is unknown.
    * The order in which columns are created is kept stable on purpose, since
      order-sensitive feature selection may run on the result.
    """
    df = seg_df.copy()
    rph = 60  # records per hour (1-min intervals)

    # === Log transform gas (helps with skewed distribution) ===
    df['gas_log'] = np.log1p(df['gas'])

    # === Time features (ENHANCED) ===
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    df['day_of_week'] = df.index.dayofweek
    # Cyclical encodings so that 23:00 and 00:00 (or Sunday and Monday) are close.
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_business_hours'] = ((df['hour'] >= 9) & (df['hour'] <= 17)).astype(int)
    # Peak hours for Ethereum (typically 14:00-22:00 UTC)
    df['is_peak_hours'] = ((df['hour'] >= 14) & (df['hour'] <= 22)).astype(int)

    # === MICRO Lag features (for 1h prediction) ===
    for lag_mins in [5, 10, 15, 30]:
        df[f'gas_lag_{lag_mins}min'] = df['gas'].shift(lag_mins)
        df[f'gas_change_{lag_mins}min'] = df['gas'] - df['gas'].shift(lag_mins)
        df[f'gas_pct_change_{lag_mins}min'] = df['gas'].pct_change(lag_mins)

    # === MICRO Rolling stats (5min, 15min, 30min windows) ===
    for window_mins in [5, 15, 30]:
        df[f'gas_mean_{window_mins}min'] = df['gas'].rolling(window_mins, min_periods=window_mins//2).mean()
        df[f'gas_std_{window_mins}min'] = df['gas'].rolling(window_mins, min_periods=window_mins//2).std()
        df[f'gas_min_{window_mins}min'] = df['gas'].rolling(window_mins, min_periods=window_mins//2).min()
        df[f'gas_max_{window_mins}min'] = df['gas'].rolling(window_mins, min_periods=window_mins//2).max()
        # Volatility
        df[f'gas_range_{window_mins}min'] = df[f'gas_max_{window_mins}min'] - df[f'gas_min_{window_mins}min']
        df[f'gas_cv_{window_mins}min'] = df[f'gas_std_{window_mins}min'] / (df[f'gas_mean_{window_mins}min'] + 1e-8)

    # === Standard Lag features (hours) ===
    for lag_hours in [1, 2, 4]:
        df[f'gas_lag_{lag_hours}h'] = df['gas'].shift(lag_hours * rph)
        df[f'gas_log_lag_{lag_hours}h'] = df['gas_log'].shift(lag_hours * rph)

    # === Rolling stats (SHORT windows: 1h, 2h, 4h) ===
    for window_hours in [1, 2, 4]:
        window = window_hours * rph
        df[f'gas_mean_{window_hours}h'] = df['gas'].rolling(window, min_periods=window//2).mean()
        df[f'gas_std_{window_hours}h'] = df['gas'].rolling(window, min_periods=window//2).std()
        df[f'gas_min_{window_hours}h'] = df['gas'].rolling(window, min_periods=window//2).min()
        df[f'gas_max_{window_hours}h'] = df['gas'].rolling(window, min_periods=window//2).max()
        df[f'gas_median_{window_hours}h'] = df['gas'].rolling(window, min_periods=window//2).median()

        # EMA (Exponential Moving Average)
        df[f'gas_ema_{window_hours}h'] = df['gas'].ewm(span=window, min_periods=window//2).mean()

        # Volatility features
        df[f'gas_cv_{window_hours}h'] = df[f'gas_std_{window_hours}h'] / (df[f'gas_mean_{window_hours}h'] + 1e-8)
        df[f'gas_range_{window_hours}h'] = df[f'gas_max_{window_hours}h'] - df[f'gas_min_{window_hours}h']
        df[f'gas_range_pct_{window_hours}h'] = df[f'gas_range_{window_hours}h'] / (df[f'gas_mean_{window_hours}h'] + 1e-8)

    # === MICRO Momentum (for 1h) ===
    for mins in [5, 15, 30]:
        df[f'momentum_{mins}min'] = df['gas'] - df['gas'].shift(mins)
        df[f'momentum_pct_{mins}min'] = df['gas'].pct_change(mins)
        # Acceleration (rate of change of momentum)
        df[f'acceleration_{mins}min'] = df[f'momentum_{mins}min'] - df[f'momentum_{mins}min'].shift(mins)

    # === Standard Momentum ===
    for hours in [1, 2]:
        periods = hours * rph
        df[f'momentum_{hours}h'] = df['gas'] - df['gas'].shift(periods)
        df[f'momentum_pct_{hours}h'] = df['gas'].pct_change(periods)
        df[f'acceleration_{hours}h'] = df[f'momentum_{hours}h'] - df[f'momentum_{hours}h'].shift(periods)
        df[f'direction_{hours}h'] = np.sign(df[f'momentum_{hours}h'])

    # === Z-score ===
    for hours in [1, 2, 4]:
        df[f'gas_zscore_{hours}h'] = (df['gas'] - df[f'gas_mean_{hours}h']) / (df[f'gas_std_{hours}h'] + 1e-8)

    # === Trend indicators (ratio of a shorter average to a longer one) ===
    df['trend_15min_1h'] = df['gas_mean_15min'] / (df['gas_mean_1h'] + 1e-8)
    df['trend_30min_1h'] = df['gas_mean_30min'] / (df['gas_mean_1h'] + 1e-8)
    df['trend_1h_2h'] = df['gas_mean_1h'] / (df['gas_mean_2h'] + 1e-8)
    df['trend_1h_4h'] = df['gas_mean_1h'] / (df['gas_mean_4h'] + 1e-8)
    df['ema_trend_short'] = df['gas_ema_1h'] / (df['gas_ema_2h'] + 1e-8)
    df['ema_trend_long'] = df['gas_ema_1h'] / (df['gas_ema_4h'] + 1e-8)

    # === Price position (where is current price in recent range) ===
    for window in ['30min', '1h', '2h', '4h']:
        col_max = f'gas_max_{window}'
        col_min = f'gas_min_{window}'
        if col_max in df.columns and col_min in df.columns:
            range_size = df[col_max] - df[col_min]
            df[f'price_position_{window}'] = (df['gas'] - df[col_min]) / (range_size + 1e-8)

    # === Recent volatility regime (for confidence) ===
    # include_lowest=True keeps a coefficient of variation of exactly 0 (flat
    # price for the whole window) in the "Low" bucket; without it pd.cut's
    # left-open first bin maps it to NaN.
    df['volatility_regime'] = pd.cut(
        df['gas_cv_1h'],
        bins=[0, 0.05, 0.15, float('inf')],
        labels=[0, 1, 2],  # 0=Low, 1=Medium, 2=High
        include_lowest=True,
    ).astype(float)

    # === Targets (absolute) ===
    df['target_1h'] = df['gas'].shift(-1 * rph)
    df['target_4h'] = df['gas'].shift(-4 * rph)
    df['target_24h'] = df['gas'].shift(-4 * rph)  # Actually 4h (honest labeling)

    # === Targets (percentage change - more stable) ===
    df['target_pct_1h'] = (df['target_1h'] - df['gas']) / (df['gas'] + 1e-8)
    df['target_pct_4h'] = (df['target_4h'] - df['gas']) / (df['gas'] + 1e-8)

    # === Direction targets (for classification) ===
    # Use threshold to avoid noisy "stable" predictions
    threshold = 0.02  # 2% change threshold

    def classify_direction(pct_change, threshold):
        """Label a Series of relative changes: 0=Down, 1=Stable, 2=Up, NaN=unknown."""
        labels = np.select(
            [(pct_change < -threshold).to_numpy(), (pct_change > threshold).to_numpy()],
            [0.0, 2.0],
            default=1.0,
        )
        # Comparisons with NaN are False, which would fall through to "Stable";
        # mask those rows so an unknown future is not recorded as a real label.
        return pd.Series(labels, index=pct_change.index).where(pct_change.notna())

    df['direction_class_1h'] = classify_direction(df['target_pct_1h'], threshold)
    df['direction_class_4h'] = classify_direction(df['target_pct_4h'], threshold)

    return df

# Run feature engineering one segment at a time so that lags, rolling windows
# and look-ahead targets never reach across a gap between segments.
print("Processing segments independently...")
all_features = []

for seg_id, seg_rows in df.groupby('segment', sort=False):
    # Bookkeeping columns must not end up as model features.
    seg_df = seg_rows.drop(columns=['segment', 'time_diff'])
    if len(seg_df) < MIN_SEGMENT_SIZE:
        continue
    featured = engineer_features_for_segment(seg_df)
    all_features.append(featured)
    print(f"  Segment {seg_id}: {len(seg_df)} → {len(featured.dropna())} usable rows")

# Stitch the segments back together and neutralise infinities (produced by the
# ratio features) so they are treated like any other missing value.
df_features = pd.concat(all_features).replace([np.inf, -np.inf], np.nan)

print(f"\nTotal featured samples: {len(df_features):,}")
print(f"After dropping NaN: {len(df_features.dropna()):,}")

In [None]:
# Prepare training data with feature selection
from sklearn.preprocessing import RobustScaler

# Columns to exclude from features: raw inputs, every target, and the
# volatility regime (kept separately for confidence reporting).
exclude_cols = ['gas', 'gas_log', 'base_fee', 'priority_fee', 'block_number', 
                'gas_used', 'gas_limit', 'utilization',
                'target_1h', 'target_4h', 'target_24h',
                'target_pct_1h', 'target_pct_4h',
                'direction_class_1h', 'direction_class_4h',
                'volatility_regime']

feature_cols = [c for c in df_features.columns if c not in exclude_cols]
print(f"Initial feature columns: {len(feature_cols)}")

# Drop rows with NaN (in ANY column, including the excluded raw columns)
df_clean = df_features.dropna()
print(f"Clean samples: {len(df_clean):,}")

# === Feature Selection: Remove highly correlated features (>0.90 for small dataset) ===
print("\nRemoving highly correlated features (>0.90)...")
X_temp = df_clean[feature_cols]
corr_matrix = X_temp.corr().abs()

# Find pairs with correlation > 0.90 (stricter for small dataset).
# Only the strict upper triangle is inspected, so of two correlated columns the
# one that appears LATER in feature_cols is the one that gets dropped.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
print(f"  Dropping {len(to_drop)} highly correlated features")

feature_cols = [c for c in feature_cols if c not in to_drop]
print(f"Final feature columns: {len(feature_cols)}")

# === Create horizon-specific feature sets ===
# 1h model benefits from micro-features
micro_features = [c for c in feature_cols if 'min' in c or 'micro' in c.lower()]
hour_features = [c for c in feature_cols if 'h' in c and 'min' not in c]
time_features = [c for c in feature_cols if any(t in c for t in ['hour', 'day', 'sin', 'cos', 'weekend', 'business', 'peak'])]
trend_features = [c for c in feature_cols if 'trend' in c or 'position' in c or 'zscore' in c]

# 1h: prioritize micro-features + short-term.
# Membership is tested against a set, but the list itself follows the order of
# feature_cols: building it via list(set(...)) would make the column order (and
# therefore the fitted models) depend on Python's per-process string hashing.
selected_1h = (
    set(micro_features)
    | set(time_features)
    | set(trend_features)
    | {c for c in feature_cols if '1h' in c or '2h' in c}
)
features_1h = [c for c in feature_cols if c in selected_1h]

# 4h: use all features but weight longer-term
features_4h = feature_cols  # Use all for 4h

print(f"\n1h model features: {len(features_1h)}")
print(f"4h model features: {len(features_4h)}")

# Prepare data
X = df_clean[feature_cols]
X_1h = df_clean[[c for c in features_1h if c in df_clean.columns]]
X_4h = df_clean[[c for c in features_4h if c in df_clean.columns]]

y_1h = df_clean['target_1h']
y_4h = df_clean['target_4h']
y_24h = df_clean['target_24h']

# Percentage targets (alternative)
y_pct_1h = df_clean['target_pct_1h']
y_pct_4h = df_clean['target_pct_4h']

# Direction targets for classification
y_dir_1h = df_clean['direction_class_1h']
y_dir_4h = df_clean['direction_class_4h']

# Volatility regime for confidence
volatility_regime = df_clean['volatility_regime']

# Store current gas for baseline
current_gas = df_clean['gas']

# === Baseline Models ===
# NOTE(review): these baselines are computed over ALL clean rows, whereas the
# training functions report model MAE on a held-out tail of the data, so the
# "vs baseline" percentages compare different row sets - confirm intended.
print(f"\n{'='*50}")
print("BASELINE COMPARISONS")
print(f"{'='*50}")

# Naive baseline: predict last known value (the current price, for both horizons)
naive_pred_1h = current_gas.values
naive_mae_1h = np.mean(np.abs(y_1h.values - naive_pred_1h))
naive_mae_4h = np.mean(np.abs(y_4h.values - naive_pred_1h))

# Mean baseline: predict historical mean of the respective target
mean_pred = np.full_like(y_1h.values, y_1h.mean())
mean_mae_1h = np.mean(np.abs(y_1h.values - mean_pred))
mean_pred_4h = np.full_like(y_4h.values, y_4h.mean())
mean_mae_4h = np.mean(np.abs(y_4h.values - mean_pred_4h))

# Drift baseline: extrapolate recent trend (current price + change over the last hour)
drift_pred_1h = current_gas.values + df_clean['momentum_1h'].values
drift_mae_1h = np.mean(np.abs(y_1h.values - drift_pred_1h))

print(f"\nBaseline MAEs:")
print(f"  Naive (current price):     MAE_1h={naive_mae_1h:.6f}, MAE_4h={naive_mae_4h:.6f}")
print(f"  Mean (historical average): MAE_1h={mean_mae_1h:.6f}, MAE_4h={mean_mae_4h:.6f}")
print(f"  Drift (extrapolate trend): MAE_1h={drift_mae_1h:.6f}")

# Use best (lowest-MAE) baseline for comparison
best_baseline_1h = min(naive_mae_1h, mean_mae_1h, drift_mae_1h)
best_baseline_4h = min(naive_mae_4h, mean_mae_4h)

print(f"\n  Best baseline 1h: {best_baseline_1h:.6f}")
print(f"  Best baseline 4h: {best_baseline_4h:.6f}")

# Store baselines for comparison
BASELINES = {
    '1h': {'naive_mae': naive_mae_1h, 'mean_mae': mean_mae_1h, 'drift_mae': drift_mae_1h, 'best': best_baseline_1h},
    '4h': {'naive_mae': naive_mae_4h, 'mean_mae': mean_mae_4h, 'best': best_baseline_4h}
}

print(f"\n{'='*50}")
print("TRAINING DATA SUMMARY")
print(f"{'='*50}")
print(f"Samples: {len(X):,}")
print(f"Features (all): {len(feature_cols)}")
print(f"Features (1h specific): {len(features_1h)}")
print(f"Target 1h range: {y_1h.min():.4f} - {y_1h.max():.4f} gwei")

In [None]:
# Model Training with Hyperparameter Tuning, Simpler Models, and Quantile Regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet, HuberRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore')

def time_series_cv(model, X, y, n_splits=5):
    """Score ``model`` with forward-chaining (TimeSeriesSplit) cross-validation.

    For every fold a fresh RobustScaler is fitted on the training rows only,
    the model is refitted on the scaled training rows and scored on the scaled
    validation rows. Note that ``model`` itself is refitted in place.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        X: feature frame, positionally indexable via ``.iloc``.
        y: target series, positionally indexable via ``.iloc``.
        n_splits: number of folds handed to TimeSeriesSplit.

    Returns:
        dict with ``mae_mean``, ``mae_std``, ``r2_mean`` and ``r2_std``
        aggregated over the folds.
    """
    fold_mae = []
    fold_r2 = []
    splitter = TimeSeriesSplit(n_splits=n_splits)

    for fit_rows, holdout_rows in splitter.split(X):
        # Scale with statistics from the training part only.
        fold_scaler = RobustScaler()
        fit_matrix = fold_scaler.fit_transform(X.iloc[fit_rows])
        holdout_matrix = fold_scaler.transform(X.iloc[holdout_rows])
        holdout_truth = y.iloc[holdout_rows]

        model.fit(fit_matrix, y.iloc[fit_rows])
        holdout_pred = model.predict(holdout_matrix)

        fold_mae.append(mean_absolute_error(holdout_truth, holdout_pred))
        fold_r2.append(r2_score(holdout_truth, holdout_pred))

    summary = {}
    summary['mae_mean'] = np.mean(fold_mae)
    summary['mae_std'] = np.std(fold_mae)
    summary['r2_mean'] = np.mean(fold_r2)
    summary['r2_std'] = np.std(fold_r2)
    return summary

def train_1h_model(X, y, current_gas, baseline_mae):
    """
    Train 1h model with SIMPLER models optimized for short-term prediction.
    1h is noisy - simpler models with strong regularization work better.

    The data is split chronologically (first 80% train, last 20% test), scaled
    with a RobustScaler fitted on the training part, and used to fit Ridge,
    ElasticNet, Huber, a small RandomForest and (if importable) LightGBM.
    A weighted average of all fitted models is evaluated as "Ensemble".

    Args:
        X: feature DataFrame, rows in chronological order.
        y: target Series aligned with ``X``.
        current_gas: accepted for interface compatibility; not used here.
        baseline_mae: MAE of the reference baseline used for the
            "vs baseline" figures.

    Returns:
        tuple ``(best, results, feature_names, ensemble_pred, y_test,
        confidence_scores)`` where ``best`` and each entry of ``results`` is
        ``(name, model, metrics, scaler)``; for the ensemble, ``model`` is the
        list of member models.
    """
    print(f"\n{'='*60}")
    print("Training 1h models (SIMPLER - optimized for short-term)")
    print(f"{'='*60}")
    print(f"Baseline MAE (best): {baseline_mae:.6f}")

    # Time-series split
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"Train: {len(X_train):,}, Test: {len(X_test):,}")

    # Scale
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = []
    all_preds = []

    def _record(name, fitted_model, preds):
        """Evaluate test predictions, store the model and print its score."""
        metrics = evaluate_model(y_test, preds, baseline_mae)
        results.append((name, fitted_model, metrics, scaler))
        all_preds.append(preds)
        print(f"      MAE: {metrics['mae']:.6f}, vs Baseline: {metrics['vs_baseline']}")

    # === 1. Ridge (strong regularization for noisy data) ===
    print("\n[1/5] Ridge Regression (high regularization)...")
    ridge = Ridge(alpha=10.0, random_state=42)  # High alpha for regularization
    ridge.fit(X_train_scaled, y_train)
    _record('Ridge', ridge, ridge.predict(X_test_scaled))

    # === 2. ElasticNet (L1+L2 regularization) ===
    print("[2/5] ElasticNet...")
    elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42, max_iter=5000)
    elastic.fit(X_train_scaled, y_train)
    _record('ElasticNet', elastic, elastic.predict(X_test_scaled))

    # === 3. Huber Regressor (robust to outliers) ===
    print("[3/5] Huber Regressor (robust to outliers)...")
    huber = HuberRegressor(epsilon=1.35, alpha=1.0, max_iter=1000)
    huber.fit(X_train_scaled, y_train)
    _record('Huber', huber, huber.predict(X_test_scaled))

    # === 4. Small Random Forest (reduced complexity) ===
    print("[4/5] Small Random Forest...")
    rf_small = RandomForestRegressor(
        n_estimators=50, max_depth=5, min_samples_split=20,
        min_samples_leaf=10, random_state=42, n_jobs=-1
    )
    rf_small.fit(X_train_scaled, y_train)
    _record('RF_Small', rf_small, rf_small.predict(X_test_scaled))

    # === 5. LightGBM with aggressive regularization ===
    # Best effort: a missing or failing LightGBM must not abort the whole run.
    try:
        import lightgbm as lgb
        print("[5/5] LightGBM (high regularization)...")

        # The last 10% of the training rows serve as early-stopping validation.
        val_split = int(len(X_train_scaled) * 0.9)
        X_tr, X_val = X_train_scaled[:val_split], X_train_scaled[val_split:]
        y_tr, y_val = y_train.iloc[:val_split], y_train.iloc[val_split:]

        lgbm = lgb.LGBMRegressor(
            n_estimators=200, max_depth=4, learning_rate=0.05,
            num_leaves=15, min_child_samples=30, subsample=0.7,
            colsample_bytree=0.7, reg_alpha=1.0, reg_lambda=1.0,  # Strong regularization
            random_state=42, n_jobs=-1, verbose=-1
        )
        lgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], 
                 callbacks=[lgb.early_stopping(30, verbose=False)])

        _record('LightGBM', lgbm, lgbm.predict(X_test_scaled))
    except Exception as e:
        print(f"[5/5] LightGBM failed: {e}")

    # === Ensemble ===
    print("\n[Ensemble] Weighted average (favor simpler models)...")
    # Weight simpler models more for 1h; fall back to equal weights when a
    # model is missing (the fixed weights only make sense for all five).
    weights = [0.25, 0.25, 0.2, 0.15, 0.15] if len(all_preds) == 5 else [1/len(all_preds)] * len(all_preds)
    ensemble_pred = np.average(all_preds, axis=0, weights=weights[:len(all_preds)])
    ensemble_metrics = evaluate_model(y_test, ensemble_pred, baseline_mae)
    print(f"      MAE: {ensemble_metrics['mae']:.6f}, vs Baseline: {ensemble_metrics['vs_baseline']}")

    # === Select best ===
    all_results = results + [('Ensemble', [r[1] for r in results], ensemble_metrics, scaler)]
    best = max(all_results, key=lambda x: x[2]['improvement'])

    print(f"\n>>> Best 1h model: {best[0]} (MAE: {best[2]['mae']:.6f}, {best[2]['vs_baseline']})")

    # Confidence is derived from a single model; when the ensemble wins, the
    # first trained model stands in for it.
    confidence_scores = calculate_confidence(X_test, y_test, best[1] if best[0] != 'Ensemble' else results[0][1], scaler)

    return best, results, list(X.columns), ensemble_pred, y_test, confidence_scores

def train_4h_model(X, y, current_gas, baseline_mae):
    """
    Train 4h model with full model suite + hyperparameter tuning.
    4h is more predictable - can use more complex models.

    The data is split chronologically (first 80% train, last 20% test) and
    scaled with a RobustScaler fitted on the training part. A tuned
    RandomForest, GradientBoosting and (if importable) LightGBM and XGBoost are
    fitted; their unweighted mean is evaluated as "Ensemble".

    Args:
        X: feature DataFrame, rows in chronological order.
        y: target Series aligned with ``X``.
        current_gas: accepted for interface compatibility; not used here.
        baseline_mae: MAE of the reference baseline used for the
            "vs baseline" figures.

    Returns:
        tuple ``(best, results, feature_names, ensemble_pred, y_test,
        confidence_scores)`` where ``best`` and each entry of ``results`` is
        ``(name, model, metrics, scaler)``; for the ensemble, ``model`` is the
        list of member models.
    """
    print(f"\n{'='*60}")
    print("Training 4h models (FULL SUITE with tuning)")
    print(f"{'='*60}")
    print(f"Baseline MAE (best): {baseline_mae:.6f}")

    # Time-series split
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"Train: {len(X_train):,}, Test: {len(X_test):,}")

    # Scale
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Early-stopping validation tail (last 10% of the training rows), shared by
    # LightGBM and XGBoost. Defined up front so XGBoost does not depend on the
    # LightGBM block having run successfully.
    val_split = int(len(X_train_scaled) * 0.9)
    X_tr, X_val = X_train_scaled[:val_split], X_train_scaled[val_split:]
    y_tr, y_val = y_train.iloc[:val_split], y_train.iloc[val_split:]

    results = []
    all_preds = []

    def _record(name, fitted_model, preds):
        """Evaluate test predictions and store the model; returns the metrics."""
        metrics = evaluate_model(y_test, preds, baseline_mae)
        results.append((name, fitted_model, metrics, scaler))
        all_preds.append(preds)
        return metrics

    # === 1. Random Forest with tuning ===
    print("\n[1/4] Random Forest with RandomizedSearchCV...")
    rf_params = {
        'n_estimators': [100, 150, 200],
        'max_depth': [8, 12, 15],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [3, 5, 8]
    }

    rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=3)
    rf_search = RandomizedSearchCV(
        rf_base, rf_params, n_iter=10, cv=tscv, 
        scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1
    )
    rf_search.fit(X_train_scaled, y_train)
    rf = rf_search.best_estimator_

    rf_metrics = _record('RandomForest', rf, rf.predict(X_test_scaled))
    print(f"      Best params: {rf_search.best_params_}")
    print(f"      MAE: {rf_metrics['mae']:.6f}, vs Baseline: {rf_metrics['vs_baseline']}")

    # === 2. Gradient Boosting ===
    print("[2/4] Gradient Boosting...")
    gb = GradientBoostingRegressor(
        n_estimators=150, max_depth=6, learning_rate=0.05,
        min_samples_split=10, subsample=0.8, random_state=42
    )
    gb.fit(X_train_scaled, y_train)
    gb_metrics = _record('GradientBoosting', gb, gb.predict(X_test_scaled))
    print(f"      MAE: {gb_metrics['mae']:.6f}, vs Baseline: {gb_metrics['vs_baseline']}")

    # === 3. LightGBM ===
    # Best effort: a missing or failing booster must not abort the whole run.
    try:
        import lightgbm as lgb
        print("[3/4] LightGBM with early stopping...")

        lgbm = lgb.LGBMRegressor(
            n_estimators=500, max_depth=10, learning_rate=0.03,
            num_leaves=31, min_child_samples=20, subsample=0.8,
            colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1,
            random_state=42, n_jobs=-1, verbose=-1
        )
        lgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], 
                 callbacks=[lgb.early_stopping(50, verbose=False)])

        lgbm_metrics = _record('LightGBM', lgbm, lgbm.predict(X_test_scaled))
        print(f"      MAE: {lgbm_metrics['mae']:.6f}, vs Baseline: {lgbm_metrics['vs_baseline']}")
    except Exception as e:
        print(f"[3/4] LightGBM failed: {e}")

    # === 4. XGBoost ===
    try:
        import xgboost as xgb
        print("[4/4] XGBoost with early stopping...")

        xgbm = xgb.XGBRegressor(
            n_estimators=500, max_depth=8, learning_rate=0.03,
            min_child_weight=5, subsample=0.8, colsample_bytree=0.8,
            reg_alpha=0.1, reg_lambda=1.0, random_state=42, 
            n_jobs=-1, verbosity=0, early_stopping_rounds=50
        )
        xgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

        xgbm_metrics = _record('XGBoost', xgbm, xgbm.predict(X_test_scaled))
        print(f"      MAE: {xgbm_metrics['mae']:.6f}, vs Baseline: {xgbm_metrics['vs_baseline']}")
    except Exception as e:
        print(f"[4/4] XGBoost failed: {e}")

    # === Ensemble ===
    print("\n[Ensemble] Average all models...")
    ensemble_pred = np.mean(all_preds, axis=0)
    ensemble_metrics = evaluate_model(y_test, ensemble_pred, baseline_mae)
    print(f"      MAE: {ensemble_metrics['mae']:.6f}, vs Baseline: {ensemble_metrics['vs_baseline']}")

    # Select best
    all_results = results + [('Ensemble', [r[1] for r in results], ensemble_metrics, scaler)]
    best = max(all_results, key=lambda x: x[2]['improvement'])

    print(f"\n>>> Best 4h model: {best[0]} (MAE: {best[2]['mae']:.6f}, {best[2]['vs_baseline']})")

    # Confidence is derived from a single model; when the ensemble wins, the
    # first trained model stands in for it.
    confidence_scores = calculate_confidence(X_test, y_test, best[1] if best[0] != 'Ensemble' else results[0][1], scaler)

    return best, results, list(X.columns), ensemble_pred, y_test, confidence_scores

def evaluate_model(y_true, y_pred, baseline_mae):
    """Calculate model metrics with baseline comparison.

    Args:
        y_true: actual values (Series, list or array), in chronological order.
        y_pred: predicted values aligned with ``y_true``.
        baseline_mae: MAE of the reference baseline.

    Returns:
        dict with ``mae``, ``rmse``, ``r2``, ``directional_accuracy`` (share of
        consecutive steps where prediction and actual move the same way),
        ``improvement`` (percent MAE reduction versus the baseline, positive is
        better) and ``vs_baseline`` (the same figure formatted for display).
    """
    # Work on plain arrays so Series, lists and ndarrays are all accepted.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    # Directional accuracy: sign of the step-to-step change, actual vs predicted
    if len(actual) > 1:
        actual_dir = np.sign(np.diff(actual))
        pred_dir = np.sign(np.diff(predicted))
        dir_acc = np.mean(actual_dir == pred_dir)
    else:
        dir_acc = 0.0

    # Compare to baseline. A perfect baseline (MAE 0) cannot be improved on:
    # report 0 when the model is perfect too, otherwise "infinitely worse",
    # instead of dividing by zero.
    if baseline_mae == 0:
        improvement = 0.0 if mae == 0 else float('-inf')
    else:
        improvement = (baseline_mae - mae) / baseline_mae * 100
    vs_baseline = f"{improvement:+.1f}%" if improvement != 0 else "0%"

    return {
        'mae': mae, 'rmse': rmse, 'r2': r2, 
        'directional_accuracy': dir_acc,
        'vs_baseline': vs_baseline, 'improvement': improvement
    }

def calculate_confidence(X_test, y_test, model, scaler):
    """
    Calculate a per-sample prediction confidence in [0, 1].

    * Models exposing ``estimators_`` as a list of independently fitted
      predictors (forest-style): confidence is ``1 - std / p95(std)`` of the
      individual members' predictions, clipped to [0, 1], so strong
      disagreement between members means low confidence.
    * Every other model: a uniform medium confidence of 0.6. This includes
      boosted models whose ``estimators_`` is not a plain list; their members
      are not independent predictors of the target, so a spread across them
      would not be meaningful.

    Args:
        X_test: feature rows; anything that is not already an ndarray is passed
            through ``scaler.transform`` first.
        y_test: accepted for interface compatibility; not used.
        model: fitted model.
        scaler: fitted scaler exposing ``transform``.

    Returns:
        numpy.ndarray with one confidence value per row of ``X_test``.
    """
    n_rows = len(X_test)
    if n_rows == 0:
        # Nothing to score; also avoids taking a percentile of an empty array.
        return np.zeros(0)

    X_scaled = X_test if isinstance(X_test, np.ndarray) else scaler.transform(X_test)

    members = getattr(model, 'estimators_', None)
    is_forest_like = (
        isinstance(members, list)
        and len(members) > 0
        and all(hasattr(member, 'predict') for member in members)
    )

    if is_forest_like:
        # For ensemble models, use prediction variance across trees
        tree_preds = np.array([member.predict(X_scaled) for member in members])
        pred_std = np.std(tree_preds, axis=0)
        # Lower std = higher confidence; normalise by the 95th percentile so a
        # few extreme rows do not compress everything else towards 1.
        max_std = np.percentile(pred_std, 95)
        return 1 - np.clip(pred_std / (max_std + 1e-8), 0, 1)

    # For other models, use uniform medium confidence
    return np.full(n_rows, 0.6)

print("Training functions defined.")

In [None]:
# Train all models with separate strategies for 1h vs 4h
print("="*60)
print("TRAINING ALL MODELS")
print("="*60)

# Use 1h-specific features for 1h model (micro-features)
# Each call returns: (best, all results, feature names used, ensemble
# predictions on the test split, test targets, confidence scores).
print("\n>>> Using micro-features for 1h model")
best_1h, all_1h, features_1h_used, pred_1h, actual_1h, conf_1h = train_1h_model(
    X_1h, y_1h, current_gas, BASELINES['1h']['best']
)

# Use full features for 4h model
print("\n>>> Using full features for 4h model")
best_4h, all_4h, features_4h_used, pred_4h, actual_4h, conf_4h = train_4h_model(
    X_4h, y_4h, current_gas, BASELINES['4h']['best']
)

# 24h model (actually 4h - honest labeling)
# No separate training happens: every *_24h name is an alias of the 4h object.
print("\n>>> 24h model = 4h model (data limitation)")
print("    Note: '24h' predictions are actually 4h ahead due to insufficient continuous data")
best_24h = best_4h  # Same as 4h
all_24h = all_4h
pred_24h = pred_4h
actual_24h = actual_4h
conf_24h = conf_4h

# Store features used for saving
# NOTE(review): the 1h model was fitted on X_1h (see features_1h_used), which
# can be a subset of feature_cols - confirm that whatever consumes `features`
# uses the per-model feature list when predicting with the 1h model.
features = feature_cols  # Use all features for model file

In [None]:
# Direction Prediction (Classification: Down/Stable/Up) - IMPROVED
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

print("\n" + "="*60)
print("DIRECTION PREDICTION (Classification)")
print("="*60)
print("Classes: 0=Down (>2% drop), 1=Stable (<2% change), 2=Up (>2% rise)")

def train_direction_model(X, y_dir, horizon_name, use_class_weights=True):
    """
    Train a Down/Stable/Up direction classifier and return the best candidate.

    Rows with a missing label are removed, the remainder is split
    chronologically (first 80% train, last 20% test) and scaled with a
    RobustScaler fitted on the training part. RandomForest, GradientBoosting
    and LogisticRegression are fitted and compared on the test split; the
    candidate with the highest weighted F1 is returned.

    Args:
        X: feature DataFrame, rows in chronological order.
        y_dir: Series of class labels (0=Down, 1=Stable, 2=Up), may hold NaN.
        horizon_name: label used in the printed report (e.g. '1h').
        use_class_weights: if True, inverse-frequency class weights are passed
            to the classifiers that accept ``class_weight`` (RandomForest and
            LogisticRegression).

    Returns:
        tuple ``(best_classifier, scaler, accuracy, weighted_f1)``.

    NOTE(review): the class weights are derived from all valid rows (train and
    test together) and the winner is chosen on the test split, so the reported
    scores are optimistic - confirm this is acceptable.
    """
    print(f"\n{horizon_name} Direction Classifier:")

    # Remove NaN
    valid_idx = ~y_dir.isna()
    X_valid = X[valid_idx]
    y_valid = y_dir[valid_idx].astype(int)

    # Class distribution
    class_counts = y_valid.value_counts().sort_index()
    total = len(y_valid)
    print(f"  Class distribution:")
    print(f"    Down (0):   {class_counts.get(0,0):5d} ({class_counts.get(0,0)/total*100:.1f}%)")
    print(f"    Stable (1): {class_counts.get(1,0):5d} ({class_counts.get(1,0)/total*100:.1f}%)")
    print(f"    Up (2):     {class_counts.get(2,0):5d} ({class_counts.get(2,0)/total*100:.1f}%)")

    # Calculate class weights (inverse frequency)
    if use_class_weights:
        class_weights = {i: total / (3 * count) for i, count in class_counts.items()}
        print(f"  Using class weights: {class_weights}")
    else:
        class_weights = None

    # Split
    split_idx = int(len(X_valid) * 0.8)
    X_train, X_test = X_valid.iloc[:split_idx], X_valid.iloc[split_idx:]
    y_train, y_test = y_valid.iloc[:split_idx], y_valid.iloc[split_idx:]

    # Scale
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # (progress line, result name, unfitted classifier), in training order
    candidates = [
        (
            "\n  [1/3] Random Forest...",
            'RandomForest',
            RandomForestClassifier(
                n_estimators=150, max_depth=10, min_samples_split=10,
                class_weight=class_weights, random_state=42, n_jobs=-1
            ),
        ),
        (
            "  [2/3] Gradient Boosting...",
            'GradientBoosting',
            GradientBoostingClassifier(
                n_estimators=100, max_depth=5, learning_rate=0.1,
                random_state=42
            ),
        ),
        (
            "  [3/3] Logistic Regression...",
            'LogisticRegression',
            LogisticRegression(
                class_weight=class_weights, max_iter=1000, random_state=42, n_jobs=-1
            ),
        ),
    ]

    results = []
    for progress_line, name, clf in candidates:
        print(progress_line)
        clf.fit(X_train_scaled, y_train)
        pred = clf.predict(X_test_scaled)
        acc = accuracy_score(y_test, pred)
        f1 = f1_score(y_test, pred, average='weighted')
        results.append((name, clf, acc, f1))
        print(f"        Accuracy: {acc:.1%}, F1: {f1:.3f}")

    # Baseline: always predict most common class
    most_common = y_train.mode()[0]
    baseline_acc = (y_test == most_common).mean()
    print(f"\n  Baseline (always predict {['Down', 'Stable', 'Up'][most_common]}): {baseline_acc:.1%}")

    # Select best by F1 score (better for imbalanced classes)
    best = max(results, key=lambda x: x[3])
    print(f"\n  >>> Best: {best[0]} (Accuracy: {best[2]:.1%}, F1: {best[3]:.3f})")
    print(f"      Improvement over baseline: {(best[2] - baseline_acc)*100:+.1f}%")

    # Print classification report for best model.
    # labels=[0, 1, 2] pins the report to all three classes: without it the
    # call raises when a class is absent from both the test labels and the
    # predictions, because target_names would no longer match.
    best_pred = best[1].predict(X_test_scaled)
    print(f"\n  Classification Report ({best[0]}):")
    print(classification_report(
        y_test, best_pred,
        labels=[0, 1, 2],
        target_names=['Down', 'Stable', 'Up'],
        zero_division=0,
    ))

    return best[1], scaler, best[2], best[3]

# Train with class weights
# Both horizons use the full (correlation-filtered) feature matrix X; each call
# returns the winning classifier, its scaler, and its test accuracy / F1.
dir_clf_1h, dir_scaler_1h, dir_acc_1h, dir_f1_1h = train_direction_model(X, y_dir_1h, '1h', use_class_weights=True)
dir_clf_4h, dir_scaler_4h, dir_acc_4h, dir_f1_4h = train_direction_model(X, y_dir_4h, '4h', use_class_weights=True)

# Summary of the held-out scores for both horizons
print("\n" + "="*60)
print("Direction classifiers trained successfully")
print(f"  1h: Accuracy={dir_acc_1h:.1%}, F1={dir_f1_1h:.3f}")
print(f"  4h: Accuracy={dir_acc_4h:.1%}, F1={dir_f1_4h:.3f}")
print("="*60)

In [None]:
# Train Spike Detectors
from sklearn.ensemble import GradientBoostingClassifier

# Section banner for the spike-detector stage.
print("\n" + "=" * 60, "TRAINING SPIKE DETECTORS", "=" * 60, sep="\n")

def train_spike_detector(X, y_target, current_gas, horizon_name):
    """Train a three-class spike classifier for one prediction horizon.

    Rows are labelled by the absolute relative change between the target
    price and the current price:
        0 = Normal   (|change| < 50%)
        1 = Elevated (50% <= |change| < 100%)
        2 = Spike    (|change| >= 100%)

    Args:
        X: Feature DataFrame. The train/test split is positional (first 80%
            of rows train, remaining rows test), so rows are expected to be
            in chronological order.
        y_target: Series of target gas prices for this horizon; expected to
            be row-aligned with ``X``.
        current_gas: Series of current gas prices used as the reference.
        horizon_name: Label used in the progress output (e.g. '1h').

    Returns:
        Tuple ``(clf, scaler)``: the fitted GradientBoostingClassifier and the
        RobustScaler that was fitted on the training rows only.
    """
    print(f"\nTraining {horizon_name} spike detector...")

    # Relative change from the current price; the epsilon guards against a
    # zero denominator.
    price_change_pct = (y_target - current_gas) / (current_gas + 1e-8)

    # A NaN change fails both threshold comparisons below and would otherwise
    # fall through to the Spike label. Drop such rows (positionally, from both
    # the features and the change series) before labelling.
    valid_rows = price_change_pct.notna().to_numpy()
    if not valid_rows.all():
        print(f"  Dropping {int((~valid_rows).sum())} rows with undefined price change")
        X = X.iloc[valid_rows]
        price_change_pct = price_change_pct.iloc[valid_rows]

    # Normal: < 50% change, Elevated: 50-100%, Spike: > 100%
    # Vectorised: the first matching condition wins, everything else is Spike.
    abs_change = price_change_pct.abs().to_numpy()
    y_class = pd.Series(
        np.select([abs_change < 0.5, abs_change < 1.0], [0, 1], default=2),
        index=price_change_pct.index,
    )

    # Class distribution
    class_counts = y_class.value_counts().sort_index()
    print(f"  Classes: Normal={class_counts.get(0,0)}, Elevated={class_counts.get(1,0)}, Spike={class_counts.get(2,0)}")

    # Split (time-series): no shuffling, the last 20% of rows are held out.
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y_class.iloc[:split_idx], y_class.iloc[split_idx:]

    # Scale: fit on the training rows only, then apply to the held-out rows.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train
    clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
    clf.fit(X_train_scaled, y_train)

    # NOTE(review): this is plain accuracy; if Elevated/Spike rows are rare it
    # is dominated by the Normal class - consider also reporting per-class recall.
    accuracy = clf.score(X_test_scaled, y_test)
    print(f"  Accuracy: {accuracy:.1%}")

    return clf, scaler

# One detector (and its scaler) per horizon, all trained on the same feature
# matrix and the same current-price reference.
spike_1h, spike_scaler_1h = train_spike_detector(
    X, y_1h, current_gas, horizon_name='1h'
)
spike_4h, spike_scaler_4h = train_spike_detector(
    X, y_4h, current_gas, horizon_name='4h'
)
spike_24h, spike_scaler_24h = train_spike_detector(
    X, y_24h, current_gas, horizon_name='24h'
)

In [None]:
# Save all models with feature importance and confidence info
import os
from datetime import datetime

# Directory that receives every artifact written below; reused if present.
os.makedirs('saved_models', exist_ok=True)

print("\n" + "=" * 60, "SAVING MODELS", "=" * 60, sep="\n")

# Get feature importance from best model if available
feature_importance = {}
try:
    # Try to get from 4h model (usually RandomForest): take the first
    # candidate in all_4h that exposes feature_importances_.
    for name, model, metrics, scaler in all_4h:
        if hasattr(model, 'feature_importances_'):
            feature_importance = dict(zip(features_4h_used, model.feature_importances_))
            break
except Exception as exc:
    # Best-effort: importance is optional, so keep the empty dict and carry on.
    # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and the message shows why importance is missing.
    print(f"Feature importance unavailable ({type(exc).__name__}: {exc})")

# Persist one bundle per horizon: the model plus the metadata stored with it,
# and the scaler again as a standalone file. The 24h entry reuses the 4h
# feature list.
for horizon, best, features_used in zip(
    ('1h', '4h', '24h'),
    (best_1h, best_4h, best_24h),
    (features_1h_used, features_4h_used, features_4h_used),
):
    name, model, metrics, scaler = best

    model_data = dict(
        model=model,
        model_name=name,
        metrics=metrics,
        trained_at=datetime.now().isoformat(),
        feature_names=features_used,
        feature_scaler=scaler,
        scaler_type='RobustScaler',
        is_ensemble=(name == 'Ensemble'),
        # Only '1h' gets the simpler strategy / 1 hour label; every other
        # horizon (including '24h') is recorded as full_tuned / 4 hours.
        training_strategy={'1h': 'simpler_regularized'}.get(horizon, 'full_tuned'),
        actual_horizon={'1h': '1 hour'}.get(horizon, '4 hours'),
        confidence_method='tree_variance' if hasattr(model, 'estimators_') else 'fixed',
    )

    # Feature importance is attached to the 4h bundle only, when available.
    if feature_importance and horizon == '4h':
        model_data['feature_importance'] = feature_importance

    joblib.dump(model_data, 'saved_models/model_{}.pkl'.format(horizon))
    print("Saved model_{}.pkl ({}, MAE={:.6f}, {})".format(
        horizon, name, metrics['mae'], metrics['vs_baseline']
    ))

    # Save scaler separately
    joblib.dump(scaler, 'saved_models/scaler_{}.pkl'.format(horizon))

# Save spike detectors: each file bundles the classifier with its scaler.
for horizon, clf, scaler in (
    ('1h', spike_1h, spike_scaler_1h),
    ('4h', spike_4h, spike_scaler_4h),
    ('24h', spike_24h, spike_scaler_24h),
):
    spike_data = dict(
        model=clf,
        scaler=scaler,
        trained_at=datetime.now().isoformat(),
    )
    joblib.dump(spike_data, 'saved_models/spike_detector_{}.pkl'.format(horizon))
    print("Saved spike_detector_{}.pkl".format(horizon))

# Save feature names (all features for compatibility)
joblib.dump(features, 'saved_models/feature_names.pkl')
print("Saved feature_names.pkl ({} features)".format(len(features)))

# Save training metadata with full info
import json
metadata = {
    'training_timestamp': datetime.now().isoformat(),
    'total_samples': len(df_clean),
    'date_range': "{} to {}".format(df_clean.index.min(), df_clean.index.max()),
    'num_segments_used': len(good_segments),
    'features': {
        'total': len(features),
        '1h_specific': len(features_1h_used),
        '4h_specific': len(features_4h_used),
    },
    'baselines': BASELINES,
    # Every price model is summarised with the same fields, so the entries are
    # generated from (key, best tuple, horizon text, strategy text) rows.
    # best tuple layout: index 0 is the model name, index 2 the metrics dict.
    'models': {
        horizon_key: {
            'name': best_entry[0],
            'r2': float(best_entry[2]['r2']),
            'mae': float(best_entry[2]['mae']),
            'vs_baseline': best_entry[2]['vs_baseline'],
            'improvement_pct': float(best_entry[2]['improvement']),
            'actual_horizon': horizon_text,
            'training_strategy': strategy_text,
            'directional_accuracy': float(best_entry[2]['directional_accuracy']),
        }
        for horizon_key, best_entry, horizon_text, strategy_text in (
            ('1h', best_1h, '1 hour',
             'simpler models with strong regularization (Ridge, ElasticNet, Huber)'),
            ('4h', best_4h, '4 hours',
             'full model suite with hyperparameter tuning'),
            ('24h', best_24h, '4 hours (labeled as 24h due to data limitations)',
             'same as 4h model'),
        )
    },
    'direction_models': {
        horizon_key: {'accuracy': float(dir_accuracy), 'f1_score': float(dir_f1)}
        for horizon_key, dir_accuracy, dir_f1 in (
            ('1h', dir_acc_1h, dir_f1_1h),
            ('4h', dir_acc_4h, dir_f1_4h),
        )
    },
    'improvements_applied': [
        'Micro-features (5min, 15min, 30min) for 1h prediction',
        'Time features (hour, day, peak hours)',
        'Simpler models (Ridge, ElasticNet, Huber) for 1h',
        'Hyperparameter tuning (RandomizedSearchCV) for 4h',
        'Class-weighted direction classification',
        'Confidence scoring based on tree variance',
        'Percentage-based direction thresholds (2%)',
        'Drift baseline comparison',
        'Stricter correlation filtering (0.90)',
    ],
}

# Write the report next to the model files.
with open('saved_models/training_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("Saved training_metadata.json")

# Save feature importance separately
if feature_importance:
    # Sort by importance (descending) and coerce each value to a built-in
    # float: the json module cannot serialise NumPy scalars such as float32 or
    # int32, which feature_importances_ may contain depending on the estimator.
    # Building the dict before opening the file avoids leaving a truncated
    # JSON file behind if the conversion fails.
    sorted_importance = {
        feature_name: float(score)
        for feature_name, score in sorted(
            feature_importance.items(), key=lambda x: x[1], reverse=True
        )
    }
    with open('saved_models/feature_importance.json', 'w') as f:
        json.dump(sorted_importance, f, indent=2)
    print(f"Saved feature_importance.json (top features: {list(sorted_importance.keys())[:5]})")

In [None]:
# Print final report
# Report header.
print(f"\n{'=' * 70}")
print("TRAINING COMPLETE - FINAL REPORT")
print("=" * 70)

# Size of the training set and of each feature list.
print("\nDATA SUMMARY")
print("   Total samples used: {:,}".format(len(df_clean)))
print("   Segments combined: {}".format(len(good_segments)))
print("   Features (all): {}".format(len(features)))
print("   Features (1h): {}".format(len(features_1h_used)))
print("   Features (4h): {}".format(len(features_4h_used)))
print("   Date range: {} to {}".format(df_clean.index.min(), df_clean.index.max()))

# Baseline MAEs per horizon (the drift baseline is reported for 1h only).
print("\nBASELINE COMPARISON")
print("   1h Baselines:")
print("      Naive (current price): {:.6f}".format(BASELINES['1h']['naive_mae']))
print("      Mean (average):        {:.6f}".format(BASELINES['1h']['mean_mae']))
print("      Drift (trend):         {:.6f}".format(BASELINES['1h']['drift_mae']))
print("      Best baseline:         {:.6f}".format(BASELINES['1h']['best']))
print("   4h Baselines:")
print("      Naive (current price): {:.6f}".format(BASELINES['4h']['naive_mae']))
print("      Mean (average):        {:.6f}".format(BASELINES['4h']['mean_mae']))
print("      Best baseline:         {:.6f}".format(BASELINES['4h']['best']))

# Price-model table. Column widths in each row must match the header: a
# format width counts the trailing '%' sign, so the percentage column uses
# the same width (8) as its 'Dir Acc' header.
print(f"\n" + "-"*70)
print(f"{'PRICE PREDICTION MODELS':^70}")
print("-"*70)
print(f"{'Horizon':<8} {'Model':<18} {'MAE':>10} {'R²':>8} {'vs Baseline':>13} {'Dir Acc':>8}")
print("-"*70)

for horizon, best in [('1h', best_1h), ('4h', best_4h), ('24h*', best_24h)]:
    name = best[0][:17]  # truncate so the name fits the 18-character column
    metrics = best[2]
    print(f"{horizon:<8} {name:<18} {metrics['mae']:>10.6f} {metrics['r2']:>8.4f} {metrics['vs_baseline']:>13} {metrics['directional_accuracy']:>8.1%}")

print("-"*70)
print("* 24h model = 4h model (insufficient data for true 24h prediction)")

# Direction-classifier table; the accuracy column width (10) matches its header.
print(f"\n" + "-"*70)
print(f"{'DIRECTION CLASSIFICATION':^70}")
print("-"*70)
print(f"{'Horizon':<8} {'Accuracy':>10} {'F1 Score':>10} {'Classes':<30}")
print("-"*70)
print(f"{'1h':<8} {dir_acc_1h:>10.1%} {dir_f1_1h:>10.3f} {'Down / Stable / Up':<30}")
print(f"{'4h':<8} {dir_acc_4h:>10.1%} {dir_f1_4h:>10.3f} {'Down / Stable / Up':<30}")
print("-"*70)

print("\nKEY INSIGHTS")
# 1h model: 'improvement' is the percentage gain over the best baseline.
if not (best_1h[2]['improvement'] > 0):
    print("   1h: ML is {:.1f}% worse than baseline".format(abs(best_1h[2]['improvement'])))
    print("       Short-term gas prices are very noisy. Need more data.")
else:
    print("   1h: ML beats baseline by {:.1f}% - model is learning!".format(best_1h[2]['improvement']))

# 4h model
if not (best_4h[2]['improvement'] > 0):
    print("   4h: ML is {:.1f}% worse than baseline".format(abs(best_4h[2]['improvement'])))
else:
    print("   4h: ML beats baseline by {:.1f}% - good performance!".format(best_4h[2]['improvement']))

# Direction: only mentioned when accuracy clears the 40% mark.
if dir_acc_1h > 0.4:
    print("   Direction 1h: {:.1%} accuracy (>40% = useful signal)".format(dir_acc_1h))
if dir_acc_4h > 0.4:
    print("   Direction 4h: {:.1%} accuracy (>40% = useful signal)".format(dir_acc_4h))

print("\nIMPROVEMENTS APPLIED")
print("   - Micro-features (5/15/30min windows) for 1h prediction")
print("   - Simpler models (Ridge, ElasticNet, Huber) for noisy 1h data")
print("   - Hyperparameter tuning (RandomizedSearchCV) for 4h")
print("   - Class-weighted direction classification")
print("   - Peak hours time features (14:00-22:00 UTC)")
print("   - Acceleration features (rate of change of momentum)")
print("   - Confidence scoring based on tree variance")

print("\nFILES SAVED")
print("   - model_1h.pkl, model_4h.pkl, model_24h.pkl (prediction models)")
print("   - scaler_1h.pkl, scaler_4h.pkl, scaler_24h.pkl (feature scalers)")
print("   - spike_detector_1h/4h/24h.pkl (spike classifiers)")
print(f"   - feature_names.pkl ({len(features)} features)")
# feature_importance.json is only written when an importance dict was
# collected, so list it only in that case instead of unconditionally.
if feature_importance:
    print("   - feature_importance.json (sorted by importance)")
print("   - training_metadata.json (full training report)")

print("\n" + "="*70)

In [None]:
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# 2x3 grid: prediction scatters and model comparison on the top row,
# diagnostics on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Gweizy Model Training Results', fontsize=14, fontweight='bold')

# 1. Actual vs Predicted (1h); the dashed diagonal marks a perfect prediction.
ax1 = axes[0, 0]
diag_1h = [actual_1h.min(), actual_1h.max()]
ax1.scatter(actual_1h.values, pred_1h, alpha=0.5, s=10)
ax1.plot(diag_1h, diag_1h, 'r--', label='Perfect')
ax1.set(
    xlabel='Actual Gas Price',
    ylabel='Predicted',
    title=f'1h Prediction (R²={best_1h[2]["r2"]:.3f})',
)
ax1.legend()

# 2. Actual vs Predicted (4h)
ax2 = axes[0, 1]
diag_4h = [actual_4h.min(), actual_4h.max()]
ax2.scatter(actual_4h.values, pred_4h, alpha=0.5, s=10)
ax2.plot(diag_4h, diag_4h, 'r--', label='Perfect')
ax2.set(
    xlabel='Actual Gas Price',
    ylabel='Predicted',
    title=f'4h Prediction (R²={best_4h[2]["r2"]:.3f})',
)
ax2.legend()

# 3. Model Comparison (MAE): green bars beat the best 1h baseline, red do not.
ax3 = axes[0, 2]
baseline_mae_1h = BASELINES['1h']['best']
models_1h = []
maes_1h = []
for result_1h in all_1h:
    models_1h.append(result_1h[0])
    maes_1h.append(result_1h[2]['mae'])
colors = ['green' if mae < baseline_mae_1h else 'red' for mae in maes_1h]
bars = ax3.barh(models_1h, maes_1h, color=colors, alpha=0.7)
ax3.axvline(baseline_mae_1h, color='blue', linestyle='--', label=f'Baseline: {baseline_mae_1h:.4f}')
ax3.set(xlabel='MAE', title='1h Model Comparison')
ax3.legend()

# 4. Residuals Distribution (1h); the dashed line marks zero error.
ax4 = axes[1, 0]
residuals_1h = actual_1h.values - pred_1h
ax4.hist(residuals_1h, bins=50, alpha=0.7, edgecolor='black')
ax4.axvline(0, color='red', linestyle='--')
ax4.set(
    xlabel='Residual (Actual - Predicted)',
    ylabel='Frequency',
    title=f'1h Residuals (mean={np.mean(residuals_1h):.4f})',
)

# 5. Time Series Sample: the first (at most) 200 test points.
ax5 = axes[1, 1]
sample_size = min(200, len(actual_1h))
sample_positions = range(sample_size)
ax5.plot(sample_positions, actual_1h.values[:sample_size], label='Actual', alpha=0.8)
ax5.plot(sample_positions, pred_1h[:sample_size], label='Predicted', alpha=0.8)
ax5.set(
    xlabel='Time (samples)',
    ylabel='Gas Price',
    title='1h: Actual vs Predicted (Time Series)',
)
ax5.legend()

# 6. Feature Importance (top 10), or a placeholder when none was collected.
ax6 = axes[1, 2]
if not feature_importance:
    ax6.text(0.5, 0.5, 'Feature importance\nnot available', ha='center', va='center')
    ax6.set_title('Feature Importance')
else:
    ranked_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
    sorted_imp = dict(ranked_features[:10])
    ax6.barh(list(sorted_imp), list(sorted_imp.values()), color='steelblue')
    ax6.set(xlabel='Importance', title='Top 10 Features (4h model)')

# Write the figure next to the model artifacts, then render it inline.
plt.tight_layout()
plt.savefig('saved_models/training_results.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved training_results.png")

In [None]:
# Create zip file for download
import shutil

# Bundle everything under saved_models/ into gweizy_models.zip.
shutil.make_archive(base_name='gweizy_models', format='zip', root_dir='saved_models')
print(
    "\n✅ Created gweizy_models.zip",
    "\nDownload this file and extract to: backend/models/saved_models/",
    sep="\n",
)

# Auto-download
files.download('gweizy_models.zip')