# 🎯 Gweizy Model Training Notebook

Train ML models for gas price prediction on Google Colab.

**Steps:**
1. Upload your `gas_data.db` file from `backend/gas_data.db`
2. Run all cells (Runtime → Run all)
3. Download trained models zip
4. Extract and copy to `backend/models/saved_models/`
5. Commit and push to deploy!

## 1️⃣ Install Dependencies

In [1]:
%pip install pandas numpy scikit-learn joblib lightgbm sqlalchemy python-dateutil tqdm -q
print("✅ Dependencies installed!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
✅ Dependencies installed!


In [None]:
# Download database directly from Railway APIimport urllib.requestimport osimport timeif not os.path.exists('gas_data.db'):    print("📥 Downloading database from Railway API...")    print("   This may take a minute (file is ~53 MB with 234k records)")    start_time = time.time()    try:        url = "https://basegasfeesml-production.up.railway.app/api/database/download"                # Show progress        def show_progress(block_num, block_size, total_size):            downloaded = block_num * block_size            percent = (downloaded / total_size * 100) if total_size > 0 else 0            mb_downloaded = downloaded / (1024 * 1024)            mb_total = total_size / (1024 * 1024) if total_size > 0 else 0            elapsed = time.time() - start_time            speed = mb_downloaded / elapsed if elapsed > 0 else 0            print(f"\r   Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB) @ {speed:.1f} MB/s", end='', flush=True)                urllib.request.urlretrieve(url, "gas_data.db", reporthook=show_progress)        print()  # New line after progress                file_size_mb = os.path.getsize('gas_data.db') / (1024 * 1024)        elapsed = time.time() - start_time        print(f"✅ Database downloaded! Size: {file_size_mb:.2f} MB (took {elapsed:.1f}s)")                # Verify it's a valid SQLite database        try:            import sqlite3            conn = sqlite3.connect('gas_data.db')            cursor = conn.cursor()            cursor.execute("SELECT COUNT(*) FROM gas_prices")            count = cursor.fetchone()[0]            conn.close()            print(f"✅ Verified: {count:,} records in database")        except Exception as e:            print(f"⚠️  Warning: Could not verify database: {e}")                except Exception as e:        print(f"\n❌ Failed to download: {e}")        print("   Please try:")        print("   1. Check your internet connection")        print("   2. Try again in a few moments")        print("   3. Or upload manually using the cell below")        raiseelse:    file_size_mb = os.path.getsize('gas_data.db') / (1024 * 1024)    print(f"✅ Database already exists! Size: {file_size_mb:.2f} MB")        # Show record count    try:        import sqlite3        conn = sqlite3.connect('gas_data.db')        cursor = conn.cursor()        cursor.execute("SELECT COUNT(*) FROM gas_prices")        count = cursor.fetchone()[0]        conn.close()        print(f"   Records: {count:,}")    except:        pass

## 2️⃣ Upload Database

Upload your `gas_data.db` file from `backend/gas_data.db`

In [None]:
from google.colab import files
import os

# Check if database already exists
if not os.path.exists('gas_data.db'):
    print("📁 Please upload your gas_data.db file from Railway:")
    print("\n💡 How to get the database from Railway:")
    print("   1. Go to Railway dashboard → Your service")
    print("   2. Open the volume/file browser")
    print("   3. Navigate to backend/gas_data.db")
    print("   4. Download the file (should be several MB for 234k records)")
    print("   5. Upload it here:")
    uploaded = files.upload()
    print(f"✅ Uploaded: {list(uploaded.keys())}")
else:
    print("✅ Database already exists!")

# Check file size
if os.path.exists('gas_data.db'):
    file_size_mb = os.path.getsize('gas_data.db') / (1024 * 1024)
    print(f"\n📊 Database file size: {file_size_mb:.2f} MB")
    if file_size_mb < 1:
        print("⚠️ WARNING: File is very small (<1 MB)")
        print("   This suggests the database might be empty or outdated")
        print("   Expected size: ~5-20 MB for 234,000 records")
    elif file_size_mb < 5:
        print("⚠️ File size is smaller than expected")
        print("   Expected: ~5-20 MB for 234,000 records")
        print("   You might have an older database file")
    else:
        print("✅ File size looks reasonable")

ModuleNotFoundError: No module named 'google.colab'

## 3️⃣ Load and Inspect Data

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import datetime, timedelta
import time
import warnings
warnings.filterwarnings('ignore')

start_time = time.time()
def log(msg):
    elapsed = time.time() - start_time
    print(f"[{elapsed:6.1f}s] {msg}")

# Connect to database
engine = create_engine('sqlite:///gas_data.db')

# Load data (note: column is 'current_gas' not 'gas_price')
query = """
SELECT timestamp, current_gas, block_number, base_fee, priority_fee
FROM gas_prices
ORDER BY timestamp DESC
"""

df = pd.read_sql(query, engine)
# Rename to gas_price for consistency with feature engineering code
df = df.rename(columns={'current_gas': 'gas_price'})

log(f"📊 Loaded {len(df):,} records")
log(f"📅 Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
log(f"⛽ Gas price range: {df['gas_price'].min():.6f} to {df['gas_price'].max():.6f} gwei")

# Check database size
if len(df) < 1000:
    log(f"⚠️ WARNING: Only {len(df):,} records loaded!")
    log(f"   Expected ~234,000 records from Railway")
    log(f"   💡 This suggests:")
    log(f"      - Wrong database file uploaded")
    log(f"      - Database file is outdated")
    log(f"      - Database file path is incorrect")
    log(f"   💡 To fix:")
    log(f"      1. Download the latest gas_data.db from Railway")
    log(f"      2. Make sure you're uploading from backend/gas_data.db")
    log(f"      3. Check file size - should be several MB for 234k records")
else:
    log(f"✅ Dataset size looks good: {len(df):,} records")

# Show sample
df.head()

## 4️⃣ Feature Engineering

In [None]:
log("🔧 Starting feature engineering...")

# Helper function for smart NaN handling
def smart_fill(series, method='ffill'):
    """Fill NaN with forward fill, then backward fill, then mean"""
    filled = series.copy()
    if method == 'ffill':
        filled = filled.ffill().bfill()
    else:
        filled = filled.bfill().ffill()
    if filled.isna().any():
        filled = filled.fillna(series.mean() if series.notna().any() else 0)
    return filled

# Sort by timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# Sample if too large (use all data on Colab - we have resources!)
MAX_RECORDS = 100000  # Can handle more on Colab
if len(df) > MAX_RECORDS:
    log(f"⚠️ Sampling {MAX_RECORDS:,} from {len(df):,} records")
    recent = df.tail(MAX_RECORDS // 5)
    older = df.head(len(df) - MAX_RECORDS // 5).sample(MAX_RECORDS - len(recent), random_state=42)
    df = pd.concat([older, recent]).sort_values('timestamp').reset_index(drop=True)
    log(f"✅ Using {len(df):,} records")

# Outlier capping
Q1, Q3 = df['gas_price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 3*IQR, Q3 + 3*IQR
outliers = ((df['gas_price'] < lower) | (df['gas_price'] > upper)).sum()
log(f"⚠️ Capping {outliers:,} outliers ({outliers/len(df)*100:.1f}%)")
df['gas_price'] = df['gas_price'].clip(lower, upper)

# ===================================================================
# 1. TIME FEATURES (no NaN)
# ===================================================================
log("   [1/9] Adding time features...")
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['day_of_month'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# ===================================================================
# 2. BASE FEE FEATURES (use base_fee and priority_fee)
# ===================================================================
log("   [2/9] Adding base fee and priority fee features...")
# Direct features
if 'base_fee' in df.columns and 'priority_fee' in df.columns:
    # Derived fee features
    df['total_fee'] = df['base_fee'] + df['priority_fee']
    df['base_fee_ratio'] = df['base_fee'] / (df['gas_price'] + 1e-8)
    df['priority_fee_ratio'] = df['priority_fee'] / (df['gas_price'] + 1e-8)
    df['base_fee_pct'] = (df['base_fee'] / (df['total_fee'] + 1e-8)) * 100
    
    # Rolling statistics for fees
    for window in [6, 12, 24]:
        df[f'base_fee_ma_{window}'] = df['base_fee'].rolling(window, min_periods=1).mean()
        df[f'priority_fee_ma_{window}'] = df['priority_fee'].rolling(window, min_periods=1).mean()
        df[f'total_fee_ma_{window}'] = df['total_fee'].rolling(window, min_periods=1).mean()
else:
    log("   ⚠️ base_fee or priority_fee not found, skipping fee features")

# ===================================================================
# 3. LAG FEATURES (with smart NaN handling)
# ===================================================================
log("   [3/9] Adding lag features...")
for lag in [1, 3, 6, 12, 24]:
    df[f'gas_lag_{lag}'] = smart_fill(df['gas_price'].shift(lag))

# ===================================================================
# 4. ROLLING STATISTICS (with min_periods=1)
# ===================================================================
log("   [4/9] Adding rolling statistics...")
rolling_features = []
for window in [6, 12, 24, 48]:
    rolling_features.append(pd.DataFrame({
        f'gas_ma_{window}': df['gas_price'].rolling(window, min_periods=1).mean(),
        f'gas_std_{window}': df['gas_price'].rolling(window, min_periods=1).std().fillna(0),
        f'gas_min_{window}': df['gas_price'].rolling(window, min_periods=1).min(),
        f'gas_max_{window}': df['gas_price'].rolling(window, min_periods=1).max(),
        f'gas_range_{window}': df['gas_price'].rolling(window, min_periods=1).max() - 
                              df['gas_price'].rolling(window, min_periods=1).min()
    }))
if rolling_features:
    df = pd.concat([df] + rolling_features, axis=1)

# ===================================================================
# 5. PRICE CHANGE & DERIVED FEATURES
# ===================================================================
log("   [5/9] Adding price change features...")
for period in [1, 6, 12, 24]:
    df[f'gas_pct_change_{period}'] = (df['gas_price'].pct_change(period) * 100).fillna(0)
    df[f'gas_diff_{period}'] = (df['gas_price'].diff(period)).fillna(0)

# Volatility (coefficient of variation)
for window in [6, 12, 24]:
    mean = df['gas_price'].rolling(window, min_periods=1).mean()
    std = df['gas_price'].rolling(window, min_periods=1).std()
    df[f'volatility_{window}'] = (std / (mean + 1e-8)).fillna(0)

# ===================================================================
# 6. STATISTICAL FEATURES (percentiles, skewness, kurtosis)
# ===================================================================
log("   [6/9] Adding statistical features...")
stat_features = []
for window in [12, 24]:
    stat_features.append(pd.DataFrame({
        f'q25_{window}': df['gas_price'].rolling(window, min_periods=1).quantile(0.25),
        f'q50_{window}': df['gas_price'].rolling(window, min_periods=1).quantile(0.50),
        f'q75_{window}': df['gas_price'].rolling(window, min_periods=1).quantile(0.75),
        f'skew_{window}': df['gas_price'].rolling(window, min_periods=1).skew().fillna(0),
        f'kurt_{window}': df['gas_price'].rolling(window, min_periods=1).kurt().fillna(0)
    }))
    # Distance from moving average
    ma = df['gas_price'].rolling(window, min_periods=1).mean()
    stat_features.append(pd.DataFrame({
        f'dist_from_ma_{window}': (df['gas_price'] - ma).fillna(0),
        f'dist_from_ma_pct_{window}': ((df['gas_price'] - ma) / (ma + 1e-8) * 100).fillna(0)
    }))
if stat_features:
    df = pd.concat([df] + stat_features, axis=1)

# ===================================================================
# 7. MOMENTUM INDICATORS (RSI, MACD, trend strength)
# ===================================================================
log("   [7/9] Adding momentum indicators...")

# RSI (Relative Strength Index)
def calculate_rsi(prices, period=14):
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=1).mean()
    rs = gain / (loss.replace(0, np.nan) + 1e-8)
    return (100 - (100 / (1 + rs))).fillna(50)

df['rsi_12'] = calculate_rsi(df['gas_price'], 12)
df['rsi_24'] = calculate_rsi(df['gas_price'], 24)

# MACD-like features (EMA differences)
df['ema_6'] = df['gas_price'].ewm(span=6, adjust=False).mean()
df['ema_12'] = df['gas_price'].ewm(span=12, adjust=False).mean()
df['ema_24'] = df['gas_price'].ewm(span=24, adjust=False).mean()
df['macd_6_12'] = (df['ema_6'] - df['ema_12']).fillna(0)
df['macd_12_24'] = (df['ema_12'] - df['ema_24']).fillna(0)

# Trend strength (linear regression slope)
for window in [12, 24]:
    def calc_trend(x):
        if len(x) < 2:
            return 0
        try:
            return np.polyfit(np.arange(len(x)), x, 1)[0]
        except:
            return 0
    df[f'trend_strength_{window}'] = df['gas_price'].rolling(window, min_periods=2).apply(calc_trend, raw=True).fillna(0)

# Momentum
for period in [6, 12, 24]:
    df[f'momentum_{period}'] = (df['gas_price'] - df['gas_price'].shift(period)).fillna(0)

# ===================================================================
# 8. INTERACTION FEATURES
# ===================================================================
log("   [8/9] Adding interaction features...")
# Time × Price interactions
df['hour_x_gas_price'] = df['hour'] * df['gas_price']
df['weekend_x_gas_price'] = df['is_weekend'] * df['gas_price']

# Business hours indicators
df['is_business_hours'] = ((df['hour'] >= 9) & (df['hour'] <= 17) & (df['is_weekend'] == 0)).astype(int)
df['is_peak_hours'] = ((df['hour'] >= 14) & (df['hour'] <= 18)).astype(int)
df['is_night'] = ((df['hour'] >= 22) | (df['hour'] <= 6)).astype(int)

# Fee × Price interactions (if fees available)
if 'base_fee' in df.columns and 'priority_fee' in df.columns:
    df['base_fee_x_gas_price'] = df['base_fee'] * df['gas_price']
    df['priority_fee_x_gas_price'] = df['priority_fee'] * df['gas_price']
    df['total_fee_x_gas_price'] = df['total_fee'] * df['gas_price']

# ===================================================================
# 9. FINAL NaN CLEANUP (only critical columns)
# ===================================================================
log("   [9/9] Final cleanup...")
initial_len = len(df)
critical_cols = ['gas_price']
if 'base_fee' in df.columns:
    critical_cols.append('base_fee')
if 'priority_fee' in df.columns:
    critical_cols.append('priority_fee')

df = df.dropna(subset=critical_cols)

# Fill any remaining NaN in non-critical features with 0 or median
for col in df.columns:
    if col not in critical_cols + ['timestamp', 'block_number']:
        if df[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].median() if df[col].notna().any() else 0)

log(f"✅ Features created: {len(df):,} samples (dropped {initial_len - len(df):,} rows with missing critical data)")
log(f"   Total features: {len(df.columns)}")
print(f"\n📊 Feature columns ({len(df.columns)}): {list(df.columns)}")

In [None]:
# ===================================================================
# DIAGNOSTIC: NaN Analysis
# ===================================================================
log("🔍 Analyzing NaN values in features...")

nan_counts = df.isna().sum()
nan_pct = (nan_counts / len(df) * 100).round(2)

# Summary
total_nan_features = (nan_counts > 0).sum()
log(f"   Features with NaN: {total_nan_features}/{len(df.columns)}")
log(f"   Total NaN values: {nan_counts.sum():,}")

# Show features with NaN
if total_nan_features > 0:
    log("\n   ⚠️ Features with NaN values:")
    nan_features = nan_counts[nan_counts > 0].sort_values(ascending=False)
    for feat, count in nan_features.items():
        pct = nan_pct[feat]
        log(f"      • {feat}: {count:,} NaN ({pct:.1f}%)")
    
    # High NaN features (>10%)
    high_nan = nan_pct[nan_pct > 10]
    if len(high_nan) > 0:
        log(f"\n   ⚠️ Features with >10% NaN ({len(high_nan)}):")
        for feat, pct in high_nan.sort_values(ascending=False).items():
            log(f"      • {feat}: {pct:.1f}%")
else:
    log("   ✅ No NaN values found in features!")

# Data preservation stats
log(f"\n   📊 Data Preservation:")
log(f"      • Total rows: {len(df):,}")
log(f"      • Rows with all features: {(~df.isna().any(axis=1)).sum():,}")
log(f"      • Rows with any NaN: {df.isna().any(axis=1).sum():,}")
log(f"      • Data completeness: {(~df.isna().any(axis=1)).sum() / len(df) * 100:.1f}%")

print("\n" + "="*60)

## 5️⃣ Create Targets & Prepare Features

In [None]:
# This cell will run AFTER target creation (see cell below)
# It's placed here as a placeholder - the actual diagnostic runs after targets are created
pass

In [None]:
log("🎯 Creating prediction targets...")

# Estimate steps per hour from data
# Ensure timestamp is datetime
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(df['timestamp'])

time_diffs = df['timestamp'].diff().dropna()
if len(time_diffs) == 0:
    log("⚠️ No time differences found, using default: 1 step per hour")
    steps_per_hour = 1
else:
    median_interval = time_diffs.median()
    if pd.isna(median_interval):
        log("⚠️ Could not calculate median interval, using default: 1 step per hour")
        steps_per_hour = 1
    else:
        median_interval_min = median_interval.total_seconds() / 60  # minutes
        if median_interval_min <= 0 or not np.isfinite(median_interval_min):
            log("⚠️ Invalid interval, using default: 1 step per hour")
            steps_per_hour = 1
        else:
            steps_per_hour = max(1, int(60 / median_interval_min))
            log(f"   Detected {steps_per_hour} steps per hour (interval: {median_interval_min:.1f} min)")

# Future price targets - adjust based on available data
max_steps = len(df) - 10  # Leave at least 10 rows for training
log(f"   Available data: {len(df):,} rows, max shift: {max_steps} steps")

horizons = {
    '1h': min(steps_per_hour * 1, max_steps),
    '4h': min(steps_per_hour * 4, max_steps),
    '24h': min(steps_per_hour * 24, max_steps)
}

# If dataset is small, use smaller horizons
if len(df) < 100:
    log(f"   ⚠️ Small dataset detected, using reduced horizons")
    horizons = {
        '1h': min(steps_per_hour * 1, max(1, max_steps // 4)),
        '4h': min(steps_per_hour * 2, max(1, max_steps // 2)),
        '24h': min(steps_per_hour * 6, max_steps)
    }

targets = {}
for name, steps in horizons.items():
    if steps <= 0:
        log(f"   ⚠️ SKIPPING {name}: steps={steps} (too small)")
        continue
    
    future_price = df['gas_price'].shift(-steps)
    pct_change = ((future_price - df['gas_price']) / (df['gas_price'] + 1e-8)) * 100
    targets[name] = {
        'pct_change': pct_change,
        'original': future_price,
        'current': df['gas_price'].copy()
    }
    valid = (~pct_change.isna()).sum()
    log(f"   {name}: {valid:,} valid targets, steps={steps}")
    
    # Diagnostic info
    if valid == 0:
        log(f"      ⚠️ WARNING: No valid targets for {name}!")
        log(f"      Total rows: {len(df):,}, Shift steps: {steps}")
        log(f"      Future price NaN: {(future_price.isna()).sum():,}")
        log(f"      Current price NaN: {(df['gas_price'].isna()).sum():,}")

# Select feature columns
exclude_cols = ['timestamp', 'block_number']
feature_cols = [c for c in df.columns if c not in exclude_cols]
X = df[feature_cols].copy()
log(f"📊 Feature matrix: {X.shape[0]:,} samples, {X.shape[1]} features")

## 6️⃣ Train Models

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Create output directory
os.makedirs('trained_models', exist_ok=True)

results = {}

for horizon in ['1h', '4h', '24h']:
    log(f"\n{'='*60}")
    log(f"🌲 Training model for {horizon} horizon")
    log(f"{'='*60}")
    
    # Check if this horizon was created
    if horizon not in targets:
        log(f"   ⚠️ SKIPPING: {horizon} horizon not available (likely skipped during target creation)")
        continue
    
    y_pct = targets[horizon]['pct_change']
    y_orig = targets[horizon]['original']
    current_prices = targets[horizon]['current']
    
    # Remove NaN
    valid_idx = ~(X.isna().any(axis=1) | y_pct.isna() | y_orig.isna())
    X_clean = X[valid_idx]
    y_pct_clean = y_pct[valid_idx]
    y_orig_clean = y_orig[valid_idx]
    current_clean = current_prices[valid_idx]
    
    log(f"   Valid samples: {len(X_clean):,}")
    
    # Check if we have enough data - MUST check before proceeding
    # Reduced minimum for smaller datasets
    MIN_SAMPLES = 50  # Minimum samples needed for training (reduced from 100)
    if len(X_clean) == 0:
        log(f"   ⚠️ SKIPPING: Zero valid samples for {horizon} horizon")
        log(f"   💡 This means all rows have NaN in features or targets")
        log(f"   💡 Check the diagnostic messages above for details")
        continue
    elif len(X_clean) < MIN_SAMPLES:
        log(f"   ⚠️ SKIPPING: Only {len(X_clean):,} valid samples (need at least {MIN_SAMPLES})")
        log(f"   💡 This might be because:")
        log(f"      - Not enough historical data for {horizon} horizon")
        log(f"      - Too many NaN values after feature engineering")
        log(f"      - Data gaps in timestamp sequence")
        continue
    
    # Train/test split (80/20, temporal)
    split_idx = int(len(X_clean) * 0.8)
    X_train, X_test = X_clean.iloc[:split_idx], X_clean.iloc[split_idx:]
    y_train, y_test = y_pct_clean.iloc[:split_idx], y_pct_clean.iloc[split_idx:]
    y_orig_test = y_orig_clean.iloc[split_idx:]
    current_test = current_clean.iloc[split_idx:]
    
    log(f"   Train: {len(X_train):,}, Test: {len(X_test):,}")
    
    # Additional check after split - MUST check before scaling
    if len(X_train) == 0 or len(X_test) == 0:
        log(f"   ⚠️ SKIPPING: Insufficient samples after split (train={len(X_train):,}, test={len(X_test):,})")
        log(f"   💡 Need at least 1 sample in both train and test sets")
        continue
    
    # Final safety check before scaling
    if X_train.empty or X_test.empty:
        log(f"   ⚠️ SKIPPING: Empty dataframes detected")
        continue
    
    # Scale features
    try:
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    except ValueError as e:
        log(f"   ⚠️ SKIPPING: Error during scaling: {e}")
        continue
    
    # Train RandomForest
    log(f"   Training RandomForest (this may take 1-2 min)...")
    model = RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred_pct = model.predict(X_test_scaled)
    y_pred_orig = current_test.values * (1 + y_pred_pct / 100)
    
    # Metrics
    mae = mean_absolute_error(y_orig_test, y_pred_orig)
    rmse = np.sqrt(mean_squared_error(y_orig_test, y_pred_orig))
    r2 = r2_score(y_orig_test, y_pred_orig)
    mape = np.mean(np.abs((y_orig_test - y_pred_orig) / (y_orig_test + 1e-8))) * 100
    
    # Directional accuracy
    y_diff_actual = np.diff(y_orig_test.values)
    y_diff_pred = np.diff(y_pred_orig)
    dir_acc = np.mean(np.sign(y_diff_actual) == np.sign(y_diff_pred))
    
    log(f"   ✅ R²: {r2:.4f}")
    log(f"   ✅ MAE: {mae:.6f} gwei")
    log(f"   ✅ MAPE: {mape:.2f}%")
    log(f"   ✅ Directional Accuracy: {dir_acc*100:.1f}%")
    
    # Feature importance
    importances = model.feature_importances_
    top_features = sorted(zip(feature_cols, importances), key=lambda x: x[1], reverse=True)[:5]
    log(f"   Top features: {[f[0] for f in top_features]}")
    
    # Save model
    model_data = {
        'model': model,
        'model_name': 'RandomForest_PctChange',
        'feature_scaler': scaler,
        'feature_names': feature_cols,
        'predicts_percentage_change': True,
        'uses_log_scale': False,
        'metrics': {'mae': mae, 'rmse': rmse, 'r2': r2, 'mape': mape, 'directional_accuracy': dir_acc},
        'trained_at': datetime.now().isoformat(),
        'training_samples': len(X_train),
        'feature_importances': dict(zip(feature_cols, importances))
    }
    
    model_path = f'trained_models/model_{horizon}.pkl'
    joblib.dump(model_data, model_path)
    log(f"   💾 Saved to {model_path}")
    
    scaler_path = f'trained_models/scaler_{horizon}.pkl'
    joblib.dump(scaler, scaler_path)
    log(f"   💾 Saved scaler to {scaler_path}")
    
    results[horizon] = model_data['metrics']

log(f"\n{'='*60}")
log("🎉 TRAINING COMPLETE!")
log(f"{'='*60}")

## 7️⃣ Summary & Download

In [None]:
import shutil

print("📊 Model Performance Summary:")
print("="*50)
if len(results) == 0:
    print("\n⚠️ No models were successfully trained!")
    print("   This usually means:")
    print("   - Not enough historical data")
    print("   - Data gaps preventing target creation")
    print("   - Too many NaN values after feature engineering")
    print("\n   💡 Try:")
    print("   - Uploading a database with more historical data")
    print("   - Checking that timestamps are properly formatted")
    print("   - Ensuring gas_price values are valid")
else:
    for horizon, metrics in results.items():
        print(f"\n{horizon}:")
        print(f"  R²: {metrics['r2']:.4f}")
        print(f"  MAE: {metrics['mae']:.6f} gwei")
        print(f"  MAPE: {metrics['mape']:.2f}%")
        print(f"  Directional Accuracy: {metrics['directional_accuracy']*100:.1f}%")

print("\n" + "="*50)
if os.path.exists('trained_models') and len(os.listdir('trained_models')) > 0:
    print("📁 Generated files:")
    for f in os.listdir('trained_models'):
        size = os.path.getsize(f'trained_models/{f}') / 1024 / 1024
        print(f"  • {f} ({size:.1f} MB)")
else:
    print("⚠️ No model files generated")

# Create zip if models exist
if os.path.exists('trained_models') and len(os.listdir('trained_models')) > 0:
    shutil.make_archive('gweizy_models', 'zip', 'trained_models')
    print("\n📦 Created gweizy_models.zip")
    
    # Download
    files.download('gweizy_models.zip')
    print("\n✅ Download started!")
    print("\n📋 Next steps:")
    print("1. Extract gweizy_models.zip")
    print("2. Copy model_*.pkl to backend/models/saved_models/")
    print("3. git add, commit, push")
    print("4. Railway will auto-deploy with new models!")
else:
    print("\n⚠️ Cannot create zip - no models were trained")
    print("   Please check your data and try again")