# Base Gas Fee Prediction - Model Training
## LSTM + Prophet Model Training with GPU Acceleration

This notebook trains time-series models for gas fee prediction using 6 months of historical data.

**Hardware Recommendation**: Use GPU runtime (T4 or better)
- Runtime → Change runtime type → Hardware accelerator → GPU

## 1. Setup Environment

In [None]:
# Install required packages
!pip install -q tensorflow prophet scikit-learn pandas numpy

In [None]:
# Report the TensorFlow build and whether a GPU is visible to it
import tensorflow as tf

gpu_devices = tf.config.list_physical_devices('GPU')

print("TensorFlow version:", tf.__version__)
print("GPU Available:", len(gpu_devices) > 0)
print("GPU Devices:", gpu_devices)

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from prophet import Prophet
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import json
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings raised by the libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

print("‚úì All imports successful")

## 2. Upload Database

**IMPORTANT**: Upload your `gas_data.db` file from:
`/Users/rodan/Documents/gasFeesPrediction-main/backend/gas_data.db`

Click the folder icon on the left → Upload button

In [None]:
from google.colab import files
import os
import sqlite3

# Check if database exists
db_file = 'gas_data.db'

if not os.path.exists(db_file):
    print("‚ö†Ô∏è  Database not found!")
    print("\nPlease upload 'gas_data.db' from your local machine:")
    print("/Users/rodan/Documents/gasFeesPrediction-main/backend/gas_data.db")
    print("\nUploading now...")
    uploaded = files.upload()

    # Check if uploaded file needs renaming
    if db_file not in uploaded and len(uploaded) > 0:
        uploaded_name = list(uploaded.keys())[0]
        if uploaded_name.endswith('.db'):
            os.rename(uploaded_name, db_file)
            print(f"‚úì Renamed {uploaded_name} to gas_data.db")

    # sqlite3.connect() creates an empty database file when the path does not
    # exist. Without this check a cancelled or wrong upload would leave an empty
    # gas_data.db behind, and the next run of this cell would report it as found.
    if not os.path.exists(db_file):
        raise FileNotFoundError(
            f"'{db_file}' was not uploaded (expected a .db file). "
            "Re-run this cell and upload the database."
        )
else:
    print("‚úì Database file found")

# Verify database (best effort: problems are reported, not raised)
conn = None
try:
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()

    # Check tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(f"\n‚úì Database tables: {[t[0] for t in tables]}")

    # Check record count
    cursor.execute("SELECT COUNT(*) FROM gas_prices")
    count = cursor.fetchone()[0]
    print(f"‚úì Total gas price records: {count:,}")

    cursor.execute("SELECT MIN(timestamp), MAX(timestamp) FROM gas_prices")
    date_range = cursor.fetchone()
    print(f"‚úì Date range: {date_range[0]} to {date_range[1]}")

    print("\n‚úì Database verified successfully!")
except Exception as e:
    print(f"\n‚ùå Error verifying database: {e}")
    print("Please make sure you uploaded the correct gas_data.db file")
finally:
    # Close the connection on the error path too, so the file handle is not leaked
    if conn is not None:
        conn.close()

## 3. Load and Prepare Data

In [None]:
# Connect to database
conn = sqlite3.connect('gas_data.db')

# Load historical gas prices
query = """
SELECT timestamp, current_gas as gas_price, block_number
FROM gas_prices
ORDER BY timestamp ASC
"""

# Close the connection even when the query fails (e.g. missing table)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Every later cell (features, split, training) needs at least some rows;
# fail here with a clear message instead of deep inside model training.
if df.empty:
    raise ValueError("gas_prices returned no rows - nothing to train on")

df['timestamp'] = pd.to_datetime(df['timestamp'])

print(f"‚úì Loaded {len(df):,} records")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Duration: {(df['timestamp'].max() - df['timestamp'].min()).days} days")
print(f"\nGas price stats:")
print(df['gas_price'].describe())

In [None]:
# Create features
def create_features(df):
    """Return a copy of ``df`` with calendar, rolling-window and lag features added.

    Reads a datetime ``timestamp`` column and a numeric ``gas_price`` column.
    Rolling and lag features are computed over row positions, so rows are
    assumed to be in chronological order and evenly spaced
    (presumably 5 minutes apart - TODO confirm against the data collector).

    Added columns, in this order:
      * ``hour``, ``day_of_week``, ``day_of_month``, ``month``, ``is_weekend``
      * ``rolling_{mean,std,min,max}_{w}`` for w in 12, 24, 48, 168 (trailing rows)
      * ``lag_{k}`` for k in 1, 2, 3, 6, 12, 24

    The input frame is not modified.
    """
    df = df.copy()
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['day_of_month'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)  # dayofweek: 5 = Sat, 6 = Sun

    prices = df['gas_price']

    # Rolling statistics over trailing windows.
    # At 5-minute spacing these are 1h, 2h, 4h and 14h (168 rows is NOT one week;
    # a week would be 2016 rows). The window sizes are kept because the
    # column names are part of the trained models' feature set.
    for window in [12, 24, 48, 168]:
        trailing = prices.rolling(window=window, min_periods=1)
        df[f'rolling_mean_{window}'] = trailing.mean()
        # std of a single observation is NaN; treat it as zero spread
        df[f'rolling_std_{window}'] = trailing.std().fillna(0)
        df[f'rolling_min_{window}'] = trailing.min()
        df[f'rolling_max_{window}'] = trailing.max()

    # Lag features.
    # The first `lag` rows have no history. Fill them with the earliest observed
    # price rather than the mean of the whole series: the global mean includes
    # future (test-period) values and would leak them into the training rows.
    first_price = prices.iloc[0] if len(prices) else np.nan
    for lag in [1, 2, 3, 6, 12, 24]:
        df[f'lag_{lag}'] = prices.shift(lag).fillna(first_price)

    return df

# Rebind `df` to the feature-augmented copy returned by create_features.
df = create_features(df)
# The count and list below cover every column of `df` (the columns loaded from
# the database as well as the derived ones), not only the newly added features.
print(f"‚úì Created {len(df.columns)} features")
print(f"Features: {list(df.columns)}")

## 4. Train-Test Split

In [None]:
# Chronological split: the first 80% of rows train the models, the last 20% test them
split_idx = int(0.8 * len(df))
train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()

print(f"‚úì Training set: {len(train_df):,} records ({train_df['timestamp'].min()} to {train_df['timestamp'].max()})")
print(f"‚úì Test set: {len(test_df):,} records ({test_df['timestamp'].min()} to {test_df['timestamp'].max()})")

## 5. LSTM Model Training

In [None]:
def prepare_lstm_data(df, feature_cols, lookback=24, horizon=12, scaler=None):
    """Build sliding-window inputs and future targets for LSTM training.

    For every index ``i`` in ``range(lookback, len(df) - horizon)`` one sample is
    produced: the scaled feature rows ``i - lookback .. i - 1`` as input and the
    *unscaled* ``gas_price`` of row ``i + horizon`` as target.

    Args:
        df: DataFrame containing ``feature_cols`` and a ``gas_price`` column.
        feature_cols: names of the columns used as model inputs.
        lookback: number of consecutive rows in each input window.
        horizon: row offset from ``i`` (one past the window's last row) to the target.
        scaler: optional, already-fitted scaler exposing ``transform``. When given
            it is reused as-is and never re-fitted, so evaluation data can be
            scaled with the statistics learned from the training data. When
            ``None`` (the default) a new ``MinMaxScaler`` is fitted on ``df``.

    Returns:
        ``(X, y, scaler)`` where ``X`` has shape ``(n_samples, lookback, n_features)``,
        ``y`` has shape ``(n_samples,)`` and ``scaler`` is the scaler that was used.
        With too few rows for a single sample, correctly shaped empty arrays
        are returned.
    """
    if scaler is None:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(df[feature_cols])
    else:
        scaled_features = scaler.transform(df[feature_cols])
    scaled_features = np.asarray(scaled_features)

    # Pull the targets out once; per-row .iloc lookups inside the loop are slow
    targets = df['gas_price'].to_numpy()

    sample_positions = range(lookback, len(scaled_features) - horizon)
    if len(sample_positions) == 0:
        n_features = scaled_features.shape[1]
        return np.empty((0, lookback, n_features)), np.empty((0,)), scaler

    X = np.stack([scaled_features[i - lookback:i] for i in sample_positions])
    # Targets are rows lookback + horizon .. len - 1, i.e. i + horizon for each i above
    y = targets[lookback + horizon:].copy()

    return X, y, scaler

def build_lstm_model(input_shape):
    """Build and compile a stacked Bidirectional LSTM regressor.

    Architecture: three Bidirectional LSTM layers (128, 64 and 32 units) with
    dropout between them, followed by Dense(32) and Dense(16) ReLU layers and
    a single linear output unit.

    Args:
        input_shape: shape of one sample without the batch axis; the training
            cell passes ``(X_train.shape[1], X_train.shape[2])``.

    Returns:
        A ``tf.keras.Sequential`` model compiled with Adam (learning rate 0.001),
        MSE loss and MAE as an additional metric.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input layer; passing `input_shape`
        # to the first layer is deprecated in current Keras releases.
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(128, return_sequences=True)
        ),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)
        ),
        tf.keras.layers.Dropout(0.3),
        # The last recurrent layer returns only its final state
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(32)
        ),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )

    return model

print("‚úì LSTM functions defined")

In [None]:
# Feature columns for LSTM (every column except the timestamp and block number)
feature_cols = [col for col in df.columns if col not in ['timestamp', 'block_number']]

# Training parameters
horizons = {
    '1h': 12,   # 12 * 5min = 1 hour
    '4h': 48,   # 48 * 5min = 4 hours
    '24h': 288  # 288 * 5min = 24 hours
}
LOOKBACK = 24               # rows in each input window
VALIDATION_FRACTION = 0.1   # share of the training sequences held out for early stopping

lstm_models = {}
lstm_scalers = {}
lstm_results = {}


def _windows_with_fitted_scaler(frame, cols, fitted_scaler, lookback, horizon):
    """Window `frame` the same way prepare_lstm_data does, reusing a fitted scaler.

    Returns (X, y): scaled feature windows of `lookback` rows and the unscaled
    gas_price found `horizon` rows after each window's end index.
    """
    scaled = fitted_scaler.transform(frame[cols])
    targets = frame['gas_price'].to_numpy()
    positions = range(lookback, len(scaled) - horizon)
    X = np.array([scaled[i - lookback:i] for i in positions])
    y = np.array([targets[i + horizon] for i in positions])
    return X, y


for horizon_name, horizon_steps in horizons.items():
    print(f"\n{'='*60}")
    print(f"Training LSTM for {horizon_name} horizon ({horizon_steps} steps)")
    print(f"{'='*60}\n")

    # Prepare data.
    # Called this way, prepare_lstm_data fits a scaler on the frame it is given.
    # The test windows are therefore built with that training-fitted scaler
    # (the one that is saved and used at inference time) instead of fitting a
    # second scaler on the test data.
    X_train, y_train, scaler = prepare_lstm_data(
        train_df, feature_cols, lookback=LOOKBACK, horizon=horizon_steps
    )
    X_test, y_test = _windows_with_fitted_scaler(
        test_df, feature_cols, scaler, LOOKBACK, horizon_steps
    )

    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"Not enough rows for the {horizon_name} horizon: each split needs more "
            f"than {LOOKBACK + horizon_steps} rows "
            f"(train={len(train_df)}, test={len(test_df)})"
        )

    print(f"Training samples: {len(X_train):,}")
    print(f"Test samples: {len(X_test):,}")
    print(f"Input shape: {X_train.shape}")

    # Build and train model
    model = build_lstm_model((X_train.shape[1], X_train.shape[2]))

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6
    )

    print("\nTraining LSTM model...")
    # Early stopping and LR scheduling must not look at the test set, otherwise
    # the metrics reported below are tuned on the data they are measured on.
    # validation_split holds out the LAST fraction of the (chronological)
    # training sequences, taken before Keras shuffles the rest.
    history = model.fit(
        X_train, y_train,
        validation_split=VALIDATION_FRACTION,
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Evaluate on the untouched test set
    y_pred = model.predict(X_test, verbose=0).flatten()

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Directional accuracy: how often consecutive predictions move in the same
    # direction (up vs. not up) as consecutive actual values
    actual_direction = np.diff(y_test) > 0
    pred_direction = np.diff(y_pred) > 0
    directional_accuracy = np.mean(actual_direction == pred_direction)

    lstm_results[horizon_name] = {
        'mae': float(mae),
        'rmse': float(rmse),
        'r2_score': float(r2),
        'directional_accuracy': float(directional_accuracy)
    }

    print(f"\n‚úì LSTM {horizon_name} Results:")
    print(f"  MAE: {mae:.6f}")
    print(f"  RMSE: {rmse:.6f}")
    print(f"  R¬≤ Score: {r2:.4f} ({r2*100:.1f}%)")
    print(f"  Directional Accuracy: {directional_accuracy:.4f} ({directional_accuracy*100:.1f}%)")

    # Store model and scaler
    lstm_models[horizon_name] = model
    lstm_scalers[horizon_name] = scaler

print(f"\n{'='*60}")
print("‚úì All LSTM models trained successfully")
print(f"{'='*60}")

## 6. Prophet Model Training

In [None]:
prophet_models = {}
prophet_results = {}

# Prepare data for Prophet (columns renamed to 'ds' / 'y')
prophet_df = train_df[['timestamp', 'gas_price']].rename(
    columns={'timestamp': 'ds', 'gas_price': 'y'}
)
test_prophet = test_df[['timestamp', 'gas_price']].rename(
    columns={'timestamp': 'ds', 'gas_price': 'y'}
)

# Neither the fit nor the evaluation below depends on the horizon, so the model
# is trained and scored once and shared by every horizon key, instead of
# refitting an identical model on identical data for each horizon.
# NOTE(review): as a consequence all horizons report the same Prophet metrics
# (measured over the whole test period); they are not horizon-specific.
prophet_model = Prophet(
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10.0,
    daily_seasonality=True,
    weekly_seasonality=True,
    yearly_seasonality=False
)

print("Training Prophet model...")
prophet_model.fit(prophet_df)

# Make predictions on test set
forecast = prophet_model.predict(test_prophet[['ds']])
y_pred = forecast['yhat'].values
y_test = test_prophet['y'].values

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Directional accuracy: agreement of up / not-up moves between consecutive points
actual_direction = np.diff(y_test) > 0
pred_direction = np.diff(y_pred) > 0
directional_accuracy = np.mean(actual_direction == pred_direction)

prophet_metrics = {
    'mae': float(mae),
    'rmse': float(rmse),
    'r2_score': float(r2),
    'directional_accuracy': float(directional_accuracy)
}

for horizon_name in horizons:
    print(f"\n{'='*60}")
    print(f"Training Prophet for {horizon_name} horizon")
    print(f"{'='*60}\n")

    # Separate dict per horizon so later edits to one entry cannot alias another
    prophet_results[horizon_name] = dict(prophet_metrics)

    print(f"\n‚úì Prophet {horizon_name} Results:")
    print(f"  MAE: {mae:.6f}")
    print(f"  RMSE: {rmse:.6f}")
    print(f"  R¬≤ Score: {r2:.4f} ({r2*100:.1f}%)")
    print(f"  Directional Accuracy: {directional_accuracy:.4f} ({directional_accuracy*100:.1f}%)")

    prophet_models[horizon_name] = prophet_model

print(f"\n{'='*60}")
print("‚úì All Prophet models trained successfully")
print(f"{'='*60}")

## 7. Results Summary

In [None]:
# Print a per-horizon comparison of both model families
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

for horizon_name in horizons.keys():
    print(f"\n{horizon_name.upper()} HORIZON:")
    print("-" * 80)

    lstm_res = lstm_results[horizon_name]
    prophet_res = prophet_results[horizon_name]

    # Same four metrics, same layout, for each model
    for label, res in (("LSTM", lstm_res), ("Prophet", prophet_res)):
        print(f"\n{label}:")
        print(f"  MAE:                  {res['mae']:.6f}")
        print(f"  RMSE:                 {res['rmse']:.6f}")
        print(f"  R¬≤ Score:             {res['r2_score']:.4f} ({res['r2_score']*100:.1f}%)")
        print(f"  Directional Accuracy: {res['directional_accuracy']:.4f} ({res['directional_accuracy']*100:.1f}%)")

    # Winner = higher average of R2 and directional accuracy; ties go to Prophet
    lstm_score = (lstm_res['r2_score'] + lstm_res['directional_accuracy']) / 2
    prophet_score = (prophet_res['r2_score'] + prophet_res['directional_accuracy']) / 2

    if lstm_score > prophet_score:
        winner = "LSTM"
    else:
        winner = "Prophet"
    print(f"\n  üèÜ Best Model: {winner}")

print("\n" + "="*80)

## 8. Save Models

In [None]:
import os

# Make sure the output directory exists
os.makedirs('models', exist_ok=True)

# LSTM models: one .keras file plus the pickled scaler per horizon
for horizon_name in lstm_models:
    model_path = f'models/lstm_{horizon_name}.keras'
    lstm_models[horizon_name].save(model_path)
    print(f"‚úì Saved LSTM {horizon_name} to {model_path}")

    scaler_path = f'models/lstm_{horizon_name}_scaler.pkl'
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(lstm_scalers[horizon_name], scaler_file)
    print(f"‚úì Saved scaler to {scaler_path}")

# Prophet models: one pickle per horizon
# NOTE(review): Prophet's documentation recommends its own JSON serializer over
# pickle; pickle is kept because the .pkl file names are part of the exported
# artifact set - confirm with whatever loads these files before changing it.
for horizon_name in prophet_models:
    model_path = f'models/prophet_{horizon_name}.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(prophet_models[horizon_name], model_file)
    print(f"‚úì Saved Prophet {horizon_name} to {model_path}")

# Metrics and run metadata
results = dict(
    lstm=lstm_results,
    prophet=prophet_results,
    training_date=datetime.now().isoformat(),
    training_records=len(train_df),
    test_records=len(test_df),
)

with open('models/training_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)
print(f"\n‚úì Saved training results to models/training_results.json")

print("\n" + "="*60)
print("‚úì All models saved successfully!")
print("="*60)

## 9. Download Models

Download all trained models and results to your local machine:

In [None]:
# Zip all models.
# shutil.make_archive writes trained_models.zip from scratch on every run.
# `zip -r` on an already existing archive only adds or updates entries, so
# files left over from an earlier run would stay in the download.
import shutil

from google.colab import files

shutil.make_archive('trained_models', 'zip', root_dir='.', base_dir='models')

# Download
files.download('trained_models.zip')

print("\n‚úì Models packaged and downloaded!")
print("\nExtract the zip file and copy the models/ directory to:")
print("/Users/rodan/Documents/gasFeesPrediction-main/backend/models/")