## Cryptocurrency Forecasting: GBR, SVR, LSTM, TCN


- Data: historical OHLCV 
- Features: technical indicators and lag features
- Validation: expanding-window walk-forward evaluation
- Models: Gradient Boosting Regressor (GBR), Support Vector Regressor (SVR), LSTM, Temporal Convolutional Network (TCN)
- Metrics: RMSE, MAE, MAPE, latency
- Artifacts: saves trained models and a registry JSON for a Streamlit app




1. Run the install cell below; ensure it completes successfully.
2. Optionally mount Google Drive (or plan to download the files locally) so that the artifacts are retained.
3. Execute the end-to-end cells to train all models and save the artifacts into `artifacts/`.
4. Download the `artifacts/` folder into this repository to use it with the Streamlit app.


In [None]:
!pip -q install yfinance ta plotly scikit-learn tensorflow keras-tcn joblib pyyaml


In [None]:
#  DATA COLLECTION & INITIAL INSPECTION ENHANCEMENTS 

def comprehensive_data_inspection(df, symbol):
    """Print a textual overview of an OHLCV frame.

    Reports the shape, index range, row count, per-column missing counts
    (only when any exist), ``describe()`` statistics, close-price extremes,
    mean/std of daily percentage returns and basic volume statistics.

    Args:
        df: DataFrame with at least ``close`` and ``volume`` columns.
        symbol: Label used in the report header.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"\n DATA INSPECTION FOR {symbol} ")

    # Basic info
    first_ts, last_ts = df.index.min(), df.index.max()
    print(f"Dataset shape: {df.shape}")
    print(f"Date range: {first_ts} to {last_ts}")
    print(f"Total trading days: {len(df)}")

    # Missing values analysis (counts are non-negative, so any() == sum() > 0)
    null_counts = df.isnull().sum()
    if null_counts.any():
        print(f"\nMissing values:\n{null_counts}")

    # Statistical summary
    print("\nStatistical Summary:")
    print(df.describe())

    # Price statistics
    print("\nPrice Analysis:")
    closes = df['close']
    print(f"Min price: ${closes.min():.2f}")
    print(f"Max price: ${closes.max():.2f}")
    daily_returns = closes.pct_change()
    print(f"Average daily return: {daily_returns.mean()*100:.4f}%")
    print(f"Volatility (std of returns): {daily_returns.std()*100:.4f}%")

    # Volume analysis
    print("\nVolume Analysis:")
    volumes = df['volume']
    mean_volume = volumes.mean()
    print(f"Average daily volume: {mean_volume:,.0f}")
    print(f"Volume coefficient of variation: {volumes.std()/mean_volume:.2f}")



In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def plot_data_quality_assessment(df, symbol, save_as=None):
    """Build a 3x2 Plotly grid summarising the quality of an OHLCV frame.

    Panels: close price over time, volume over time, histogram of daily
    returns, histogram of volume, price-vs-volume scatter, and a heatmap of
    missing values (columns on the y axis, index on the x axis).

    Args:
        df: DataFrame with at least ``close`` and ``volume`` columns.
        symbol: Label used in the figure title.
        save_as: Optional base name; when truthy the figure is handed to
            ``save_figure``. That helper is defined in a later cell of this
            notebook, so it must already exist when ``save_as`` is used.

    Returns:
        The assembled ``plotly`` figure.
    """
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=['Price Time Series', 'Volume Time Series',
                       'Daily Returns Distribution', 'Volume Distribution',
                       'Price vs Volume Correlation', 'Missing Values Heatmap'],
        specs=[[{}, {}], [{}, {}], [{}, {}]]
    )
    # Price time series
    fig.add_trace(go.Scatter(x=df.index, y=df['close'], name='Close Price'), row=1, col=1)
    # Volume time series
    fig.add_trace(go.Scatter(x=df.index, y=df['volume'], name='Volume'), row=1, col=2)
    # Returns distribution
    returns = df['close'].pct_change().dropna()
    fig.add_trace(go.Histogram(x=returns, name='Returns', nbinsx=50), row=2, col=1)
    # Volume distribution
    fig.add_trace(go.Histogram(x=df['volume'], name='Volume', nbinsx=50), row=2, col=2)
    # Price vs Volume scatter
    fig.add_trace(go.Scatter(x=df['volume'], y=df['close'], mode='markers',
                           name='Price vs Volume', opacity=0.6), row=3, col=1)
    # Missing values heatmap: the panel title was declared but the trace was
    # never added, leaving the last subplot empty. 1 marks a missing cell.
    missing_mask = df.isnull().astype(int)
    fig.add_trace(go.Heatmap(z=missing_mask.T.values, x=df.index,
                             y=[str(c) for c in df.columns],
                             zmin=0, zmax=1, showscale=False, name='Missing'),
                  row=3, col=2)
    fig.update_layout(title=f"{symbol} - Data Quality Assessment", height=900)
    if save_as:
        save_figure(fig, save_as)
    return fig



In [None]:
import os
import json
import math
import time
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import yfinance as yf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

import joblib

warnings.filterwarnings("ignore")

# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Optional: TCN
try:
    from tcn import TCN
    TCN_AVAILABLE = True
except Exception:
    TCN_AVAILABLE = False

# Optional: Refinitiv (import lazily in adapter)

# Default config: module-level constants used as defaults by the adapters,
# windowing helpers and artifact writers defined further down.
DEFAULT_SYMBOLS = ["BTC-USD", "ETH-USD"]
DEFAULT_INTERVAL = "1d"  # options: 1d, 1h, 15m, etc.
DEFAULT_START = "2018-01-01"
DEFAULT_END = None  # until latest
DEFAULT_LOOKBACK = 60  # default number of past rows per input window
DEFAULT_HORIZON = 1  # predict next close
ARTIFACT_DIR = Path("artifacts")
# Side effect at import/cell-run time: the artifact directory is created
# relative to the current working directory.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy and TensorFlow once for the whole session.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Show the TensorFlow version in the cell output.
print(tf.__version__)


In [None]:
def set_global_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module in addition to NumPy and TensorFlow so
    that code relying on the stdlib generator is reproducible as well.

    Args:
        seed: Integer seed applied to all three generators.
    """
    import random  # local import: the file does not import it at top level

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


def ensure_dir(path: Path) -> None:
    """Create ``path`` (and any missing parents); do nothing if it exists."""
    path.mkdir(exist_ok=True, parents=True)


def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root-mean-squared error between two array-likes.

    Inputs are coerced to float arrays first, so plain Python lists work
    too (previously ``list - list`` raised ``TypeError``), matching how
    ``mape`` treats its arguments.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, broadcast-compatible with ``y_true``.

    Returns:
        The RMSE as a built-in ``float``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))


def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error, in percent.

    The absolute true values are floored at 1e-8 before dividing so that
    zeros in ``y_true`` do not cause a division by zero.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    # Floor the denominator instead of clipping with an open upper bound.
    safe_scale = np.maximum(np.abs(actual), 1e-8)
    relative_error = np.abs((actual - predicted) / safe_scale)
    return float(np.mean(relative_error) * 100.0)


class Timer:
    """Context manager that prints how long its ``with`` body took.

    Timing uses ``time.perf_counter`` (monotonic, high resolution) so the
    reported duration cannot be skewed by wall-clock adjustments. The
    measured duration is also kept on ``elapsed`` so callers can record it
    (e.g. as a latency metric) instead of only seeing the printed line.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, label: str):
        self.label = label      # prefix used in the printed message
        self.start_ts = None    # wall-clock time (epoch seconds) at entry
        self.elapsed = None     # duration in seconds, set on exit
        self._t0 = None         # monotonic reference taken at entry

    def __enter__(self):
        self.start_ts = time.time()
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returns None, so exceptions raised inside the body still propagate.
        self.elapsed = time.perf_counter() - self._t0
        print(f"{self.label} took {self.elapsed:.2f}s")


def mount_gdrive_if_colab(mount_point: str = "/content/drive") -> Optional[str]:
    """Mount Google Drive when running inside Colab.

    Best-effort: any failure (the ``google.colab`` package being absent, or
    the mount itself failing) is swallowed and reported as ``None``.

    Returns:
        ``mount_point`` when the mount call succeeded, otherwise ``None``.
    """
    try:
        # Importing the submodule also imports the google.colab package.
        from google.colab import drive
        drive.mount(mount_point)
    except Exception:
        return None
    return mount_point


In [None]:
# Data Adapters

def fetch_ohlcv_yfinance(symbol: str,
                         start: Optional[str] = DEFAULT_START,
                         end: Optional[str] = DEFAULT_END,
                         interval: str = DEFAULT_INTERVAL) -> pd.DataFrame:
    """Fetch OHLCV bars for ``symbol`` through ``yfinance``.

    Args:
        symbol: Ticker understood by Yahoo Finance, e.g. ``"BTC-USD"``.
        start: Start date passed to ``Ticker.history``.
        end: End date passed to ``Ticker.history`` (``None`` = latest).
        interval: Bar interval passed to ``Ticker.history``.

    Returns:
        DataFrame with columns ``open, high, low, close, volume`` (in that
        order), with rows lacking any of the four price fields removed.

    Raises:
        ValueError: If no rows come back or an expected column is missing,
            mirroring the "no data" errors raised by the other adapters
            instead of failing later with an opaque ``KeyError``.
    """
    ohlcv_cols = ['open', 'high', 'low', 'close', 'volume']

    # Use Ticker method instead of download for more reliable results
    ticker = yf.Ticker(symbol)
    df = ticker.history(
        start=start,
        end=end,
        interval=interval,
        auto_adjust=False,
        prepost=True
    )
    if df is None or df.empty:
        raise ValueError(f"No data returned from yfinance for {symbol}")

    # Ticker.history returns clean column names
    df = df.rename(columns={
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume'
    })

    missing = [col for col in ohlcv_cols if col not in df.columns]
    if missing:
        raise ValueError(f"yfinance response for {symbol} is missing columns: {missing}")

    # Convert each column individually
    for col in ohlcv_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Clean and return (volume may be NaN; only price fields are required)
    df = df.dropna(subset=['open', 'high', 'low', 'close'])
    if df.empty:
        raise ValueError(f"No usable OHLC rows returned from yfinance for {symbol}")
    return df[ohlcv_cols]



In [None]:

def fetch_ohlcv_refinitiv(instrument: str,
                          start: Optional[str] = DEFAULT_START,
                          end: Optional[str] = DEFAULT_END,
                          interval: str = DEFAULT_INTERVAL) -> pd.DataFrame:
    """Fetch OHLCV using Refinitiv Data Platform if credentials are available.

    Requirements:
    - pip install refinitiv-data
    - Set environment variable RD_APP_KEY with your AppKey

    Args:
        instrument: Instrument identifier passed to the historical-pricing
            definition.
        start: Optional start date; applied only when truthy.
        end: Optional end date; applied only when truthy.
        interval: Only ``"1d"``, ``"1D"`` or ``"daily"`` are accepted.

    Returns:
        DataFrame with columns ``open, high, low, close, volume`` and a
        timezone-naive index; rows with a missing value in any of the five
        columns are dropped.

    Raises:
        RuntimeError: If the SDK cannot be imported or RD_APP_KEY is unset.
        NotImplementedError: If a non-daily interval is requested.
        ValueError: If the response carries no data.
    """
    # Import lazily so the notebook works without the optional SDK installed.
    try:
        import refinitiv.data as rd
        from refinitiv.data.content import historical_pricing as hp
    except Exception as e:
        raise RuntimeError("Refinitiv SDK not available. Install 'refinitiv-data' and configure credentials.") from e

    app_key = os.environ.get("RD_APP_KEY")
    if not app_key:
        raise RuntimeError("RD_APP_KEY env var not set for Refinitiv.")

    # NOTE(review): confirm that open_session accepts a PlatformSession built
    # from just the app key in the installed refinitiv-data version.
    rd.open_session(rd.PlatformSession(app_key))
    try:
        # Map interval to Refinitiv granularity. Simplify: support daily.
        if interval not in ("1d", "1D", "daily"):
            raise NotImplementedError("Refinitiv adapter currently supports daily interval.")
        # NOTE(review): verify this fluent builder (.close().fields(...),
        # .start_date/.end_date) against the SDK's historical_pricing API.
        q = hp.Definition(instrument).close().fields(["OPEN","HIGH","LOW","CLOSE","VOLUME"])
        if start:
            q = q.start_date(start)
        if end:
            q = q.end_date(end)
        rsp = q.get_data()
        if rsp is None or rsp.data is None or rsp.data.empty:
            raise ValueError(f"No data from Refinitiv for {instrument}")
        df = rsp.data.copy()
        # Normalize columns
        df = df.rename(columns={
            'OPEN':'open','HIGH':'high','LOW':'low','CLOSE':'close','VOLUME':'volume'
        })
        # Interpret the index as UTC, then drop the timezone information.
        df.index = pd.to_datetime(df.index, utc=True).tz_convert(None)
        for c in ['open','high','low','close','volume']:
            df[c] = pd.to_numeric(df[c], errors='coerce')
        # Unlike the yfinance adapter, rows with missing volume are dropped too.
        df = df.dropna(subset=['open','high','low','close','volume'])
        return df[['open','high','low','close','volume']]
    finally:
        # Always release the session, including on the error paths above.
        rd.close_session()


In [None]:
# Bitfinex Adapter
import requests

# Maps the interval strings accepted by fetch_ohlcv_bitfinex to the timeframe
# token placed in the Bitfinex candles URL. Both "1d" and "1D" map to "1D".
_BITFINEX_TIMEFRAME_MAP = {
    "1m": "1m", "5m": "5m", "15m": "15m", "30m": "30m",
    "1h": "1h", "3h": "3h", "6h": "6h", "12h": "12h",
    "1d": "1D", "1D": "1D"
}


def _to_bitfinex_symbol(symbol: str) -> str:
    # 'BTC-USD' -> 'tBTCUSD'; 'ETH-USD' -> 'tETHUSD'
    s = symbol.replace("-", "")
    if not s.startswith("t"):
        s = "t" + s
    return s


def fetch_ohlcv_bitfinex(symbol: str,
                          start: Optional[pd.Timestamp] = None,
                          end: Optional[pd.Timestamp] = None,
                          interval: str = DEFAULT_INTERVAL,
                          limit: int = 1000,
                          sort: int = 1) -> pd.DataFrame:
    """Fetch OHLCV from Bitfinex candles API.

    Args:
        symbol: Pair such as ``'BTC-USD'``; converted with
            ``_to_bitfinex_symbol``.
        start: Optional lower time bound, sent as epoch milliseconds.
        end: Optional upper time bound, sent as epoch milliseconds.
        interval: One of the keys of ``_BITFINEX_TIMEFRAME_MAP``
            (e.g. ``'1d'``, ``'1h'``, ``'15m'``).
        limit: Maximum number of candles requested.
        sort: Sort flag forwarded to the API; rows are re-sorted ascending
            by timestamp locally regardless.

    Returns:
        DataFrame with columns ``open, high, low, close, volume`` indexed by
        a timezone-naive timestamp, ascending, with incomplete rows dropped.

    Raises:
        ValueError: For an unsupported interval (previously this silently
            fell back to daily candles), an empty response, or a payload
            that is not a list of candle rows.
        requests.HTTPError: If the HTTP request fails.
    """
    # Named `timeframe` rather than `tf` to avoid shadowing the tensorflow alias.
    timeframe = _BITFINEX_TIMEFRAME_MAP.get(interval)
    if timeframe is None:
        supported = ", ".join(sorted(_BITFINEX_TIMEFRAME_MAP))
        raise ValueError(f"Unsupported Bitfinex interval {interval!r}; expected one of: {supported}")
    sym = _to_bitfinex_symbol(symbol)
    base = f"https://api-pub.bitfinex.com/v2/candles/trade:{timeframe}:{sym}/hist"
    params = {"limit": limit, "sort": sort}
    if start is not None:
        # Timestamp.value is nanoseconds since epoch; the API expects ms.
        params["start"] = int(pd.Timestamp(start).value // 10**6)
    if end is not None:
        params["end"] = int(pd.Timestamp(end).value // 10**6)
    headers = {"accept": "application/json"}
    r = requests.get(base, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    if not data:
        raise ValueError(f"No data returned from Bitfinex for {symbol}")
    # Guard against non-candle payloads, which would otherwise surface as an
    # IndexError when slicing columns below.
    if not isinstance(data, list) or not isinstance(data[0], (list, tuple)):
        raise ValueError(f"Unexpected response from Bitfinex for {symbol}: {data!r}")
    # Each row: [MTS, OPEN, CLOSE, HIGH, LOW, VOLUME]
    arr = np.asarray(data, dtype=float)
    # Ensure ascending by timestamp
    arr = arr[np.argsort(arr[:, 0])]
    index = pd.to_datetime(arr[:, 0], unit='ms', utc=True).tz_convert(None)
    df = pd.DataFrame(arr[:, 1:6],
                      columns=['open', 'close', 'high', 'low', 'volume'],
                      index=index)
    # Reorder to standard column order
    df = df[['open','high','low','close','volume']]
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    return df



In [None]:
# Feature Engineering

from pandas.api.types import is_numeric_dtype


def compute_rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Relative Strength Index based on simple rolling means.

    Average gains and average losses over ``window`` periods are plain
    rolling means of the positive / negative one-step differences. A 1e-9
    term in the denominator avoids division by zero when there are no
    losses in the window.
    """
    change = series.diff()
    avg_up = change.clip(lower=0).rolling(window).mean()
    avg_down = (-change.clip(upper=0)).rolling(window).mean()
    strength = avg_up / (avg_down + 1e-9)
    return 100 - (100 / (1 + strength))


def add_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with return and indicator columns.

    Added columns, in order: ``ret``, ``log_ret``, ``sma_10``, ``sma_20``,
    ``ema_10``, ``rsi_14``, ``bb_mid``, ``bb_std``, ``bb_up``, ``bb_low``,
    ``volatility_20``. Rows containing NaN (the warm-up period of the
    rolling windows) are dropped, non-numeric columns are coerced to
    numeric, and rows that became NaN through coercion are dropped as well.
    """
    frame = df.copy()
    close = frame['close']
    window_20 = close.rolling(20)

    # Returns
    frame['ret'] = close.pct_change()
    frame['log_ret'] = np.log(close).diff()

    # Moving averages
    frame['sma_10'] = close.rolling(10).mean()
    frame['sma_20'] = window_20.mean()
    frame['ema_10'] = close.ewm(span=10, adjust=False).mean()

    # Momentum
    frame['rsi_14'] = compute_rsi(close, 14)

    # Bollinger bands: 20-period mean +/- 2 standard deviations
    frame['bb_mid'] = window_20.mean()
    frame['bb_std'] = window_20.std()
    band_width = 2 * frame['bb_std']
    frame['bb_up'] = frame['bb_mid'] + band_width
    frame['bb_low'] = frame['bb_mid'] - band_width

    # 20-period std of log returns, scaled by sqrt(20)
    frame['volatility_20'] = frame['log_ret'].rolling(20).std() * np.sqrt(20)

    frame = frame.dropna()
    # Ensure numeric dtypes
    for name in frame.columns:
        if is_numeric_dtype(frame[name]):
            continue
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    return frame.dropna()


def select_feature_columns(df: pd.DataFrame) -> List[str]:
    """Return every column name of ``df`` except the target ``'close'``."""
    return list(filter(lambda name: name != 'close', df.columns))



## Windowing and splits


In [None]:
# Windowing utilities

def make_supervised_tabular(df: pd.DataFrame,
                            feature_cols: List[str],
                            target_col: str = 'close',
                            lookback: int = DEFAULT_LOOKBACK,
                            horizon: int = DEFAULT_HORIZON) -> Tuple[np.ndarray, np.ndarray, List[pd.Timestamp]]:
    """Build flattened look-back windows for tabular models.

    For each position ``i`` the features of rows ``[i - lookback, i)`` are
    flattened into one vector and paired with the target at row
    ``i + horizon - 1``.

    The feature matrix and target column are extracted once up front (as in
    ``make_sequence_tensor``) instead of slicing the DataFrame with ``iloc``
    on every iteration. When the frame is too short to produce any sample,
    correctly shaped empty arrays are returned rather than a bare ``(0,)``
    array for ``X``.

    Args:
        df: Feature frame; not modified.
        feature_cols: Columns used as inputs.
        target_col: Column used as the prediction target.
        lookback: Number of past rows per window.
        horizon: Steps ahead of the window end to predict (1 = next row).

    Returns:
        ``(X, y, ts)`` with ``X`` of shape ``(n, lookback * n_features)``,
        ``y`` of shape ``(n,)`` and ``ts`` the index labels of the targets.
    """
    data = df[feature_cols].values
    target = df[target_col].values
    index = df.index
    window_ends = range(lookback, len(df) - horizon + 1)
    if len(window_ends) == 0:
        return np.empty((0, lookback * len(feature_cols))), np.empty((0,)), []

    X = np.stack([data[end - lookback:end].reshape(-1) for end in window_ends])
    y = np.array([target[end + horizon - 1] for end in window_ends])
    ts_list = [index[end + horizon - 1] for end in window_ends]
    return X, y, ts_list


def make_sequence_tensor(df: pd.DataFrame,
                         feature_cols: List[str],
                         target_col: str = 'close',
                         lookback: int = DEFAULT_LOOKBACK,
                         horizon: int = DEFAULT_HORIZON) -> Tuple[np.ndarray, np.ndarray, List[pd.Timestamp]]:
    """Build look-back windows for sequence models.

    Each sample is the block of feature rows ``[i - lookback, i)`` paired
    with the target at row ``i + horizon - 1``; the matching index labels
    are returned alongside.
    """
    feature_matrix = df[feature_cols].values
    target_values = df[target_col].values
    window_ends = range(lookback, len(df) - horizon + 1)

    windows = [feature_matrix[end - lookback:end] for end in window_ends]
    labels = [target_values[end + horizon - 1] for end in window_ends]
    stamps = [df.index[end + horizon - 1] for end in window_ends]
    return np.array(windows), np.array(labels), stamps


def expanding_window_splits(n_samples: int,
                            n_splits: int = 5,
                            min_train_size: int = 500,
                            test_size: int = 100) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Return a list of expanding-train / forward-test index pairs.

    The first training window covers ``max(min_train_size, 60% of
    n_samples)`` samples starting at index 0. Every subsequent fold extends
    the training window by ``test_size`` and tests on the next
    ``test_size`` samples. At most ``n_splits`` folds are produced, and
    generation stops as soon as a full test window no longer fits.
    """
    folds = []
    cut = max(min_train_size, int(n_samples * 0.6))
    for _ in range(n_splits):
        stop = cut + test_size
        if stop > n_samples:
            break
        folds.append((np.arange(0, cut), np.arange(cut, stop)))
        cut = stop
    return folds


In [None]:
# Model builders

def build_gbr_pipeline(random_state: int = RANDOM_SEED) -> Pipeline:
    """Return an unfitted pipeline: standard scaling followed by a GBR."""
    scaler_step = ("scaler", StandardScaler(with_mean=True))
    model_step = ("model", GradientBoostingRegressor(random_state=random_state))
    return Pipeline([scaler_step, model_step])


def build_svr_pipeline() -> Pipeline:
    """Return an unfitted pipeline: standard scaling followed by an RBF SVR.

    The SVR is wrapped in a ``TransformedTargetRegressor`` that standardises
    the target during ``fit`` and inverts the transform in ``predict``. The
    ``C`` and ``epsilon`` hyper-parameters are expressed in target units, so
    fitting directly on raw prices (orders of magnitude larger than
    ``epsilon=0.1``) leaves the model badly mis-scaled. Predictions are
    still returned on the original price scale, and the target scaler is
    fitted only on the data passed to ``fit``.
    """
    # Local import: the file's top-level imports do not include sklearn.compose.
    from sklearn.compose import TransformedTargetRegressor

    svr = SVR(kernel='rbf', C=10.0, epsilon=0.1, gamma='scale')
    return Pipeline([
        ("scaler", StandardScaler(with_mean=True)),
        ("model", TransformedTargetRegressor(regressor=svr, transformer=StandardScaler())),
    ])


def build_lstm_model(input_steps: int, input_features: int) -> keras.Model:
    """Build and compile a two-layer LSTM regressor.

    Architecture: LSTM(64, sequences) -> Dropout(0.2) -> LSTM(32) ->
    Dropout(0.2) -> Dense(32, relu) -> Dense(1). Compiled with Adam at a
    learning rate of 1e-3 and mean-squared-error loss.
    """
    net_input = keras.Input(shape=(input_steps, input_features))
    stack = [
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ]
    tensor = net_input
    for layer in stack:
        tensor = layer(tensor)
    network = keras.Model(net_input, tensor)
    network.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse')
    return network


def build_tcn_model(input_steps: int, input_features: int) -> Optional[keras.Model]:
    """Build and compile a TCN regressor, or return None without keras-tcn.

    Architecture: TCN(64 filters, kernel 3, 2 stacks, dilations 1/2/4/8,
    dropout 0.2) -> Dense(32, relu) -> Dense(1). Compiled with Adam at a
    learning rate of 1e-3 and mean-squared-error loss.
    """
    if TCN_AVAILABLE:
        net_input = keras.Input(shape=(input_steps, input_features))
        tcn_layer = TCN(nb_filters=64, kernel_size=3, nb_stacks=2, dropout_rate=0.2, dilations=[1,2,4,8])
        hidden = layers.Dense(32, activation='relu')(tcn_layer(net_input))
        prediction = layers.Dense(1)(hidden)
        network = keras.Model(net_input, prediction)
        network.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse')
        return network
    return None


In [None]:
# Training / evaluation helpers

def evaluate_predictions(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """Return RMSE, MAE and MAPE for one set of predictions.

    Keys of the result are ``"rmse"``, ``"mae"`` and ``"mape"``.
    """
    scores: Dict[str, float] = {}
    scores["rmse"] = rmse(y_true, y_pred)
    scores["mae"] = float(mean_absolute_error(y_true, y_pred))
    scores["mape"] = mape(y_true, y_pred)
    return scores


def train_eval_sklearn(X: np.ndarray, y: np.ndarray, splits: List[Tuple[np.ndarray, np.ndarray]], pipeline: Pipeline) -> Dict:
    """Walk-forward evaluation of a scikit-learn pipeline.

    The same ``pipeline`` object is re-fitted on each fold's training
    indices and scored on that fold's test indices, so after the call it
    holds the fit from the last fold.

    Args:
        X: Feature matrix indexed by the split indices.
        y: Target vector aligned with ``X``.
        splits: ``(train_idx, test_idx)`` pairs.
        pipeline: Estimator exposing ``fit`` / ``predict``.

    Returns:
        Dict with ``fold_metrics`` (one metrics dict per fold), ``agg``
        (mean of each metric across folds), and the concatenated ``preds``
        and ``truth`` arrays in fold order.

    Raises:
        ValueError: If ``splits`` is empty. Previously this surfaced as an
            ``IndexError`` when aggregating, which hid the real cause
            (typically too few samples for the requested split sizes).
    """
    if not splits:
        raise ValueError("No walk-forward splits to evaluate; check the sample count against min_train_size/test_size.")

    metrics_list = []
    preds_all, truth_all = [], []
    for fold, (tr, te) in enumerate(splits, 1):
        # Only fitting and prediction are timed; scoring happens outside.
        with Timer(f"SKLearn fold {fold}"):
            pipeline.fit(X[tr], y[tr])
            pred = pipeline.predict(X[te])
        metrics_list.append(evaluate_predictions(y[te], pred))
        preds_all.append(pred)
        truth_all.append(y[te])
    agg = {k: float(np.mean([m[k] for m in metrics_list])) for k in metrics_list[0]}
    return {"fold_metrics": metrics_list, "agg": agg, "preds": np.concatenate(preds_all), "truth": np.concatenate(truth_all)}


def train_eval_keras_seq(X: np.ndarray, y: np.ndarray, splits: List[Tuple[np.ndarray, np.ndarray]], build_model_fn, epochs: int = 20, batch_size: int = 64, val_fraction: float = 0.1) -> Dict:
    """Walk-forward evaluation of a Keras sequence model.

    A fresh model is built for every fold via
    ``build_model_fn(steps, features)``. Early stopping (patience 5, best
    weights restored) monitors a validation set carved from the
    chronological tail of the training indices. The test fold is no longer
    passed as ``validation_data``: doing so let the test data choose the
    stopping epoch and the restored weights, which biases the reported test
    metrics optimistically.

    Args:
        X: Input tensor of shape ``(samples, steps, features)``.
        y: Target vector aligned with ``X``.
        splits: ``(train_idx, test_idx)`` pairs.
        build_model_fn: Callable returning a compiled Keras model.
        epochs: Maximum training epochs per fold.
        batch_size: Mini-batch size.
        val_fraction: Share of each fold's training indices (taken from the
            end) held out for early stopping. If it yields no usable
            validation set, training loss is monitored instead.

    Returns:
        Dict with ``fold_metrics``, ``agg`` (mean of each metric across
        folds), and the concatenated ``preds`` and ``truth`` arrays.

    Raises:
        ValueError: If ``splits`` is empty.
    """
    if not splits:
        raise ValueError("No walk-forward splits to evaluate; check the sample count against min_train_size/test_size.")

    metrics_list = []
    preds_all, truth_all = [], []
    for fold, (tr, te) in enumerate(splits, 1):
        n_val = int(len(tr) * val_fraction)
        if 0 < n_val < len(tr):
            fit_idx, val_idx = tr[:-n_val], tr[-n_val:]
            validation_data = (X[val_idx], y[val_idx])
            monitor = 'val_loss'
        else:
            fit_idx, validation_data, monitor = tr, None, 'loss'
        with Timer(f"Keras fold {fold}"):
            model = build_model_fn(X.shape[1], X.shape[2])
            early = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor=monitor)
            model.fit(
                X[fit_idx], y[fit_idx],
                validation_data=validation_data,
                epochs=epochs,
                batch_size=batch_size,
                verbose=0,
                callbacks=[early]
            )
            pred = model.predict(X[te], verbose=0).ravel()
        metrics_list.append(evaluate_predictions(y[te], pred))
        preds_all.append(pred)
        truth_all.append(y[te])
    agg = {k: float(np.mean([m[k] for m in metrics_list])) for k in metrics_list[0]}
    return {"fold_metrics": metrics_list, "agg": agg, "preds": np.concatenate(preds_all), "truth": np.concatenate(truth_all)}


In [None]:
# Artifact saving and registry

# Location of the JSON model registry read and written by the helpers below.
REGISTRY_PATH = ARTIFACT_DIR / "registry.json"

def save_registry(registry: Dict) -> None:
    """Write ``registry`` to ``REGISTRY_PATH`` as indented JSON.

    The payload is serialised before the file is opened: opening with
    ``'w'`` truncates immediately, so serialising inside the ``with`` block
    would leave an empty or partial registry on disk whenever an entry is
    not JSON-serialisable.

    Raises:
        TypeError: If ``registry`` contains non-serialisable values (the
            existing file is left untouched in that case).
    """
    payload = json.dumps(registry, indent=2)
    with open(REGISTRY_PATH, 'w') as f:
        f.write(payload)


def load_registry() -> Dict:
    """Load the registry JSON, or return an empty registry if none exists."""
    if not REGISTRY_PATH.exists():
        return {"models": []}
    with open(REGISTRY_PATH, 'r') as handle:
        return json.load(handle)


def register_model_entry(registry: Dict, entry: Dict) -> Dict:
    """Add ``entry`` to the registry, persist it and return the registry.

    An entry is identified by ``(type, name, symbol, lookback, horizon)``.
    Existing entries with the same identity are replaced, so re-running the
    training cells no longer accumulates duplicate records for the same
    model in ``registry.json``.

    Args:
        registry: Registry dict to update in place, or ``None`` to load the
            one on disk first.
        entry: Model description as produced by the ``save_*_model`` helpers.

    Returns:
        The updated registry dict.
    """
    reg = load_registry() if registry is None else registry
    models = reg.setdefault("models", [])

    identity_keys = ("type", "name", "symbol", "lookback", "horizon")
    new_identity = tuple(entry.get(k) for k in identity_keys)
    # Slice-assign so the list object held by the caller's registry is kept.
    models[:] = [m for m in models
                 if tuple(m.get(k) for k in identity_keys) != new_identity]
    models.append(entry)

    save_registry(reg)
    return reg


def save_sklearn_model(pipeline: Pipeline, model_name: str, symbol: str, lookback: int, horizon: int) -> Dict:
    """Dump ``pipeline`` with joblib under ARTIFACT_DIR and describe it.

    Returns:
        A registry entry dict with ``type`` ``"sklearn"``, the model
        metadata and the path of the written ``.joblib`` file.
    """
    ensure_dir(ARTIFACT_DIR)
    target = ARTIFACT_DIR / f"{model_name}_{symbol}_lb{lookback}_h{horizon}.joblib"
    joblib.dump(pipeline, target)
    return dict(
        type="sklearn",
        name=model_name,
        symbol=symbol,
        lookback=lookback,
        horizon=horizon,
        path=str(target),
    )


def save_keras_model(model: keras.Model, model_name: str, symbol: str, lookback: int, horizon: int, scaler_x: Optional[MinMaxScaler] = None, scaler_y: Optional[MinMaxScaler] = None) -> Dict:
    """Save a Keras model (and optional scalers) under ARTIFACT_DIR.

    The model is written as ``<stem>.keras``; each scaler that is not None
    is dumped with joblib as ``<stem>_scaler_x.joblib`` /
    ``<stem>_scaler_y.joblib``.

    Returns:
        A registry entry dict with ``type`` ``"keras"``, the model metadata,
        the model path, and the scaler paths (``None`` where no scaler was
        given).
    """
    ensure_dir(ARTIFACT_DIR)
    stem = f"{model_name}_{symbol}_lb{lookback}_h{horizon}"
    model_file = ARTIFACT_DIR / f"{stem}.keras"
    model.save(model_file)

    entry = {
        "type": "keras",
        "name": model_name,
        "symbol": symbol,
        "lookback": lookback,
        "horizon": horizon,
        "path": str(model_file),
        "scaler_x": None,
        "scaler_y": None
    }
    # Persist whichever scalers were supplied, x first and then y.
    for key, scaler in (("scaler_x", scaler_x), ("scaler_y", scaler_y)):
        if scaler is None:
            continue
        scaler_file = ARTIFACT_DIR / f"{stem}_{key}.joblib"
        joblib.dump(scaler, scaler_file)
        entry[key] = str(scaler_file)
    return entry


In [None]:
# Use case: quick GBR walk-forward run on Bitfinex daily candles.
SYMBOL = "BTC-USD"
INTERVAL = "1d"

# Request up to 5000 candles; no start/end bounds are passed.
df_bfx = fetch_ohlcv_bitfinex(SYMBOL, interval=INTERVAL, limit=5000)
df_feat_bfx = add_technical_indicators(df_bfx)
features_bfx = select_feature_columns(df_feat_bfx)

# Flattened look-back windows, then up to 3 expanding-window folds.
X_tab_bfx, y_tab_bfx, ts_tab_bfx = make_supervised_tabular(df_feat_bfx, features_bfx, 'close', DEFAULT_LOOKBACK, DEFAULT_HORIZON)
splits_bfx = expanding_window_splits(len(X_tab_bfx), n_splits=3, min_train_size=500, test_size=100)

# The GBR pipeline here is evaluated only; it is not saved or registered.
res_gbr_bfx = train_eval_sklearn(X_tab_bfx, y_tab_bfx, splits_bfx, build_gbr_pipeline())
print("Bitfinex GBR agg:", res_gbr_bfx["agg"])


In [None]:
# End-to-end example run: these globals are reused by all following cells.

SYMBOL = "BTC-USD"
INTERVAL = DEFAULT_INTERVAL
START = DEFAULT_START
END = DEFAULT_END
LOOKBACK = 60  # rows per input window
HORIZON = 1  # steps ahead to predict

# Fetch data from yfinance for the configured symbol and date range
df_raw = fetch_ohlcv_yfinance(SYMBOL, start=START, end=END, interval=INTERVAL)


In [None]:

#  ENHANCED DATA INSPECTION
comprehensive_data_inspection(df_raw, SYMBOL)
# save_figure() is only defined in the visualization-utilities cell further
# down, so passing save_as= here raises NameError when the notebook is run
# top to bottom. Build the figure without saving and write the HTML directly
# to the location save_figure would use (artifacts/figures/<name>.html).
dq_fig = plot_data_quality_assessment(df_raw, SYMBOL)
dq_fig_dir = ARTIFACT_DIR / "figures"
ensure_dir(dq_fig_dir)
dq_fig.write_html(dq_fig_dir / f"{SYMBOL}_data_quality.html")

df_feat = add_technical_indicators(df_raw)
features = select_feature_columns(df_feat)

# Prepare supervised datasets: flattened windows for the sklearn models and
# 3-D windows for the Keras models. Both use the same lookback/horizon, so
# their sample indices line up and one set of splits serves both.
X_tab, y_tab, ts_tab = make_supervised_tabular(df_feat, features, 'close', LOOKBACK, HORIZON)
X_seq, y_seq, ts_seq = make_sequence_tensor(df_feat, features, 'close', LOOKBACK, HORIZON)

# Create splits
splits = expanding_window_splits(len(X_tab), n_splits=5, min_train_size=500, test_size=100)

# Aggregated metrics per model, and the registry loaded from disk (if any)
results_summary = {}
registry = load_registry()



In [None]:
# GBR: walk-forward evaluation, then persist and register the pipeline.
# train_eval_sklearn re-fits the same pipeline object per fold, so the saved
# artifact holds the fit from the last fold's training window.
gbr = build_gbr_pipeline()
res_gbr = train_eval_sklearn(X_tab, y_tab, splits, gbr)
print("GBR agg:", res_gbr["agg"]) 
entry = save_sklearn_model(gbr, "GBR", SYMBOL, LOOKBACK, HORIZON)
registry = register_model_entry(registry, entry)
results_summary["GBR"] = res_gbr["agg"]

# SVR: same procedure as for GBR.
svr = build_svr_pipeline()
res_svr = train_eval_sklearn(X_tab, y_tab, splits, svr)
print("SVR agg:", res_svr["agg"]) 
entry = save_sklearn_model(svr, "SVR", SYMBOL, LOOKBACK, HORIZON)
registry = register_model_entry(registry, entry)
results_summary["SVR"] = res_svr["agg"]



In [None]:
# LSTM
# NOTE(review): both scalers are fitted on the complete series, i.e. including
# every walk-forward test fold, so the scaled metrics below see min/max
# information from the test periods. Fit them on training data only if strictly
# leakage-free metrics are required.
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
X_seq_scaled = scaler_x.fit_transform(X_seq.reshape(X_seq.shape[0], -1)).reshape(X_seq.shape)
y_seq_scaled = scaler_y.fit_transform(y_seq.reshape(-1, 1)).reshape(-1)

# Walk-forward evaluation builds (and discards) a fresh model per fold.
res_lstm = train_eval_keras_seq(X_seq_scaled, y_seq_scaled, splits, build_lstm_model)
print("LSTM agg (on scaled):", res_lstm["agg"])

# The model that is persisted must actually be trained: previously a freshly
# built, never-fitted network was saved, because the fold models live only
# inside train_eval_keras_seq. Fit the deployable model on all samples, using
# the same epochs/batch size defaults as the evaluation helper.
lstm_model = build_lstm_model(X_seq.shape[1], X_seq.shape[2])
lstm_model.fit(X_seq_scaled, y_seq_scaled, epochs=20, batch_size=64, verbose=0)
entry = save_keras_model(lstm_model, "LSTM", SYMBOL, LOOKBACK, HORIZON, scaler_x, scaler_y)
registry = register_model_entry(registry, entry)
results_summary["LSTM_scaled"] = res_lstm["agg"]



In [None]:
# TCN (optional)
if TCN_AVAILABLE:
    # Walk-forward evaluation builds (and discards) a fresh model per fold.
    res_tcn = train_eval_keras_seq(X_seq_scaled, y_seq_scaled, splits, build_tcn_model)
    print("TCN agg (on scaled):", res_tcn["agg"])
    # Train the model that gets persisted; previously an unfitted network was
    # saved because the fold models never leave train_eval_keras_seq. Uses the
    # same epochs/batch size defaults as the evaluation helper.
    tcn_model = build_tcn_model(X_seq.shape[1], X_seq.shape[2])
    tcn_model.fit(X_seq_scaled, y_seq_scaled, epochs=20, batch_size=64, verbose=0)
    entry = save_keras_model(tcn_model, "TCN", SYMBOL, LOOKBACK, HORIZON, scaler_x, scaler_y)
    registry = register_model_entry(registry, entry)
    results_summary["TCN_scaled"] = res_tcn["agg"]
else:
    print("TCN not available. Install keras-tcn to enable.")

print("Summary:", json.dumps(results_summary, indent=2))


## Visualization Utilities 

In [None]:
# Visualization utilities
import math
from typing import Iterable
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots


def save_figure(fig, name: str, formats: Iterable[str] = ("html",)) -> None:
    """Write a Plotly figure into ``ARTIFACT_DIR / "figures"``.

    Args:
        fig: Plotly figure to export.
        name: File name without extension.
        formats: Any of ``'html'`` and ``'png'``. PNG export is best-effort:
            a failure is reported on stdout and otherwise ignored.
    """
    out_dir = ARTIFACT_DIR / "figures"
    ensure_dir(out_dir)

    if "html" in formats:
        html_target = out_dir / f"{name}.html"
        fig.write_html(html_target)

    if "png" in formats:
        png_target = out_dir / f"{name}.png"
        try:
            pio.write_image(fig, png_target, scale=2, width=1280, height=720)
        except Exception as e:
            print(f"PNG export skipped (install kaleido to enable): {e}")



In [None]:

def plot_candlestick(df: pd.DataFrame, title: str = "Candlestick", save_as: str = None):
    """Candlestick chart with an optional volume bar panel underneath.

    The volume panel is only drawn when ``df`` has a ``volume`` column.
    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    frame = df.copy()
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        row_heights=[0.7, 0.3],
    )
    candles = go.Candlestick(
        x=frame.index,
        open=frame['open'], high=frame['high'],
        low=frame['low'], close=frame['close'],
        name='OHLC',
    )
    fig.add_trace(candles, row=1, col=1)
    if 'volume' in frame.columns:
        volume_bars = go.Bar(x=frame.index, y=frame['volume'], name='Volume')
        fig.add_trace(volume_bars, row=2, col=1)
    fig.update_layout(title=title, xaxis_rangeslider_visible=False)
    if save_as:
        save_figure(fig, save_as)
    return fig


In [None]:


def plot_indicator_overlays(df: pd.DataFrame, overlays: Iterable[str], title: str = "Indicators", save_as: str = None):
    """Plot the close price with indicator overlays and an RSI sub-panel.

    Row 1 shows ``close`` plus every column named in ``overlays`` that
    exists in ``df``. Row 2 shows ``rsi_14`` (when present) with dotted
    guide lines at 30 and 70.

    ``rsi_14`` is skipped in row 1 even if listed in ``overlays``: it has
    its own panel, and drawing a 0-100 oscillator on the price axis only
    adds a flat, misleading line to the price chart.

    Args:
        df: Feature frame containing ``close`` and the indicator columns.
        overlays: Column names to draw on the price panel.
        title: Figure title.
        save_as: Optional base name passed to ``save_figure``.

    Returns:
        The Plotly figure.
    """
    df = df.copy()
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.04,
                        row_heights=[0.7, 0.3], specs=[[{}],[{}]])
    fig.add_trace(go.Scatter(x=df.index, y=df['close'], name='Close'), row=1, col=1)
    has_rsi = 'rsi_14' in df.columns
    for col in overlays:
        if col == 'rsi_14' and has_rsi:
            continue  # drawn in its dedicated panel below
        if col in df.columns:
            fig.add_trace(go.Scatter(x=df.index, y=df[col], name=col), row=1, col=1)
    if has_rsi:
        fig.add_trace(go.Scatter(x=df.index, y=df['rsi_14'], name='RSI 14'), row=2, col=1)
        fig.add_hline(y=30, line=dict(color='gray', dash='dot'), row=2, col=1)
        fig.add_hline(y=70, line=dict(color='gray', dash='dot'), row=2, col=1)
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig




In [None]:
def plot_returns_and_volatility(df: pd.DataFrame, title: str = "Returns & Volatility", save_as: str = None):
    """Plot ``ret`` on the primary axis and ``volatility_20`` on a secondary one.

    Each series is drawn only when its column exists in ``df``. When
    ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    frame = df.copy()
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # (column, legend label, draw on secondary axis?)
    series_plan = (
        ('ret', 'Return', False),
        ('volatility_20', 'Vol 20', True),
    )
    for column, label, on_secondary in series_plan:
        if column not in frame.columns:
            continue
        fig.add_trace(go.Scatter(x=frame.index, y=frame[column], name=label), secondary_y=on_secondary)
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig






In [None]:
def plot_corr_heatmap(df_features: pd.DataFrame, title: str = "Feature Correlation", save_as: str = None):
    """Heatmap of the pairwise correlations of the numeric columns.

    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    corr_matrix = df_features.corr(numeric_only=True)
    labels = corr_matrix.columns
    heatmap = go.Heatmap(z=corr_matrix.values, x=labels, y=labels, colorscale='Viridis')
    fig = go.Figure(data=heatmap)
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig

In [None]:

def plot_feature_distributions(df_features: pd.DataFrame, columns: Iterable[str] = None, title: str = "Feature Distributions", save_as: str = None):
    """Draw one histogram per feature in a two-column subplot grid.

    Requested columns that are not present in ``df_features`` are filtered
    out before the grid is built. Previously they were skipped only while
    adding traces, so the subplot titles (taken from the unfiltered list)
    no longer matched the histograms beneath them and empty rows were
    allocated. The grid always has at least one row, so an empty selection
    yields an empty figure instead of an error from ``make_subplots``.

    Args:
        df_features: Frame holding the feature columns.
        columns: Columns to plot; all columns when falsy.
        title: Figure title.
        save_as: Optional base name passed to ``save_figure``.

    Returns:
        The Plotly figure.
    """
    requested = list(columns) if columns else list(df_features.columns)
    cols = [c for c in requested if c in df_features.columns]
    rows = max(1, math.ceil(len(cols) / 2))
    fig = make_subplots(rows=rows, cols=2, subplot_titles=[str(c) for c in cols])
    for position, col in enumerate(cols):
        # Fill the grid row by row: positions 0,1 -> row 1; 2,3 -> row 2; ...
        grid_row, grid_col = divmod(position, 2)
        fig.add_trace(go.Histogram(x=df_features[col], name=col, nbinsx=50, opacity=0.8),
                      row=grid_row + 1, col=grid_col + 1)
    fig.update_layout(title=title, barmode='overlay')
    if save_as:
        save_figure(fig, save_as)
    return fig







In [None]:
def align_predictions_with_timestamps(ts_list: list, splits: list, y_true_concat: np.ndarray, y_pred_concat: np.ndarray) -> pd.DataFrame:
    """Attach timestamps and fold numbers to concatenated fold outputs.

    The test indices of each split (in split order) are mapped to their
    timestamps in ``ts_list``; the concatenated truth/prediction arrays are
    expected in that same order.

    Returns:
        DataFrame indexed by ``ts`` (sorted ascending) with columns
        ``actual``, ``predicted``, ``fold`` (1-based) and ``residual``
        (``actual - predicted``).
    """
    stamps = [ts_list[i] for _, test_idx in splits for i in test_idx]
    fold_numbers = [
        number
        for number, (_, test_idx) in enumerate(splits, 1)
        for _ in range(len(test_idx))
    ]
    frame = pd.DataFrame({
        'ts': pd.to_datetime(stamps),
        'actual': y_true_concat,
        'predicted': y_pred_concat,
        'fold': fold_numbers
    })
    frame = frame.sort_values('ts')
    frame['residual'] = frame['actual'] - frame['predicted']
    return frame.set_index('ts')

In [None]:
def plot_walkforward_predictions(ts_list, y_true_concat, y_pred_concat, splits, title: str = "Walk-forward", save_as: str = None):
    """Plot actual vs. predicted values with each test window shaded.

    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    aligned = align_predictions_with_timestamps(ts_list, splits, y_true_concat, y_pred_concat)
    fig = go.Figure()
    for column, label in (('actual', 'Actual'), ('predicted', 'Predicted')):
        fig.add_trace(go.Scatter(x=aligned.index, y=aligned[column], name=label))
    # Shade test windows from the first to the last test timestamp of a fold
    for _, test_idx in splits:
        window_start = ts_list[test_idx[0]]
        window_end = ts_list[test_idx[-1]]
        fig.add_vrect(x0=window_start, x1=window_end, fillcolor="LightSalmon", opacity=0.15, layer="below", line_width=0)
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig






In [None]:
def plot_residuals_over_time(ts_list, y_true_concat, y_pred_concat, splits, title: str = "Residuals", save_as: str = None):
    """Plot residuals over time, plus a 20-point rolling mean.

    The rolling mean is only added when more than 20 aligned points exist.
    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    aligned = align_predictions_with_timestamps(ts_list, splits, y_true_concat, y_pred_concat)
    residuals = aligned['residual']
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=aligned.index, y=residuals, name='Residual'))
    enough_points = len(aligned) > 20
    if enough_points:
        smoothed = residuals.rolling(20).mean()
        fig.add_trace(go.Scatter(x=aligned.index, y=smoothed, name='Rolling mean (20)'))
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig

In [None]:

def plot_error_distribution(y_true_concat, y_pred_concat, title: str = "Error Distribution", save_as: str = None):
    """Histogram of residuals with a dashed red line at their mean.

    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    errors = (y_true_concat - y_pred_concat).ravel()
    mean_error = float(np.mean(errors))
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=errors, nbinsx=50, name='Residuals', opacity=0.85))
    fig.add_vline(x=mean_error, line_dash='dash', line_color='red', annotation_text='mean', annotation_position='top')
    fig.update_layout(title=title)
    if save_as:
        save_figure(fig, save_as)
    return fig


In [None]:


def plot_parity(y_true_concat, y_pred_concat, title: str = "Parity Plot", save_as: str = None):
    """Scatter of predicted vs. actual values with a ``y = x`` reference line.

    The reference line spans the overall minimum and maximum of both
    arrays. When ``save_as`` is truthy the figure is also passed to
    ``save_figure``. Returns the Plotly figure.
    """
    lower = float(min(np.min(y_true_concat), np.min(y_pred_concat)))
    upper = float(max(np.max(y_true_concat), np.max(y_pred_concat)))
    diagonal = [lower, upper]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_true_concat, y=y_pred_concat, mode='markers', name='Points', opacity=0.6))
    fig.add_trace(go.Scatter(x=diagonal, y=diagonal, mode='lines', name='y=x'))
    fig.update_layout(title=title, xaxis_title='Actual', yaxis_title='Predicted')
    if save_as:
        save_figure(fig, save_as)
    return fig





In [None]:
def plot_model_comparison_bar(metrics_by_model: Dict[str, Dict[str, float]], metric: str = 'rmse', title: str = None, save_as: str = None):
    """Bar chart comparing one metric across models.

    Models whose metrics dict is ``None`` or lacks ``metric`` are left out.
    When ``save_as`` is truthy the figure is also passed to ``save_figure``.
    Returns the Plotly figure.
    """
    usable = {
        model: scores[metric]
        for model, scores in metrics_by_model.items()
        if scores is not None and metric in scores
    }
    axis_label = metric.upper()
    fig = go.Figure(go.Bar(x=list(usable.keys()), y=list(usable.values())))
    fig.update_layout(title=title or f"Model comparison by {axis_label}", xaxis_title='Model', yaxis_title=axis_label)
    if save_as:
        save_figure(fig, save_as)
    return fig


In [None]:
# Generate visualizations: each call also writes an HTML file through save_as.
_ = plot_candlestick(df_feat.tail(200), title=f"{SYMBOL} - last 200 bars", save_as=f"{SYMBOL}_candles")
_ = plot_indicator_overlays(df_feat.tail(200), overlays=['sma_10','sma_20','ema_10','bb_up','bb_low','rsi_14'],
                            title=f"{SYMBOL} indicators", save_as=f"{SYMBOL}_indicators")
_ = plot_returns_and_volatility(df_feat, title=f"{SYMBOL} returns & volatility", save_as=f"{SYMBOL}_returns_vol")
_ = plot_corr_heatmap(df_feat[select_feature_columns(df_feat)], title="Feature correlation", save_as="features_corr")

# Histograms use at most 2000 rows (sampled without replacement, seeded)
# and only the first 8 feature columns.
cols_for_dist = select_feature_columns(df_feat)
if len(df_feat) > 2000:
    sample_df = df_feat[cols_for_dist].sample(2000, replace=False, random_state=RANDOM_SEED)
else:
    sample_df = df_feat[cols_for_dist]
_ = plot_feature_distributions(sample_df, columns=cols_for_dist[:8], title="Feature distributions (subset)", save_as="features_dists")


In [None]:

# 2) Model comparison bar
# NOTE(review): the GBR/SVR metrics were computed on raw prices, while the
# LSTM/TCN metrics were computed on MinMax-scaled targets, so the RMSE bars
# are not on the same scale and should not be compared directly.
metrics_by_model = {
    "GBR": results_summary.get("GBR"),
    "SVR": results_summary.get("SVR"),
    "LSTM (scaled)": results_summary.get("LSTM_scaled"),
}
# The TCN bar is added only when the TCN cell actually produced results.
if TCN_AVAILABLE and "TCN_scaled" in results_summary:
    metrics_by_model["TCN (scaled)"] = results_summary.get("TCN_scaled")
_ = plot_model_comparison_bar(metrics_by_model, metric="rmse", title="Model comparison (RMSE)", save_as="model_comp_rmse")



In [None]:
# 3) Model Evaluation plots
# GBR walk-forward: predictions and truth are the concatenated test folds
# returned by train_eval_sklearn; ts_tab maps sample indices to timestamps.
_ = plot_walkforward_predictions(ts_tab, res_gbr["truth"], res_gbr["preds"], splits, title="GBR walk-forward", save_as="gbr_walk")
_ = plot_residuals_over_time(ts_tab, res_gbr["truth"], res_gbr["preds"], splits, title="GBR residuals", save_as="gbr_residuals")
_ = plot_error_distribution(res_gbr["truth"], res_gbr["preds"], title="GBR error distribution", save_as="gbr_errdist")
_ = plot_parity(res_gbr["truth"], res_gbr["preds"], title="GBR parity", save_as="gbr_parity")



In [None]:
# LSTM (scaled): map the scaled truth/predictions back to price units with
# scaler_y before plotting. The whole cell is best-effort: any failure (for
# example the LSTM cell not having been run) is printed and skipped.
try:
    def invert_scaled(arr, scaler):
        # The scaler works on 2-D input, so reshape to a column and flatten back.
        return scaler.inverse_transform(np.array(arr).reshape(-1,1)).ravel()
    y_true_lstm = invert_scaled(res_lstm["truth"], scaler_y)
    y_pred_lstm = invert_scaled(res_lstm["preds"], scaler_y)
    _ = plot_walkforward_predictions(ts_seq, y_true_lstm, y_pred_lstm, splits, title="LSTM walk-forward (price)", save_as="lstm_walk")
    _ = plot_residuals_over_time(ts_seq, y_true_lstm, y_pred_lstm, splits, title="LSTM residuals (price)", save_as="lstm_residuals")
    _ = plot_error_distribution(y_true_lstm, y_pred_lstm, title="LSTM error distribution (price)", save_as="lstm_errdist")
    _ = plot_parity(y_true_lstm, y_pred_lstm, title="LSTM parity (price)", save_as="lstm_parity")
except Exception as e:
    print("Skipping LSTM price plots (no scaler or variables not in scope):", e)

