In [None]:
# AWP_pipeline.py
# Builds features CSV and provides streaming warmup + inference pipeline

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from precision_labeller import plot_all_regimes_long
from AWP_inference import load_model, predict_regimes

# ── Part A: Offline feature construction ─────────────────────────────────────
def compute_slope_vol(prices: pd.Series, slope_win: int, vol_win: int) -> pd.DataFrame:
    """
    Rolling trend slope and rolling volatility of log prices.

    Args:
        prices: price series; its index is carried over to the result.
        slope_win: window length for the least-squares line fitted to
            log prices (time axis is 0..slope_win-1 inside each window).
        vol_win: window length for the standard deviation of log returns.

    Returns:
        DataFrame with columns "slope" and "vol".  Rows whose window is
        not yet full are NaN (min_periods equals the window length).
    """
    log_prices = np.log(prices)

    # Design matrix [t, 1] shared by every window: column 0 is the time
    # step, column 1 the intercept term.
    steps = np.arange(slope_win)
    design = np.array([steps, np.ones_like(steps)]).T

    def _fitted_slope(window_values):
        # lstsq returns (coefficients, residuals, rank, singular values);
        # coefficient 0 is the slope, coefficient 1 the intercept.
        coefficients = np.linalg.lstsq(design, window_values, rcond=None)[0]
        return coefficients[0]

    slope_roller = log_prices.rolling(window=slope_win, min_periods=slope_win)
    rolling_slope = slope_roller.apply(_fitted_slope, raw=True)

    log_returns = log_prices.diff()
    vol_roller = log_returns.rolling(window=vol_win, min_periods=vol_win)
    rolling_vol = vol_roller.std()

    return pd.DataFrame({"slope": rolling_slope, "vol": rolling_vol})


def build_feature_matrix(
    price_file: str,
    drop_last: int = 10,
    output_csv: str = "features_all_models4.csv"
):
    """
    Write one CSV of regime features plus autolabelled regimes.

    The price file is read as whitespace-separated text without a header;
    each column is treated as one instrument.  For every instrument the
    last ``drop_last`` timesteps are excluded, three regime features are
    computed, and the rows of all instruments are stacked into one table.

    Args:
        price_file: path to the whitespace-separated price table.
        drop_last: number of trailing timesteps left out per instrument.
        output_csv: destination path of the combined CSV.

    Output columns: "ma_reg", "ema_reg", "slope_vol_reg", "true_regime",
    "inst" (column position of the instrument) and "time" (row position
    within the instrument).  "slope_vol_reg" is only filled where both
    rolling windows of compute_slope_vol are complete; earlier rows stay NaN.
    """
    price_table = pd.read_csv(price_file, sep=r"\s+", header=None)
    per_instrument = []

    for inst in range(price_table.shape[1]):
        close = price_table.iloc[:, inst]
        T = len(close)
        keep = T - drop_last  # number of leading timesteps that are kept

        labels = plot_all_regimes_long(end_point=T, plot_graph=False, inst=inst)
        true_regs = pd.Series(labels[:keep], name="true_regime")

        feats = pd.DataFrame(index=true_regs.index)
        logp = np.log(close)

        # MA regime: 0 when the 70-step mean is above the 5-step mean, else 2
        fast_ma = logp.rolling(window=5, min_periods=1).mean()
        slow_ma = logp.rolling(window=70, min_periods=1).mean()
        feats["ma_reg"] = np.where(slow_ma > fast_ma, 0, 2)[:keep]

        # EMA regime: 2 when the span-5 EMA is above the span-50 EMA, else 0
        fast_ema = logp.ewm(span=5, adjust=False).mean()
        slow_ema = logp.ewm(span=50, adjust=False).mean()
        feats["ema_reg"] = np.where(fast_ema > slow_ema, 2, 0)[:keep]

        # Slope/Vol regime: 2 when the slope is positive and volatility is
        # below its median over the kept rows, else 0
        sv = compute_slope_vol(close, slope_win=30, vol_win=100).dropna()
        idx = sv.index[sv.index < keep]
        slope = sv.loc[idx, "slope"]
        vol = sv.loc[idx, "vol"]
        calm_uptrend = (slope > 0) & (vol < vol.median())
        feats.loc[idx, "slope_vol_reg"] = np.where(calm_uptrend, 2, 0)

        # ... (other feature routines) ...

        feats["true_regime"] = true_regs
        feats["inst"] = inst
        feats["time"] = feats.index
        per_instrument.append(feats.reset_index(drop=True))

    combined = pd.concat(per_instrument, ignore_index=True)
    combined.to_csv(output_csv, index=False)
    print(f"✅ Features written to: {output_csv}")


# ── Part B: Streaming warmup + efficient inference pipeline ─────────────────────────
class StreamingRegimePredictor:
    """
    Replay precomputed regime features against a growing price history.

    Each call to ``step`` receives the full price history so far for all
    instruments and returns one regime label per instrument.  While fewer
    than 120 price columns are available (100-price window plus 20-row
    feature window), or when no precomputed feature row exists for the
    current timestep, ``step`` returns an array filled with -999.

    Attributes:
        model, device: the pair returned by ``load_model``.
        static_feats: array (n_inst, seq_len, D) of feature rows read from
            the features CSV, one slab per instrument id in sorted order.
        price_cache: last 100 prices per instrument, or None before the
            first call with enough history.
        feature_cache: the 20 feature rows fed to the model on the latest
            successful call, shape (n_inst, 20, D), or None.
    """

    PRICE_WINDOW = 100    # number of trailing prices kept in price_cache
    FEATURE_WINDOW = 20   # number of feature rows fed to the model
    NOT_READY = -999      # label returned while no prediction is possible

    def __init__(self, checkpoint_name: str, features_csv: str):
        """
        Load the model checkpoint and the precomputed feature table.

        Args:
            checkpoint_name: passed unchanged to ``load_model``.
            features_csv: CSV with an "inst" column plus feature columns;
                "inst", "time" and "true_regime" are not used as features.

        Raises:
            ValueError: from ``np.stack`` when instruments have different
                numbers of rows.
        """
        # load frozen LSTM tagger
        self.model, self.device = load_model(checkpoint_name)

        # NOTE(review): step() maps a history of T prices to feature row
        # T - 101, i.e. it assumes row 0 of every instrument corresponds to
        # time index 100.  build_feature_matrix in this file writes rows
        # starting at time 0 -- confirm the CSV passed here has already had
        # its first 100 rows per instrument removed.
        df = pd.read_csv(features_csv)

        # pivot into array shape (n_inst, seq_len, D)
        feat_cols = [c for c in df.columns if c not in ("inst", "time", "true_regime")]
        grouped = df.groupby("inst")
        self.static_feats = np.stack(
            [grouped.get_group(i)[feat_cols].values for i in sorted(grouped.groups)]
        )

        # cache placeholders
        self.price_cache = None     # shape: (n_inst, 100)
        self.feature_cache = None   # shape: (n_inst, 20, D)
        self.n_inst, self.seq_len, self.D = self.static_feats.shape

    def step(self, price_matrix: np.ndarray) -> np.ndarray:
        """
        Ingest the full historical price_matrix of shape (n_inst, T).

        Both caches are derived from the inputs on every call, so the
        result does not depend on how many times ``step`` was called
        before or on T growing by exactly one between calls.

        Args:
            price_matrix: 2D array, one row per instrument, one column per
                timestep, oldest first.

        Returns:
            Array of shape (n_inst,) with the regime label of the latest
            timestep for each instrument, or an array filled with -999
            when T < 120 or no feature row exists for this timestep.

        Raises:
            ValueError: if price_matrix is not 2D.
        """
        price_matrix = np.asarray(price_matrix)
        if price_matrix.ndim != 2:
            raise ValueError("price_matrix must be 2D: (n_inst, T)")
        n_inst, T = price_matrix.shape

        # not enough history
        if T < self.PRICE_WINDOW + self.FEATURE_WINDOW:
            return np.full((n_inst,), self.NOT_READY, dtype=int)

        # Always keep exactly the trailing window of the supplied history.
        self.price_cache = price_matrix[:, -self.PRICE_WINDOW:].copy()

        # zero-based index into static_feats axis 1 for this timestep
        static_idx = T - self.PRICE_WINDOW - 1
        if not (0 <= static_idx < self.seq_len):
            # out of bounds static features
            return np.full((n_inst,), self.NOT_READY, dtype=int)

        # Window of feature rows ending at static_idx.  T >= 120 guarantees
        # static_idx >= 19, so the start index is never negative.
        start = static_idx - self.FEATURE_WINDOW + 1
        self.feature_cache = self.static_feats[:, start:static_idx + 1, :].copy()

        preds = np.zeros(self.n_inst, dtype=int)
        for i in range(self.n_inst):
            # predict_regimes is used elsewhere in this file as returning
            # one label per input row; only the newest row's label is
            # wanted.  reshape(-1) also tolerates a scalar return.
            seq_preds = np.asarray(
                predict_regimes(self.model, self.device, self.feature_cache[i])
            )
            preds[i] = seq_preds.reshape(-1)[-1]
        return preds


In [None]:
# AWP_pipeline.py
# Builds features CSV and provides streaming warmup + inference pipeline

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from precision_labeller import plot_all_regimes_long
from AWP_inference import load_model, predict_regimes

# ── Part A: Offline feature construction ─────────────────────────────────────
def compute_slope_vol(prices: pd.Series, slope_win: int, vol_win: int) -> pd.DataFrame:
    """
    Rolling trend slope and rolling volatility of log prices.

    Args:
        prices: price series; its index is carried over to the result.
        slope_win: window length for the least-squares line fitted to
            log prices (time axis is 0..slope_win-1 inside each window).
        vol_win: window length for the standard deviation of log returns.

    Returns:
        DataFrame with columns "slope" and "vol".  Rows whose window is
        not yet full are NaN (min_periods equals the window length).
    """
    log_prices = np.log(prices)

    # Design matrix [t, 1] shared by every window: column 0 is the time
    # step, column 1 the intercept term.
    steps = np.arange(slope_win)
    design = np.array([steps, np.ones_like(steps)]).T

    def _fitted_slope(window_values):
        # lstsq returns (coefficients, residuals, rank, singular values);
        # coefficient 0 is the slope, coefficient 1 the intercept.
        coefficients = np.linalg.lstsq(design, window_values, rcond=None)[0]
        return coefficients[0]

    slope_roller = log_prices.rolling(window=slope_win, min_periods=slope_win)
    rolling_slope = slope_roller.apply(_fitted_slope, raw=True)

    log_returns = log_prices.diff()
    vol_roller = log_returns.rolling(window=vol_win, min_periods=vol_win)
    rolling_vol = vol_roller.std()

    return pd.DataFrame({"slope": rolling_slope, "vol": rolling_vol})


def build_feature_matrix(
    price_file: str,
    drop_last: int = 10,
    output_csv: str = "features_all_models4.csv"
):
    """
    Write one CSV of regime features plus autolabelled regimes.

    The price file is read as whitespace-separated text without a header;
    each column is treated as one instrument.  For every instrument the
    last ``drop_last`` timesteps are excluded, three regime features are
    computed, and the rows of all instruments are stacked into one table.

    Args:
        price_file: path to the whitespace-separated price table.
        drop_last: number of trailing timesteps left out per instrument.
        output_csv: destination path of the combined CSV.

    Output columns: "ma_reg", "ema_reg", "slope_vol_reg", "true_regime",
    "inst" (column position of the instrument) and "time" (row position
    within the instrument).  "slope_vol_reg" is only filled where both
    rolling windows of compute_slope_vol are complete; earlier rows stay NaN.
    """
    price_table = pd.read_csv(price_file, sep=r"\s+", header=None)
    per_instrument = []

    for inst in range(price_table.shape[1]):
        close = price_table.iloc[:, inst]
        T = len(close)
        keep = T - drop_last  # number of leading timesteps that are kept

        labels = plot_all_regimes_long(end_point=T, plot_graph=False, inst=inst)
        true_regs = pd.Series(labels[:keep], name="true_regime")

        feats = pd.DataFrame(index=true_regs.index)
        logp = np.log(close)

        # MA regime: 0 when the 70-step mean is above the 5-step mean, else 2
        fast_ma = logp.rolling(window=5, min_periods=1).mean()
        slow_ma = logp.rolling(window=70, min_periods=1).mean()
        feats["ma_reg"] = np.where(slow_ma > fast_ma, 0, 2)[:keep]

        # EMA regime: 2 when the span-5 EMA is above the span-50 EMA, else 0
        fast_ema = logp.ewm(span=5, adjust=False).mean()
        slow_ema = logp.ewm(span=50, adjust=False).mean()
        feats["ema_reg"] = np.where(fast_ema > slow_ema, 2, 0)[:keep]

        # Slope/Vol regime: 2 when the slope is positive and volatility is
        # below its median over the kept rows, else 0
        sv = compute_slope_vol(close, slope_win=30, vol_win=100).dropna()
        idx = sv.index[sv.index < keep]
        slope = sv.loc[idx, "slope"]
        vol = sv.loc[idx, "vol"]
        calm_uptrend = (slope > 0) & (vol < vol.median())
        feats.loc[idx, "slope_vol_reg"] = np.where(calm_uptrend, 2, 0)

        # ... (other feature routines) ...

        feats["true_regime"] = true_regs
        feats["inst"] = inst
        feats["time"] = feats.index
        per_instrument.append(feats.reset_index(drop=True))

    combined = pd.concat(per_instrument, ignore_index=True)
    combined.to_csv(output_csv, index=False)
    print(f"✅ Features written to: {output_csv}")


# ── Part B: Streaming warmup + efficient inference pipeline ─────────────────────────
class StreamingRegimePredictor:
    """
    Replay precomputed regime features against a growing price history.

    Each call to ``step`` receives the full price history so far for all
    instruments and returns one regime label per instrument.  While fewer
    than 120 price columns are available (100-price window plus 20-row
    feature window), or when no precomputed feature row exists for the
    current timestep, ``step`` returns an array filled with -999.

    Attributes:
        model, device: the pair returned by ``load_model``.
        static_feats: array (n_inst, seq_len, D) of feature rows read from
            the features CSV, one slab per instrument id in sorted order.
        price_cache: last 100 prices per instrument, or None before the
            first call with enough history.
        feature_cache: the 20 feature rows fed to the model on the latest
            successful call, shape (n_inst, 20, D), or None.
    """

    PRICE_WINDOW = 100    # number of trailing prices kept in price_cache
    FEATURE_WINDOW = 20   # number of feature rows fed to the model
    NOT_READY = -999      # label returned while no prediction is possible

    def __init__(self, checkpoint_name: str, features_csv: str):
        """
        Load the model checkpoint and the precomputed feature table.

        Args:
            checkpoint_name: passed unchanged to ``load_model``.
            features_csv: CSV with an "inst" column plus feature columns;
                "inst", "time" and "true_regime" are not used as features.

        Raises:
            ValueError: from ``np.stack`` when instruments have different
                numbers of rows.
        """
        # load frozen LSTM tagger
        self.model, self.device = load_model(checkpoint_name)

        # NOTE(review): step() maps a history of T prices to feature row
        # T - 101, i.e. it assumes row 0 of every instrument corresponds to
        # time index 100.  build_feature_matrix in this file writes rows
        # starting at time 0 -- confirm the CSV passed here has already had
        # its first 100 rows per instrument removed.
        df = pd.read_csv(features_csv)
        feat_cols = [c for c in df.columns if c not in ("inst", "time", "true_regime")]
        grouped = df.groupby("inst")
        self.static_feats = np.stack(
            [grouped.get_group(i)[feat_cols].values for i in sorted(grouped.groups)]
        )

        # cache placeholders
        self.price_cache = None     # shape: (n_inst, 100)
        self.feature_cache = None   # shape: (n_inst, 20, D)
        self.n_inst, self.seq_len, self.D = self.static_feats.shape

    def step(self, price_matrix: np.ndarray) -> np.ndarray:
        """
        Ingest the full historical price_matrix of shape (n_inst, T).

        Both caches are derived from the inputs on every call, so the
        result does not depend on how many times ``step`` was called
        before or on T growing by exactly one between calls.

        Args:
            price_matrix: 2D array, one row per instrument, one column per
                timestep, oldest first.

        Returns:
            Array of shape (n_inst,) with the regime label of the latest
            timestep for each instrument, or an array filled with -999
            when T < 120 or no feature row exists for this timestep.

        Raises:
            ValueError: if price_matrix is not 2D.
        """
        price_matrix = np.asarray(price_matrix)
        if price_matrix.ndim != 2:
            raise ValueError("price_matrix must be 2D: (n_inst, T)")
        n_inst, T = price_matrix.shape
        if T < self.PRICE_WINDOW + self.FEATURE_WINDOW:
            return np.full((n_inst,), self.NOT_READY, dtype=int)

        # Always keep exactly the trailing window of the supplied history.
        self.price_cache = price_matrix[:, -self.PRICE_WINDOW:].copy()

        # row of static_feats that belongs to the newest timestep
        static_idx = T - self.PRICE_WINDOW - 1
        if static_idx < 0 or static_idx >= self.seq_len:
            return np.full((n_inst,), self.NOT_READY, dtype=int)

        # Window of feature rows ending at static_idx.  T >= 120 guarantees
        # static_idx >= 19, so the start index is never negative.
        start = static_idx - self.FEATURE_WINDOW + 1
        self.feature_cache = self.static_feats[:, start:static_idx + 1, :].copy()

        # run inference and keep *only* the latest timestep per instrument
        preds = np.zeros(self.n_inst, dtype=int)
        for i in range(self.n_inst):
            seq_preds = predict_regimes(
                self.model, self.device, self.feature_cache[i]
            )            # one label per feature row
            preds[i] = seq_preds[-1]  # grab the *last* regime in that sequence

        return preds


if __name__ == "__main__":
    # Replay the price file one timestep at a time: at step t the predictor
    # is handed the first t columns of the (n_inst, T) price matrix.
    price_df = pd.read_csv("prices.txt", sep=r"\s+", header=None)
    prices = price_df.values.T  # file is (T, n_inst); predictor wants (n_inst, T)

    pipeline = StreamingRegimePredictor(
        checkpoint_name="bilstm_tagger.pth",
        features_csv="features_all_models4.csv",
    )

    total_steps = prices.shape[1]
    for t in range(1, total_steps + 1):
        out = pipeline.step(prices[:, :t])
        # An all -999 result means the predictor has nothing to report yet.
        still_warming_up = (out == -999).all()
        if not still_warming_up:
            print(f"Predictions at t={t}:", out)


In [None]:
%matplotlib inline
import sys
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# make sure Python can find your AWP code
sys.path.append("model_AWP")

from AWP_inference import load_model, predict_regimes
from precision_labeller import plot_all_regimes_long

# ─── helper to find contiguous segments ────────────────────────────────────────
def get_segments(reg):
    """
    Split a 1-D label sequence into runs of equal consecutive values.

    Args:
        reg: 1-D sequence of labels (NumPy array or anything
            ``np.asarray`` accepts, e.g. a list).

    Returns:
        List of ``(start, end, label)`` tuples, one per run, in order.
        ``start`` and ``end`` are inclusive positions in ``reg``.
        An empty input yields an empty list.
    """
    reg = np.asarray(reg)
    if reg.size == 0:
        # Nothing to segment; indexing reg[0] below would fail.
        return []
    # Positions i where reg[i] differs from reg[i + 1], i.e. run ends.
    changes = np.flatnonzero(reg[1:] != reg[:-1])
    starts = np.concatenate(([0], changes + 1))
    ends = np.concatenate((changes, [reg.size - 1]))
    return list(zip(starts, ends, reg[starts]))

# ─── colour maps ───────────────────────────────────────────────────────────────
# Both maps are indexed by the integer regime label (0, 1, 2).
true_cmap = ListedColormap(["#ccffcc","#f0f0f0","#ffcccc"])
pred_cmap = ListedColormap(["#66cc66","#b0b0b0","#ff6666"])

# ─── 1) load the model ─────────────────────────────────────────────────────────
model, device = load_model("bilstm_tagger.pth")

# ─── 2) load the feature‐matrix and prices ────────────────────────────────────
df = pd.read_csv("features_all_models4.csv")
# drop your first-100 warmups exactly as you did at training time.
# cumcount() numbers the rows of each instrument from 0, so this keeps rows
# 100 onward per instrument without going through groupby.apply (which newer
# pandas deprecates when the function touches the grouping column).  Rows keep
# their order from the CSV.
df = df[df.groupby("inst").cumcount() >= 100].reset_index(drop=True)

price_df = pd.read_csv("prices.txt", sep=r"\s+", header=None)

feat_cols = [c for c in df.columns if c not in ("inst","time","true_regime")]

# ─── choose an instrument to demo ──────────────────────────────────────────────
inst = 0
sub  = df[df["inst"]==inst].reset_index(drop=True)
T    = len(sub)

# ─── 3) get true regimes from the autolabeller ────────────────────────────────
# Use the autolabelled column stored next to the features: it is row-aligned
# with `sub` (and therefore with `pred_seq` and `price` below, which all start
# at raw time index 100).  Re-running the labeller with end_point=T would give
# labels for raw times 0..T-1 and shift the true panel by 100 steps.
# Cast to int so the colour maps treat the values as indices, not fractions.
true_seq = sub["true_regime"].to_numpy().astype(int)

# ─── 4) run your offline inference once ───────────────────────────────────────
X_inst    = sub[feat_cols].values.astype(np.float32)  # shape (T, D)
pred_seq  = predict_regimes(model, device, X_inst)    # shape (T,)

# ─── 5) mask out any t<120 (still “warming up”) ──────────────────────────────
# NOTE(review): positions here are relative to the already-trimmed rows, so
# this hides raw times below 220 -- confirm 120 (rather than 20) is intended.
warmup = 120
masked_pred = np.full_like(pred_seq, fill_value=-1)  # or -999 if you prefer
masked_pred[warmup:] = pred_seq[warmup:]

# ─── 6) align prices (remember you dropped 100 warmups) ───────────────────────
price = price_df.iloc[100:100+T, inst].values  # length T

# ─── 7) now plot ──────────────────────────────────────────────────────────────
fig, (ax_true, ax_pred) = plt.subplots(
    2,1, sharex=True, figsize=(14,6),
    gridspec_kw={"height_ratios":[1,1]}
)

# One panel per label source: shaded regime spans behind the price line.
panels = (
    (ax_true, true_seq, true_cmap, "TRUE"),
    (ax_pred, masked_pred, pred_cmap, "PREDICTED"),
)
for ax, seq, cmap, kind in panels:
    # Segment positions are relative to the slice, hence the +warmup shift.
    for s,e,lbl in get_segments(seq[warmup:]):
        ax.axvspan(s+warmup, e+warmup, color=cmap(lbl), alpha=0.5, linewidth=0)
    ax.plot(price, "k-", linewidth=1.5, label="Price")
    ax.set_title(f"Inst {inst} — {kind} regimes (t≥{warmup})")
    ax.set_ylabel("Price")
    ax.legend(loc="upper left")
ax_pred.set_xlabel("Time Step")

plt.tight_layout()
plt.show()
