In [None]:
import numpy as np
from AWP_inference import load_model, predict_regimes

# — assume you have a function like this somewhere —
#    it takes raw prices shape (n_inst,100) and returns
#    features shape (n_inst, 20, D)
def compute_feature_window(price_window: np.ndarray) -> np.ndarray:
    """
    Placeholder for the per-window feature builder; it is not implemented yet.

    price_window: np.ndarray, shape (n_inst, 100) of raw prices
    returns:      np.ndarray, shape (n_inst, 20, D) of feature sequences
    raises:       NotImplementedError on every call until an implementation
                  is supplied
    """
    # TODO: reimplement the same MA/EMA/slope/vol/etc logic here, restricted to
    # these 100 points. The result must be a 3D array.
    raise NotImplementedError


def online_inference(price_matrix: np.ndarray,
                     checkpoint: str="bilstm_tagger.pth") -> np.ndarray:
    """
    Predict the current regime of every instrument from its price history.

    price_matrix: np.ndarray, shape (n_inst, T)
    checkpoint:   path to the trained PyTorch .pth file
    returns:      np.ndarray, shape (n_inst,), dtype int. Holds the label of the
                  last step of each instrument's feature sequence, or -999 for
                  every instrument while fewer than 100 prices are available.
    """
    n_inst, T = price_matrix.shape

    if T >= 100:
        # features are built from the trailing 100 prices only
        feat_seqs = compute_feature_window(price_matrix[:, -100:])

        # the checkpoint is (re)loaded on every call
        model, device = load_model(checkpoint)

        # tag each instrument's sequence and keep only the final label
        preds = np.zeros((n_inst,), dtype=int)
        for k in range(n_inst):
            preds[k] = predict_regimes(model, device, feat_seqs[k])[-1]
        return preds

    # warm-up: not enough history yet
    return np.full((n_inst,), -999, dtype=int)


In [None]:
# AWP_pipeline.py
# Builds features CSV and provides streaming warmup + inference pipeline

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from precision_labeller import plot_all_regimes_long
from AWP_inference import load_model, predict_regimes


# ── Part B: Streaming warmup + efficient inference pipeline ─────────────────────────
class StreamingRegimePredictor:
    """
    Stream in price data for multiple instruments and emit a 1D numpy array of
    regime predictions (one per instrument).

    Features are not computed from prices here: they are read once from a
    precomputed CSV and looked up by timestep. A prediction is produced once
    at least 120 price columns are available (100 warm-up prices + a 20-row
    feature window). Until then, and whenever the timestep falls outside the
    precomputed feature range, an array filled with -999 is returned.
    """
    def __init__(self, checkpoint_name: str, features_csv: str):
        """
        checkpoint_name: path handed to `load_model` (frozen tagger checkpoint)
        features_csv:    CSV with columns "inst", "time", "true_regime" plus the
                         feature columns; every instrument must have the same
                         number of rows so the groups can be stacked.
        """
        # load frozen LSTM tagger
        self.model, self.device = load_model(checkpoint_name)
        # load static features for all instruments
        # NOTE(review): the indexing in `step` assumes row 0 of each instrument
        # is the first post-warm-up timestep (t=100) -- confirm against the CSV.
        df = pd.read_csv(features_csv)
        # pivot into array shape (n_inst, seq_len, D); get_group keeps each
        # instrument's rows in file order, so no pre-sorting pass is needed
        feat_cols = [c for c in df.columns if c not in ("inst", "time", "true_regime")]
        grouped = df.groupby("inst")
        self.static_feats = np.stack([grouped.get_group(i)[feat_cols].values
                                      for i in sorted(grouped.groups)])
        # cache placeholders
        self.price_cache = None     # shape: (n_inst, 100)
        self.feature_cache = None   # shape: (n_inst, 20, D)
        self.n_inst, self.seq_len, self.D = self.static_feats.shape

    def step(self, price_matrix: np.ndarray) -> np.ndarray:
        """
        Ingest the full historical price_matrix of shape (n_inst, T).

        If T < 120, returns np.full((n_inst,), -999). Once T >= 120:
         - price_cache is refreshed to the last 100 prices
         - feature_cache is refreshed to the 20 feature rows ending at the
           current timestep
         - inference is run per instrument and the label of the last step of
           each sequence is kept

        Returns:
          preds: np.ndarray of shape (n_inst,) of current regime labels,
                 or np.full((n_inst,), -999) if not warmed up or if the
                 timestep has no precomputed feature row.

        Raises:
          ValueError: if price_matrix is not 2D.
        """
        if price_matrix.ndim != 2:
            raise ValueError("price_matrix must be 2D: (n_inst, T)")
        n_inst, T = price_matrix.shape
        # not enough history
        if T < 120:
            return np.full((n_inst,), -999, dtype=int)
        # Always re-slice rather than roll-by-one: identical when called once
        # per new bar, and still correct if a call is skipped.
        self.price_cache = price_matrix[:, -100:].copy()
        # zero-based index into static_feats axis 1 for this timestep
        static_idx = T - 100 - 1
        if not (0 <= static_idx < self.seq_len):
            # out of bounds static features
            return np.full((n_inst,), -999, dtype=int)
        # 20-row window ending at static_idx (static_idx >= 19 because T >= 120).
        # Slicing by index keeps the window aligned with T even when the first
        # call arrives later than T == 120.
        self.feature_cache = self.static_feats[:, static_idx - 19:static_idx + 1, :].copy()
        # now caches are warm: run inference
        preds = np.zeros(self.n_inst, dtype=int)
        for i in range(self.n_inst):
            feats_i = self.feature_cache[i]  # shape (20, D)
            # predict_regimes yields one label per step; keep the latest only
            seq_preds = predict_regimes(self.model, self.device, feats_i)
            preds[i] = seq_preds[-1]
        return preds


In [None]:
import numpy as np
import pandas as pd
from AWP_inference import load_model, predict_regimes

# ─── 0) LOAD MODEL ──────────────────────────────────────────────────────────────
model, device = load_model("bilstm_tagger.pth")

# ─── 1) LOAD PRECOMPUTED FEATURES ──────────────────────────────────────────────
# (must match the CSV you used in offline training)
df_feat = pd.read_csv("features_all_models4.csv")
# Drop the first 100 (warm-up) rows of every instrument. A cumcount filter is
# used instead of groupby.apply so the "inst" column is guaranteed to survive
# (newer pandas excludes grouping columns from the frames passed to apply).
df_feat = df_feat[df_feat.groupby("inst").cumcount() >= 100].reset_index(drop=True)
feat_cols   = [c for c in df_feat.columns if c not in ("inst","time","true_regime")]
grouped     = df_feat.groupby("inst")
# one (T_feat, D) block per instrument, stacked in sorted instrument order;
# np.stack requires every instrument to have the same number of rows
static_feats = np.stack([grouped.get_group(i)[feat_cols].values
                         for i in sorted(grouped.groups)])
# static_feats.shape == (n_inst, T_feat, D)
n_inst, T_feat, D = static_feats.shape

# ─── 2) DEFINE STEP FUNCTION ───────────────────────────────────────────────────
def step_from_prices(prices_2d: np.ndarray) -> np.ndarray:
    """
    Look up the precomputed feature window for the current timestep and tag it.

    prices_2d: shape (n_inst, T); only its shape is used here
    returns:   shape (n_inst,) with the predicted regime at time T for each
               instrument, or -999 for all instruments when no full 20-row
               feature window ending at T exists (in practice T < 120, or T
               beyond the precomputed features)
    """
    n_inst_, T = prices_2d.shape
    assert n_inst_ == n_inst, f"expected {n_inst} instruments"

    # static_feats[:, 0] holds the features at t=100, hence the offset
    last_row = T - 100 - 1
    if T < 100 or last_row < 19 or last_row >= T_feat:
        # either no history yet or not enough rows for the 20-step LSTM window
        return np.full(n_inst, -999, dtype=int)

    first_row = last_row - 19
    preds = np.zeros(n_inst, dtype=int)
    for k in range(n_inst):
        window_k = static_feats[k, first_row:last_row + 1, :]  # (20, D)
        # the tagger returns one label per step; keep only the final one
        preds[k] = predict_regimes(model, device, window_k)[-1]
    return preds

# ─── 3) SIMULATE STREAMING ────────────────────────────────────────────────────
# Read raw prices (whitespace-separated, one row per timestep, no header) and
# transpose → (n_inst, T_raw).
price_df = pd.read_csv("prices.txt", sep=r"\s+", header=None)
prices   = price_df.values.T
n_inst_, T_raw = prices.shape

# Replay the history one bar at a time: at step t the predictor sees only the
# first t columns. The -999 sentinel is printed while it is still warming up.
print(f"Streaming from t=1 to t={T_raw}  (n_inst={n_inst})")
for t in range(1, T_raw+1):
    preds = step_from_prices(prices[:, :t])
    print(f"t={t:3d}: {preds}")


In [None]:
import numpy as np
import pandas as pd
from AWP_pipeline import compute_features_for_latest  # adjust import as needed

# 1) Load the price data: rows = timesteps, columns = instruments
#    (whitespace-separated text, no header row)
df = pd.read_csv('prices.txt', sep=r'\s+', header=None)

# 2) Convert to a NumPy array of shape (n_inst, T)
#    df.values is (T, n_inst) so we transpose
prices = df.values.T

n_inst, T = prices.shape

# 3) Slide a 100-bar window (the "trailing 99 + current bar") from t=99 up to T-1
for t in range(99, T):
    # extract a window of shape (n_inst, 100) ending at (and including) bar t
    window = prices[:, t-99:t+1]

    # compute the regime features per instrument at this timestep
    # (expected to return (n_inst, 9) -- TODO confirm against AWP_pipeline)
    feats = compute_features_for_latest(window)  # returns (n_inst, 9)

    # print them; you could also store them in a list or DataFrame
    print(f"t={t:4d} →")
    for inst_idx, inst_feats in enumerate(feats):
        print(f"  Inst {inst_idx:2d}: {inst_feats.tolist()}")
    print('-' * 50)


In [None]:
import numpy as np
import pandas as pd
from precision_labeller import plot_all_regimes_long


def build_feature_matrix_from_array(
    prices_array: np.ndarray,
    output_csv: str = "features_all_models_from_array.csv"
) -> None:
    """
    Build a per-timestep feature matrix of regime indicators for every instrument
    in a 2D price array and write it to a CSV file.

    For each instrument (column) nine indicator columns are computed over the
    full series: ma_reg, ema_reg, slope_vol_reg, macd_reg, kalman_reg, fib_reg,
    psar_reg, zscore_reg and wret_reg. Three more columns are appended:
    "true_regime", "inst" and "time". The per-instrument frames are concatenated
    in instrument order and written to `output_csv` without the index.

    Args:
        prices_array: 2D array of shape (T, n_inst), where each column is one instrument's price series.
        output_csv: Path to write the resulting CSV file.

    Returns:
        None. The result is written to disk and a confirmation line is printed.

    Raises:
        ValueError: if `prices_array` is not 2-dimensional.
    """
    # Validate input
    if prices_array.ndim != 2:
        raise ValueError("prices_array must be a 2D array of shape (T, n_inst)")

    T, n_inst = prices_array.shape
    all_rows = []

    for inst in range(n_inst):
        close = pd.Series(prices_array[:, inst])
        # only one price per bar is available, so high/low are copies of close
        high = close.copy()
        low = close.copy()

        # Ground truth regimes for full series
        # (presumably one label per timestep up to end_point -- verify against
        # precision_labeller; values are aligned to `features` by index below)
        true_regs = plot_all_regimes_long(end_point=T, plot_graph=False, inst=inst)
        true_regs = pd.Series(true_regs, name="true_regime")

        # Prepare DataFrame for this instrument
        features = pd.DataFrame(index=np.arange(T))
        logp = np.log(close)

        # MA-based regime: 0 when the 70-bar mean is above the 5-bar mean, else 2.
        # min_periods=1 means early rows use whatever history exists.
        ma_s = logp.rolling(window=5, min_periods=1).mean()
        ma_l = logp.rolling(window=70, min_periods=1).mean()
        features["ma_reg"] = np.where(ma_l > ma_s, 0, 2)

        # EMA-based regime: 2 when the span-5 EMA is above the span-50 EMA, else 0
        ema_s = logp.ewm(span=5, adjust=False).mean()
        ema_l = logp.ewm(span=50, adjust=False).mean()
        features["ema_reg"] = np.where(ema_s > ema_l, 2, 0)

        # Slope/Vol regime (only computed where full window available)
        # NOTE(review): compute_slope_vol is neither defined nor imported in
        # this cell; it must already exist in the session.
        sv_df = compute_slope_vol(close, slope_win=30, vol_win=100).dropna()
        idx = sv_df.index  # valid window endpoints
        slope = sv_df["slope"]
        vol = sv_df["vol"]
        # median over ALL valid rows of the series, so the threshold applied
        # to an early row also depends on later data
        median_vol = vol.median()
        # Initialize column with NaN so early indexes remain NaN
        features["slope_vol_reg"] = np.nan
        features.loc[idx, "slope_vol_reg"] = np.where(
            (slope > 0) & (vol < median_vol), 2, 0
        )

        # MACD regime: line = EMA(span 50) - EMA(span 90), signal = EMA(span 40) of the line
        ema_s2 = logp.ewm(span=50, adjust=False).mean()
        ema_l2 = logp.ewm(span=90, adjust=False).mean()
        macd = ema_s2 - ema_l2
        signal = macd.ewm(span=40, adjust=False).mean()
        features["macd_reg"] = np.where(macd > signal, 2, 0)

        # Kalman regime: scalar filter on log price; 0.01 is added to the
        # variance at each prediction step and 10.0 is the noise term in the gain
        x_est = np.zeros(T)
        P = np.zeros(T)
        x_est[0], P[0] = logp.iloc[0], 1.0
        for t in range(1, T):
            x_pred = x_est[t - 1]
            P_pred = P[t - 1] + 0.01
            K = P_pred / (P_pred + 10.0)
            x_est[t] = x_pred + K * (logp.iloc[t] - x_pred)
            P[t] = (1 - K) * P_pred
        features["kalman_reg"] = np.where(
            logp > x_est, 2, 0
        )

        # Fibonacci regime: position of close inside the trailing 50-bar range.
        # Rows without a full 50-bar window have NaN bounds; both comparisons
        # are then False and the row gets 1.
        high_win = close.rolling(window=50, min_periods=50).max()
        low_win = close.rolling(window=50, min_periods=50).min()
        fib_range = high_win - low_win
        lower = low_win + 0.618 * fib_range
        upper = low_win + 0.786 * fib_range
        features["fib_reg"] = np.where(
            close > upper, 2,
            np.where(close < lower, 0, 1)
        )

        # PSAR regime: the stop starts at the first low and the extreme point at
        # the first high; the acceleration factor grows by 0.01 per new extreme,
        # is capped at max_step, and resets to 0.01 on every trend flip
        psar = np.zeros(T)
        trend_up = True
        af, max_step = 0.01, 0.10
        ep = high.iloc[0]
        psar[0] = low.iloc[0]
        for t in range(1, T):
            prev = psar[t - 1]
            psar[t] = prev + af * (ep - prev)
            if trend_up:
                if low.iloc[t] < psar[t]:
                    # price broke below the stop: flip to a down-trend
                    trend_up = False
                    psar[t], ep, af = ep, low.iloc[t], 0.01
                elif high.iloc[t] > ep:
                    ep = high.iloc[t]
                    af = min(af + 0.01, max_step)
            else:
                if high.iloc[t] > psar[t]:
                    # price broke above the stop: flip to an up-trend
                    trend_up = True
                    psar[t], ep, af = ep, high.iloc[t], 0.01
                elif low.iloc[t] < ep:
                    ep = low.iloc[t]
                    af = min(af + 0.01, max_step)
        features["psar_reg"] = np.where(
            close > psar, 2, 0
        )

        # Z-score regime: 90-bar z-score of close with a +/-0.5 band.
        # Rows with fewer than 90 bars have a NaN z and therefore get 1.
        ma90 = close.rolling(window=90, min_periods=90).mean()
        sd90 = close.rolling(window=90, min_periods=90).std()
        z = (close - ma90) / sd90
        features["zscore_reg"] = np.where(
            z > 0.5, 2,
            np.where(z < -0.5, 0, 1)
        )

        # Weighted-return regime: sign of a 45-bar weighted sum of simple
        # returns with normalized sqrt weights (latest return weighted most).
        # Rows without 45 valid returns are NaN and therefore get 1.
        r = close.pct_change()
        weights = np.arange(1, 46) ** 0.5
        weights /= weights.sum()
        wr = r.rolling(window=45, min_periods=45).apply(lambda x: np.dot(x, weights), raw=True)
        features["wret_reg"] = np.where(
            wr > 0, 2,
            np.where(wr < 0, 0, 1)
        )

        # Final columns
        features["true_regime"] = true_regs
        features["inst"] = inst
        features["time"] = features.index

        all_rows.append(features.reset_index(drop=True))

    # stack all instruments vertically and persist
    final_df = pd.concat(all_rows, ignore_index=True)
    final_df.to_csv(output_csv, index=False)
    print(f"✅ Features written to: {output_csv}")

In [None]:


import numpy as np

def load_prices_to_array(price_file: str, delim_whitespace: bool = True) -> np.ndarray:
    """
    Load a numeric text file (one row per timestep, one column per instrument)
    into a NumPy array.

    Args:
        price_file: Path to your prices.txt file.
        delim_whitespace: If True (default), fields are split on any whitespace.
                         If False, the file is parsed as comma-delimited.

    Returns:
        The array produced by np.loadtxt: shape (T, n_inst) for a file with
        several rows and columns, where T is the number of timesteps.
    """
    # delimiter=None is np.loadtxt's default and means "split on whitespace"
    separator = None if delim_whitespace else ','
    return np.loadtxt(price_file, delimiter=separator)


# Load the whole price history; rows are timesteps, columns are instruments.
prices_array = load_prices_to_array("prices.txt")
print(prices_array.shape)  # (number_of_timesteps, number_of_instruments)


# Example usage:
# prices_array: numpy array of shape (T, n_inst)
# The feature table is written to "features10.csv" instead of the default path.
build_feature_matrix_from_array(prices_array, output_csv="features10.csv")


In [None]:
# regime_inference.py
import numpy as np
import pandas as pd

# ───────────────────────── helpers ───────────────────────────────────────
def _ols_slope(y: np.ndarray) -> float:
    """Return the least-squares slope of `y` regressed on 0..len(y)-1 with an intercept."""
    steps = np.arange(len(y))
    # design matrix: one column for the time index, one for the intercept
    design = np.stack([steps, np.ones_like(steps)], axis=1)
    coeffs = np.linalg.lstsq(design, y, rcond=None)[0]
    # coeffs holds (slope, intercept); only the slope is of interest
    return coeffs[0]


# ─── updated helper ─────────────────────────────────────────────────────
def _slope_vol_reg(close: np.ndarray,
                   idx: int,
                   slope_win: int = 30,
                   vol_win: int   = 100) -> float | int:
    logp = np.log(close)

    slope_series = (
        pd.Series(logp)
          .rolling(slope_win, min_periods=slope_win)
          .apply(lambda arr: _ols_slope(arr), raw=True)
    )
    rtn = pd.Series(logp).diff()
    vol_series = rtn.rolling(vol_win, min_periods=vol_win).std()

    slope = slope_series.iloc[idx]
    vol   = vol_series.iloc[idx]

    if np.isnan(slope) or np.isnan(vol):
        return np.nan

    # 100-bar rolling *median* (causal, matches training pipeline)
    median_vol = (
        vol_series
          .rolling(window=100, min_periods=100)
          .median()
          .iloc[idx]
    )

    return 2 if (slope > 0 and vol < median_vol) else 0



# ────────────────────── pipeline (no drop_last) ──────────────────────────
def compute_regime_features_window(prices_window: np.ndarray) -> np.ndarray:
    """
    Compute the nine regime indicators at the latest bar of each instrument.

    Parameters
    ----------
    prices_window : np.ndarray
        Shape (n_inst, win_len), e.g. (50, 100).  Each row is one instrument's
        price history ending at the timestep for which we want predictions.
        Indicators are evaluated at the last column only.

    Returns
    -------
    np.ndarray
        Float array of shape (n_inst, 9).  Columns in training order:
        [ma, ema, slope_vol, macd, kalman, fib, psar, zscore, wret].
        An entry is NaN when the window is too short for that indicator.
    """
    n_inst, win_len = prices_window.shape
    idx = win_len - 1                     # evaluate at the latest bar

    out = np.full((n_inst, 9), np.nan)
    # normalized sqrt weights for the 45-bar weighted return (latest bar heaviest)
    sqrt_weights = np.arange(1, 46, dtype=float) ** 0.5
    sqrt_weights /= sqrt_weights.sum()

    for i in range(n_inst):
        close = prices_window[i]
        logp  = np.log(close)
        logp_s  = pd.Series(logp)
        close_s = pd.Series(close)

        # MA regime (min_periods=1 as in the CSV feature builder, so a window
        # shorter than 70 bars still yields a comparison instead of NaN)
        ma_s = logp_s.rolling(5, min_periods=1).mean().iloc[idx]
        ma_l = logp_s.rolling(70, min_periods=1).mean().iloc[idx]
        ma_reg = 0 if ma_l > ma_s else 2

        # EMA regime
        ema_s = logp_s.ewm(span=5,  adjust=False).mean().iloc[idx]
        ema_l = logp_s.ewm(span=50, adjust=False).mean().iloc[idx]
        ema_reg = 2 if ema_s > ema_l else 0

        # Slope/Vol regime
        sv_reg = _slope_vol_reg(close, idx)

        # MACD regime.  `span=` must be passed by keyword: the first positional
        # argument of ewm() is `com`, which gives a different smoothing factor
        # from the span-50 / span-90 EMAs used when the features were built.
        macd_line = (
            logp_s.ewm(span=50, adjust=False).mean()
            - logp_s.ewm(span=90, adjust=False).mean()
        )
        signal_line = macd_line.ewm(span=40, adjust=False).mean()
        macd_reg = 2 if macd_line.iloc[idx] > signal_line.iloc[idx] else 0

        # Kalman trend regime: scalar filter on log price
        proc_var, meas_var = 0.01, 10.0
        x_est = np.zeros(win_len)
        P     = np.zeros(win_len)
        x_est[0], P[0] = logp[0], 1.0
        for t in range(1, win_len):
            x_pred = x_est[t - 1]
            P_pred = P[t - 1] + proc_var
            K      = P_pred / (P_pred + meas_var)
            x_est[t] = x_pred + K * (logp[t] - x_pred)
            P[t]     = (1 - K) * P_pred
        kalman_reg = 2 if logp[idx] > x_est[idx] else 0

        # Fibonacci regime: a 50-bar window is complete from index 49 onwards
        if idx >= 49:
            win50 = close[idx - 49 : idx + 1]
            hi, lo = win50.max(), win50.min()
            rng = hi - lo
            upper, lower = lo + 0.786 * rng, lo + 0.618 * rng
            fib_reg = 2 if close[idx] > upper else 0 if close[idx] < lower else 1
        else:
            fib_reg = np.nan

        # PSAR regime (high == low == close, only one price per bar)
        psar = np.empty(win_len)
        trend_up, af, max_af = True, 0.01, 0.10
        ep = close[0]
        psar[0] = close[0]
        for t in range(1, win_len):
            psar[t] = psar[t - 1] + af * (ep - psar[t - 1])
            if trend_up:
                if close[t] < psar[t]:
                    # stop broken: flip to a down-trend and reset acceleration
                    trend_up, psar[t], ep, af = False, ep, close[t], 0.01
                elif close[t] > ep:
                    ep, af = close[t], min(af + 0.01, max_af)
            else:
                if close[t] > psar[t]:
                    # stop broken: flip to an up-trend and reset acceleration
                    trend_up, psar[t], ep, af = True, ep, close[t], 0.01
                elif close[t] < ep:
                    ep, af = close[t], min(af + 0.01, max_af)
        psar_reg = 2 if close[idx] > psar[idx] else 0

        # Z-score regime (needs 90 bars)
        ma90 = close_s.rolling(90).mean().iloc[idx]
        sd90 = close_s.rolling(90).std().iloc[idx]
        if np.isnan(ma90) or np.isnan(sd90):
            zscore_reg = np.nan
        else:
            z = (close[idx] - ma90) / sd90
            zscore_reg = 2 if z > 0.5 else 0 if z < -0.5 else 1

        # Weighted-return regime: 45 returns need 46 prices, hence idx >= 45
        if idx >= 45:
            r = close_s.pct_change().iloc[idx - 44 : idx + 1].values
            wr = np.dot(r, sqrt_weights)
            wret_reg = 2 if wr > 0 else 0 if wr < 0 else 1
        else:
            wret_reg = np.nan

        out[i] = [
            ma_reg, ema_reg, sv_reg, macd_reg, kalman_reg,
            fib_reg, psar_reg, zscore_reg, wret_reg,
        ]

    return out


# ──────────────────── I/O wrappers for prices.txt ───────────────────────
def _extract_window(price_file: str,
                    timestep: int,
                    win_len: int = 100) -> np.ndarray:
    """
    Slice the latest `win_len` bars (inclusive) ending at `timestep` from the
    price file and transpose to (n_inst, win_len).

    The file is read as whitespace-separated text without a header, one row per
    timestep and one column per instrument.

    Raises
    ------
    ValueError
        If `win_len` is not positive, if `timestep` is not a valid row index,
        or if fewer than `win_len` rows exist up to and including `timestep`.
    """
    if win_len < 1:
        raise ValueError(f"win_len must be positive, got {win_len}")

    df = pd.read_csv(price_file, sep=r"\s+", header=None)
    n_rows = df.shape[0]

    if not (0 <= timestep < n_rows):
        raise ValueError(f"timestep {timestep} out of range (0 … {n_rows-1})")
    if timestep < win_len - 1:
        # report the window size actually requested, not a fixed 100
        raise ValueError(f"Not enough history to build a {win_len}-bar window.")

    first_row = timestep - win_len + 1
    slice_df = df.iloc[first_row : timestep + 1, :]
    return slice_df.to_numpy().T            # (n_inst, win_len)


def infer_from_file(price_file: str,
                    timestep: int) -> np.ndarray:
    """
    Convenience wrapper: read the price file, cut the window that ends at
    `timestep`, and run the regime-feature pipeline on it.

    Note that the window requested here is 102 bars long, not 100, so the
    pipeline receives an (n_inst, 102) array.
    NOTE(review): presumably the two extra bars exist so the 100-return
    volatility is defined at the last bar -- confirm.
    """
    return compute_regime_features_window(
        _extract_window(price_file, timestep, win_len=102)
    )


In [None]:


import numpy as np

def load_prices_to_array(price_file: str, delim_whitespace: bool = True) -> np.ndarray:
    """
    Load a numeric text file (one row per timestep, one column per instrument)
    into a NumPy array.

    Args:
        price_file: Path to your prices.txt file.
        delim_whitespace: If True (default), fields are split on any whitespace.
                         If False, the file is parsed as comma-delimited.

    Returns:
        The array produced by np.loadtxt: shape (T, n_inst) for a file with
        several rows and columns, where T is the number of timesteps.
    """
    # delimiter=None is np.loadtxt's default and means "split on whitespace"
    separator = None if delim_whitespace else ','
    return np.loadtxt(price_file, delimiter=separator)


# Load the whole price history; rows are timesteps, columns are instruments.
prices_array = load_prices_to_array("prices.txt")
print(prices_array.shape)  # (number_of_timesteps, number_of_instruments)


# Example usage:
# prices_array: numpy array of shape (T, n_inst)
# The builder is taken from AWP_pipeline here, not from a definition in this cell.
from AWP_pipeline import build_feature_matrix_from_array
build_feature_matrix_from_array(prices_array)


In [None]:
import numpy as np

def load_prices_to_array(
    price_file: str,
    endpoint: int,
    delim_whitespace: bool = True
) -> np.ndarray:
    """
    Load a price file (one row per timestep, one column per instrument) and
    keep only rows 0..`endpoint` inclusive, giving shape (endpoint+1, n_inst).

    Args:
        price_file: Path to your prices.txt file.
        endpoint:   Zero-based index of the last row to include.
        delim_whitespace: If True (default), fields are split on any
                          whitespace; otherwise the file is comma-delimited.

    Raises:
        IndexError: if `endpoint` is negative or not smaller than the number
                    of rows in the file.
    """
    # delimiter=None is np.loadtxt's default and means "split on whitespace"
    separator = None if delim_whitespace else ','
    data = np.loadtxt(price_file, delimiter=separator)

    # the whole file is read first; the bounds check needs the row count
    row_count = data.shape[0]
    if endpoint >= row_count or endpoint < 0:
        raise IndexError(f"endpoint {endpoint} is out of bounds for data with {data.shape[0]} rows")

    last_row_exclusive = endpoint + 1
    return data[:last_row_exclusive, :]

# usage: keep rows 0..100 inclusive, i.e. the first 101 timesteps
prices_array = load_prices_to_array("prices.txt", endpoint=100)
# (a bare `print` is only a name lookup and shows nothing, so call it)
print(prices_array)
print(prices_array.shape)  # (101, number_of_instruments)

# then call your feature‐matrix builder as before:
build_feature_matrix_from_array(prices_array)


In [None]:
import numpy as np
import pandas as pd
from precision_labeller import plot_all_regimes_long


def build_feature_matrix_from_array(
    prices_array: np.ndarray
) -> None:
    """
    Compute the regime features at the latest timestep for each instrument,
    stack one row per instrument into a 2D array, and print it.

    Each row holds, in order: ma_reg, ema_reg, slope_vol_reg, macd_reg,
    kalman_reg, fib_reg, psar_reg, zscore_reg, wret_reg and the true regime
    label of the last timestep ('inst' and 'time' are not included).

    Args:
        prices_array: 2D array of shape (T, n_inst), where each column is one instrument's price series.

    Returns:
        None. The stacked (n_inst, 10) matrix is printed, not returned.

    Raises:
        ValueError: if `prices_array` is not 2-dimensional.
    """
    if prices_array.ndim != 2:
        raise ValueError("prices_array must be a 2D array of shape (T, n_inst)")

    T, n_inst = prices_array.shape
    latest_idx = T - 1
    feature_rows = []

    # normalized sqrt weights for the 45-bar weighted return; loop-invariant
    weights = np.arange(1, 46) ** 0.5
    weights /= weights.sum()

    for inst in range(n_inst):
        close = pd.Series(prices_array[:, inst])
        # only one price per bar is available, so high/low are copies of close
        high = close.copy()
        low = close.copy()
        logp = np.log(close)

        # True regime label: last element of the labeller's output
        true_regs = pd.Series(
            plot_all_regimes_long(end_point=T, plot_graph=False, inst=inst)
        )
        true_label = true_regs.iloc[-1]

        # MA regime: 0 when the 70-bar mean is above the 5-bar mean, else 2
        ma_s = logp.rolling(5, min_periods=1).mean().iloc[latest_idx]
        ma_l = logp.rolling(70, min_periods=1).mean().iloc[latest_idx]
        ma_reg = 0 if ma_l > ma_s else 2

        # EMA regime: 2 when the span-5 EMA is above the span-50 EMA, else 0
        ema_s = logp.ewm(span=5, adjust=False).mean().iloc[latest_idx]
        ema_l = logp.ewm(span=50, adjust=False).mean().iloc[latest_idx]
        ema_reg = 2 if ema_s > ema_l else 0

        # Slope/Vol regime (exactly at latest timestep)
        # NOTE(review): compute_slope_vol is neither defined nor imported in
        # this cell; it must already exist in the session.
        sv_df_full = compute_slope_vol(close, slope_win=30, vol_win=100)
        slope_vol_reg = np.nan
        if latest_idx in sv_df_full.index:
            slope_val = sv_df_full.loc[latest_idx, 'slope']
            vol_val = sv_df_full.loc[latest_idx, 'vol']
            # A NaN slope or vol means the window is incomplete. Comparing NaN
            # would silently give regime 0, so keep NaN in that case.
            if not (pd.isna(slope_val) or pd.isna(vol_val)):
                # compute median on all vol values where not NaN
                median_vol = sv_df_full['vol'].dropna().median()
                slope_vol_reg = 2 if (slope_val > 0 and vol_val < median_vol) else 0

        # MACD: line = EMA(span 50) - EMA(span 90), signal = EMA(span 40) of the line
        ema_s2_series = logp.ewm(span=50, adjust=False).mean()
        ema_l2_series = logp.ewm(span=90, adjust=False).mean()
        macd_series = ema_s2_series - ema_l2_series
        signal_series = macd_series.ewm(span=40, adjust=False).mean()
        macd_val = macd_series.iloc[latest_idx]
        signal_val = signal_series.iloc[latest_idx]
        macd_reg = 2 if macd_val > signal_val else 0

        # Kalman: scalar filter on log price
        x_est = np.zeros(T)
        P = np.zeros(T)
        x_est[0], P[0] = logp.iloc[0], 1.0
        for t in range(1, T):
            P_pred = P[t-1] + 0.01
            K = P_pred / (P_pred + 10.0)
            x_est[t] = x_est[t-1] + K * (logp.iloc[t] - x_est[t-1])
            P[t] = (1 - K) * P_pred
        kalman_reg = 2 if logp.iloc[latest_idx] > x_est[latest_idx] else 0

        # Fibonacci: position of close inside the trailing 50-bar range.
        # With fewer than 50 bars the bounds are NaN and the result is 1.
        high_win = close.rolling(50, min_periods=50).max().iloc[latest_idx]
        low_win = close.rolling(50, min_periods=50).min().iloc[latest_idx]
        fib_range = high_win - low_win
        lower = low_win + 0.618 * fib_range
        upper = low_win + 0.786 * fib_range
        fib_reg = 2 if close.iloc[latest_idx] > upper else (0 if close.iloc[latest_idx] < lower else 1)

        # PSAR
        psar = np.zeros(T)
        trend_up, af, max_step = True, 0.01, 0.10
        ep = high.iloc[0]
        # Seed the stop with the first low, as the CSV feature builder does.
        # Leaving it at 0 makes the stop start far below the price and changes
        # the whole trajectory.
        psar[0] = low.iloc[0]
        for t in range(1, T):
            prev = psar[t-1]
            psar[t] = prev + af * (ep - prev)
            if trend_up:
                if low.iloc[t] < psar[t]:
                    # stop broken: flip to a down-trend and reset acceleration
                    trend_up, psar[t], ep, af = False, ep, low.iloc[t], 0.01
                elif high.iloc[t] > ep:
                    ep, af = high.iloc[t], min(af+0.01, max_step)
            else:
                if high.iloc[t] > psar[t]:
                    # stop broken: flip to an up-trend and reset acceleration
                    trend_up, psar[t], ep, af = True, ep, high.iloc[t], 0.01
                elif low.iloc[t] < ep:
                    ep, af = low.iloc[t], min(af+0.01, max_step)
        psar_reg = 2 if close.iloc[latest_idx] > psar[latest_idx] else 0

        # Z-score: 90-bar z-score of close with a +/-0.5 band (NaN z gives 1)
        ma90 = close.rolling(90, min_periods=90).mean().iloc[latest_idx]
        sd90 = close.rolling(90, min_periods=90).std().iloc[latest_idx]
        z = (close.iloc[latest_idx] - ma90) / sd90
        zscore_reg = 2 if z > 0.5 else (0 if z < -0.5 else 1)

        # Weighted return: sign of the sqrt-weighted sum of the last 45 returns
        r = close.pct_change()
        wr = r.rolling(45, min_periods=45).apply(lambda x: np.dot(x, weights), raw=True).iloc[latest_idx]
        wret_reg = 2 if wr > 0 else (0 if wr < 0 else 1)

        # Assemble feature row (drop inst, time)
        feature_row = [
            ma_reg, ema_reg, slope_vol_reg, macd_reg,
            kalman_reg, fib_reg, psar_reg, zscore_reg,
            wret_reg, true_label
        ]
        feature_rows.append(feature_row)

    # Stack into 2D array and print
    feature_matrix = np.vstack(feature_rows)
    print(feature_matrix)


In [None]:
import numpy as np

def load_prices_to_array(
    price_file: str,
    endpoint: int,
    delim_whitespace: bool = True
) -> np.ndarray:
    """
    Load a price file (one row per timestep, one column per instrument) and
    keep only rows 0..`endpoint` inclusive, giving shape (endpoint+1, n_inst).

    Args:
        price_file: Path to your prices.txt file.
        endpoint:   Zero-based index of the last row to include.
        delim_whitespace: If True (default), fields are split on any
                          whitespace; otherwise the file is comma-delimited.

    Raises:
        IndexError: if `endpoint` is negative or not smaller than the number
                    of rows in the file.
    """
    # delimiter=None is np.loadtxt's default and means "split on whitespace"
    separator = None if delim_whitespace else ','
    data = np.loadtxt(price_file, delimiter=separator)

    # the whole file is read first; the bounds check needs the row count
    row_count = data.shape[0]
    if endpoint >= row_count or endpoint < 0:
        raise IndexError(f"endpoint {endpoint} is out of bounds for data with {data.shape[0]} rows")

    last_row_exclusive = endpoint + 1
    return data[:last_row_exclusive, :]

# usage: keep rows 0..100 inclusive, i.e. the first 101 timesteps
prices_array = load_prices_to_array("prices.txt", endpoint=100)
print(prices_array)
print(prices_array.shape)  # (101, number_of_instruments): endpoint + 1 rows

# then call your feature‐matrix builder as before:
build_feature_matrix_from_array(prices_array)


In [None]:
# regime_inference.py
import numpy as np
import pandas as pd

# ───────────────────────── helpers ───────────────────────────────────────
def _ols_slope(y: np.ndarray) -> float:
    """Return the least-squares slope of `y` regressed on 0..len(y)-1 with an intercept."""
    steps = np.arange(len(y))
    # design matrix: one column for the time index, one for the intercept
    design = np.stack([steps, np.ones_like(steps)], axis=1)
    coeffs = np.linalg.lstsq(design, y, rcond=None)[0]
    # coeffs holds (slope, intercept); only the slope is of interest
    return coeffs[0]


# ─── updated helper ─────────────────────────────────────────────────────
def _slope_vol_reg(close: np.ndarray,
                   idx: int,
                   slope_win: int = 30,
                   vol_win: int   = 100) -> float | int:
    logp = np.log(close)

    slope_series = (
        pd.Series(logp)
          .rolling(slope_win, min_periods=slope_win)
          .apply(lambda arr: _ols_slope(arr), raw=True)
    )
    rtn = pd.Series(logp).diff()
    vol_series = rtn.rolling(vol_win, min_periods=vol_win).std()

    slope = slope_series.iloc[idx]
    vol   = vol_series.iloc[idx]

    if np.isnan(slope) or np.isnan(vol):
        return np.nan

    # 100-bar rolling *median* (causal, matches training pipeline)
    median_vol = (
        vol_series
          .rolling(window=100, min_periods=100)
          .median()
          .iloc[idx]
    )

    return 2 if (slope > 0 and vol < median_vol) else 0



# ────────────────────── pipeline (no drop_last) ──────────────────────────
def compute_regime_features_window(prices_window: np.ndarray) -> np.ndarray:
    """
    Compute nine regime indicators per instrument, evaluated at the last bar
    of the supplied window.

    Parameters
    ----------
    prices_window : np.ndarray
        Shape (n_inst, win_len); the intended call uses (50, 100).  Each row
        is one instrument's price history ending at the timestep for which we
        want predictions.

    Returns
    -------
    np.ndarray
        Shape (n_inst, 9), float.  Columns in training order:
        [ma, ema, slope_vol, macd, kalman, fib, psar, zscore, wret].
        Entries are 2 / 0 (plus 1 as the in-between value for fib, zscore and
        wret), or NaN where an indicator reports insufficient history.
    """
    n_inst, win_len = prices_window.shape
    idx = win_len - 1                     # evaluate at the latest bar

    out = np.full((n_inst, 9), np.nan)
    # 45 weights proportional to sqrt(1..45), normalised to sum to 1; they are
    # dotted with returns ordered oldest -> newest, so recent returns weigh more.
    sqrt_weights = np.arange(1, 46, dtype=float) ** 0.5
    sqrt_weights /= sqrt_weights.sum()

    for i in range(n_inst):
        close = prices_window[i]
        logp  = np.log(close)

        # MA regime: 5-bar vs 70-bar simple mean of log-price.
        # With fewer than 70 bars ma_l is NaN, the comparison is False and the
        # result silently falls through to 2.
        ma_s = pd.Series(logp).rolling(5).mean().iloc[idx]
        ma_l = pd.Series(logp).rolling(70).mean().iloc[idx]
        ma_reg = 0 if ma_l > ma_s else 2

        # EMA regime: span-5 vs span-50 exponential mean of log-price.
        ema_s = pd.Series(logp).ewm(span=5,  adjust=False).mean().iloc[idx]
        ema_l = pd.Series(logp).ewm(span=50, adjust=False).mean().iloc[idx]
        ema_reg = 2 if ema_s > ema_l else 0

        # Slope/Vol regime
        # NOTE(review): with a 100-bar window and the helper's default
        # vol_win=100, diff() leaves only 99 returns, so the rolling vol at idx
        # appears to always be NaN and this column NaN - confirm intended.
        sv_reg = _slope_vol_reg(close, idx)

        # MACD regime: fast-minus-slow EWM line vs its span-40 signal line.
        # NOTE(review): the first positional argument of ewm() is `com`, not
        # `span`, so these are com=50 / com=90 while the signal line uses
        # span=40 - confirm this matches the training pipeline.
        macd_line = (
            pd.Series(logp).ewm(50, adjust=False).mean()
            - pd.Series(logp).ewm(90, adjust=False).mean()
        )
        signal_line = macd_line.ewm(span=40, adjust=False).mean()
        macd_reg = 2 if macd_line.iloc[idx] > signal_line.iloc[idx] else 0

        # Kalman trend regime: scalar filter on log-price whose prediction is
        # simply the previous estimate; regime is 2 when the latest log-price
        # sits above the filtered estimate.
        proc_var, meas_var = 0.01, 10.0
        x_est = np.zeros(win_len)
        P     = np.zeros(win_len)
        x_est[0], P[0] = logp[0], 1.0     # seed with first observation
        for t in range(1, win_len):
            x_pred = x_est[t - 1]
            P_pred = P[t - 1] + proc_var
            K      = P_pred / (P_pred + meas_var)   # gain
            x_est[t] = x_pred + K * (logp[t] - x_pred)
            P[t]     = (1 - K) * P_pred
        kalman_reg = 2 if logp[idx] > x_est[idx] else 0

        # Fibonacci regime: position of the latest close inside the high/low
        # range of the last 50 bars (above 78.6% -> 2, below 61.8% -> 0, else 1).
        # NOTE(review): the 50-bar slice only needs idx >= 49; the guard below
        # is one bar stricter - confirm against training.
        if idx >= 50:
            win50 = close[idx - 49 : idx + 1]
            hi, lo = win50.max(), win50.min()
            rng = hi - lo
            upper, lower = lo + 0.786 * rng, lo + 0.618 * rng
            fib_reg = 2 if close[idx] > upper else 0 if close[idx] < lower else 1
        else:
            fib_reg = np.nan

        # PSAR regime: stop-and-reverse level tracked on raw closes.  The
        # acceleration factor starts at 0.01, grows by 0.01 on each new extreme
        # and is capped at 0.10.
        psar = np.empty(win_len)
        trend_up, af, max_af = True, 0.01, 0.10
        ep = close[0]                     # extreme point of the current trend
        psar[0] = close[0]
        for t in range(1, win_len):
            # move the stop toward the extreme point
            psar[t] = psar[t - 1] + af * (ep - psar[t - 1])
            if trend_up:
                if close[t] < psar[t]:
                    # flip to down-trend: stop jumps to the old extreme,
                    # extreme and acceleration restart from this bar
                    trend_up, psar[t], ep, af = False, ep, close[t], 0.01
                elif close[t] > ep:
                    ep, af = close[t], min(af + 0.01, max_af)
            else:
                if close[t] > psar[t]:
                    # flip to up-trend (mirror of the branch above)
                    trend_up, psar[t], ep, af = True, ep, close[t], 0.01
                elif close[t] < ep:
                    ep, af = close[t], min(af + 0.01, max_af)
        psar_reg = 2 if close[idx] > psar[idx] else 0

        # Z-score regime: latest close vs its 90-bar mean, in units of the
        # 90-bar std; thresholds at +/-0.5.  A zero std is not guarded here.
        ma90 = pd.Series(close).rolling(90).mean().iloc[idx]
        sd90 = pd.Series(close).rolling(90).std().iloc[idx]
        if np.isnan(ma90) or np.isnan(sd90):
            zscore_reg = np.nan
        else:
            z = (close[idx] - ma90) / sd90
            zscore_reg = 2 if z > 0.5 else 0 if z < -0.5 else 1

        # Weighted-return regime: sign of the sqrt-weighted sum of the last 45
        # simple returns (idx >= 45 keeps the NaN first return out of the slice).
        if idx >= 45:
            r = pd.Series(close).pct_change().iloc[idx - 44 : idx + 1].values
            wr = np.dot(r, sqrt_weights)
            wret_reg = 2 if wr > 0 else 0 if wr < 0 else 1
        else:
            wret_reg = np.nan

        # column order must stay in sync with the docstring / training order
        out[i] = [
            ma_reg, ema_reg, sv_reg, macd_reg, kalman_reg,
            fib_reg, psar_reg, zscore_reg, wret_reg,
        ]

    return out


# ──────────────────── I/O wrappers for prices.txt ───────────────────────
def _extract_window(price_file: str,
                    timestep: int,
                    win_len: int = 100) -> np.ndarray:
    """
    Slice the latest `win_len` bars (inclusive) ending at `timestep` from the
    price file and transpose to (n_inst, win_len).

    The file is read as whitespace-separated values without a header: one row
    per timestep, one column per instrument.

    Raises
    ------
    ValueError
        If `win_len` is not positive, if `timestep` is outside the file's
        rows, or if fewer than `win_len` rows exist up to `timestep`.
    """
    # A non-positive length would otherwise yield an empty slice silently.
    if win_len < 1:
        raise ValueError(f"win_len must be >= 1, got {win_len}")

    df = pd.read_csv(price_file, sep=r"\s+", header=None)
    n_rows = df.shape[0]

    if not (0 <= timestep < n_rows):
        raise ValueError(f"timestep {timestep} out of range (0 … {n_rows-1})")
    if timestep < win_len - 1:
        # Report the requested window length rather than a fixed "100".
        raise ValueError(
            f"Not enough history to build a {win_len}-bar window "
            f"ending at timestep {timestep}."
        )

    first_row = timestep - win_len + 1
    window_df = df.iloc[first_row : timestep + 1, :]
    return window_df.to_numpy().T           # (n_inst, win_len)


def infer_from_file(price_file: str,
                    timestep: int) -> np.ndarray:
    """
    Convenience entry point: read the price file, cut the 100-bar window that
    ends at `timestep` (instruments along the rows), and return the
    regime-feature matrix computed for that window.
    """
    return compute_regime_features_window(
        _extract_window(price_file, timestep, win_len=100)
    )


In [None]:
import numpy as np

def load_prices_to_array(
    price_file: str,
    endpoint: int,
    delim_whitespace: bool = True
) -> np.ndarray:
    """
    Reads a text file where each column is one instrument and each row is a timestep,
    then returns a 2D NumPy array of shape (endpoint+1, n_inst),
    i.e. all rows from 0 up to and including `endpoint`.

    Args:
        price_file: Path to your prices.txt file.
        endpoint:   Zero-based index of the last row to include.
        delim_whitespace: If True, splits on any whitespace (default);
            otherwise the file is parsed as comma-separated.

    Raises:
        IndexError: If `endpoint` is negative or not a valid row index.
    """
    # delimiter=None is np.loadtxt's whitespace-splitting default.
    delimiter = None if delim_whitespace else ","
    # ndmin=2 keeps single-row and single-instrument files two-dimensional;
    # without it loadtxt squeezes them to 1-D and the row slice below fails.
    data = np.loadtxt(price_file, delimiter=delimiter, ndmin=2)

    n_rows = data.shape[0]
    if endpoint < 0 or endpoint >= n_rows:
        raise IndexError(f"endpoint {endpoint} is out of bounds for data with {n_rows} rows")
    # slice to keep only timesteps 0..endpoint
    return data[: endpoint + 1, :]

# usage: load timesteps 0..300 (inclusive) from prices.txt
prices_array = load_prices_to_array("prices.txt", endpoint=300)
print(prices_array)
# rows 0..endpoint are kept, so the shape is (endpoint + 1, number_of_instruments),
# i.e. (301, number_of_instruments) for endpoint=300
print(prices_array.shape)

# then call your feature-matrix builder as before:
build_feature_matrix_from_array(prices_array)
