# In [None]:
# regime_inference.py
import numpy as np
import pandas as pd

def _ols_slope(y: np.ndarray) -> float:
    """Return the least-squares slope of ``y`` against 0, 1, ..., len(y)-1.

    A line ``slope * t + intercept`` is fitted with ``np.linalg.lstsq``;
    the intercept is discarded.
    """
    steps = np.arange(len(y))
    # Design matrix: first column is the time step, second the intercept term.
    design = np.column_stack((steps, np.ones_like(steps)))
    slope, _intercept = np.linalg.lstsq(design, y, rcond=None)[0]
    return slope


def _slope_vol_reg(close: np.ndarray,
                   idx:   int,
                   slope_win: int = 30,
                   vol_win:   int = 100
                  ) -> float | int:
    logp = np.log(close)

    # 1) slope
    slope_series = (
        pd.Series(logp)
          .rolling(slope_win, min_periods=slope_win)
          .apply(_ols_slope, raw=True)
    )
    rtn        = pd.Series(logp).diff()
    vol_series = rtn.rolling(vol_win, min_periods=1).std()

    slope = slope_series.iloc[idx]
    vol   = vol_series.iloc[idx]
    if np.isnan(slope) or np.isnan(vol):
        return np.nan

    # 3) causal median of vol_series up to idx
    median_vol = vol_series.iloc[: idx + 1].median()

    return 2 if (slope > 0 and vol < median_vol) else 0



def compute_regime_features_window(prices_window: np.ndarray) -> np.ndarray:

    n_inst, win_len = prices_window.shape
    idx = win_len - 1                   

    out = np.full((n_inst, 9), np.nan)
    sqrt_weights = np.arange(1, 46, dtype=float) ** 0.5
    sqrt_weights /= sqrt_weights.sum()

    for i in range(n_inst):
        close = prices_window[i]
        logp  = np.log(close)

        # MA regime
        ma_s = pd.Series(logp).rolling(5).mean().iloc[idx]
        ma_l = pd.Series(logp).rolling(70).mean().iloc[idx]
        ma_reg = 0 if ma_l > ma_s else 2

        # EMA regime
        ema_s = pd.Series(logp).ewm(span=5,  adjust=False).mean().iloc[idx]
        ema_l = pd.Series(logp).ewm(span=50, adjust=False).mean().iloc[idx]
        ema_reg = 2 if ema_s > ema_l else 0

        # Slope/Vol regime
        sv_reg = _slope_vol_reg(close, idx)

        # MACD regime
        macd_line = (
            pd.Series(logp).ewm(50, adjust=False).mean()
            - pd.Series(logp).ewm(90, adjust=False).mean()
        )
        signal_line = macd_line.ewm(span=40, adjust=False).mean()
        macd_reg = 2 if macd_line.iloc[idx] > signal_line.iloc[idx] else 0

        # Kalman trend regime
        proc_var, meas_var = 0.01, 10.0
        x_est = np.zeros(win_len)
        P     = np.zeros(win_len)
        x_est[0], P[0] = logp[0], 1.0
        for t in range(1, win_len):
            x_pred = x_est[t - 1]
            P_pred = P[t - 1] + proc_var
            K      = P_pred / (P_pred + meas_var)
            x_est[t] = x_pred + K * (logp[t] - x_pred)
            P[t]     = (1 - K) * P_pred
        kalman_reg = 2 if logp[idx] > x_est[idx] else 0

        # Fibonacci regime
        if idx >= 50:
            win50 = close[idx - 49 : idx + 1]
            hi, lo = win50.max(), win50.min()
            rng = hi - lo
            upper, lower = lo + 0.786 * rng, lo + 0.618 * rng
            fib_reg = 2 if close[idx] > upper else 0 if close[idx] < lower else 1
        else:
            fib_reg = np.nan

        # PSAR regime
        psar = np.empty(win_len)
        trend_up, af, max_af = True, 0.01, 0.10
        ep = close[0]
        psar[0] = close[0]
        for t in range(1, win_len):
            psar[t] = psar[t - 1] + af * (ep - psar[t - 1])
            if trend_up:
                if close[t] < psar[t]:
                    trend_up, psar[t], ep, af = False, ep, close[t], 0.01
                elif close[t] > ep:
                    ep, af = close[t], min(af + 0.01, max_af)
            else:
                if close[t] > psar[t]:
                    trend_up, psar[t], ep, af = True, ep, close[t], 0.01
                elif close[t] < ep:
                    ep, af = close[t], min(af + 0.01, max_af)
        psar_reg = 2 if close[idx] > psar[idx] else 0

        # Z-score regime
        ma90 = pd.Series(close).rolling(90).mean().iloc[idx]
        sd90 = pd.Series(close).rolling(90).std().iloc[idx]
        if np.isnan(ma90) or np.isnan(sd90):
            zscore_reg = np.nan
        else:
            z = (close[idx] - ma90) / sd90
            zscore_reg = 2 if z > 0.5 else 0 if z < -0.5 else 1

        # Weighted-return regime
        if idx >= 45:
            r = pd.Series(close).pct_change().iloc[idx - 44 : idx + 1].values
            wr = np.dot(r, sqrt_weights)
            wret_reg = 2 if wr > 0 else 0 if wr < 0 else 1
        else:
            wret_reg = np.nan

        out[i] = [
            ma_reg, ema_reg, sv_reg, macd_reg, kalman_reg,
            fib_reg, psar_reg, zscore_reg, wret_reg,
        ]

    return out

def _extract_window(price_file: str,
                    timestep: int,
                    win_len: int = 100) -> np.ndarray:
    """
    Slice the latest `win_len` bars (inclusive) ending at `timestep` from the
    price file and transpose to (n_inst, win_len).

    price_file : whitespace-separated text file without a header; one row per
                 timestep, one column per instrument.
    timestep   : zero-based row index of the last bar in the window.
    win_len    : number of bars in the window (must be >= 1).

    Raises ValueError when `win_len` is not positive, when `timestep` is
    outside the file, or when fewer than `win_len` rows exist up to `timestep`.
    """
    if win_len < 1:
        raise ValueError(f"win_len must be >= 1, got {win_len}")

    df = pd.read_csv(price_file, sep=r"\s+", header=None)
    n_rows = df.shape[0]

    if not (0 <= timestep < n_rows):
        raise ValueError(f"timestep {timestep} out of range (0 … {n_rows-1})")
    if timestep < win_len - 1:
        # Report the requested window length rather than a fixed "100".
        raise ValueError(f"Not enough history to build a {win_len}-bar window.")

    first_row = timestep - win_len + 1
    slice_df = df.iloc[first_row : timestep + 1, :]
    return slice_df.to_numpy().T            # (n_inst, win_len)


def infer_from_file(price_file: str,
                    timestep: int,
                    win_len: int = 100) -> np.ndarray:
    """
    High-level convenience wrapper:
    1. read the price file
    2. build the (n_inst, win_len) window ending at `timestep`
    3. run the regime-feature pipeline

    Returns the (n_inst, 9) feature array for bar `timestep`.  `win_len`
    defaults to 100 bars.  A ValueError from the window extraction (timestep
    out of range or not enough history) is passed through to the caller.
    """
    window = _extract_window(price_file, timestep, win_len=win_len)
    return compute_regime_features_window(window)


# In [None]:
import numpy as np
import pandas as pd

from precision_labeller import plot_all_regimes_long

def build_feature_label_csv(price_file: str,
                            N: int = 740,
                            output_csv: str = "features_labels.csv",
                            win_len: int = 100):
    """
    Runs through timesteps 0..N-1 of the price file, builds the 9-regime
    features for each instrument at each t, pulls in the true_autolabel, and
    saves a long-form CSV sorted by instrument, then time.

    price_file : whitespace-separated price file (rows = timesteps,
                 columns = instruments).
    N          : number of leading timesteps to process.
    output_csv : destination path of the CSV.
    win_len    : bars of history per feature window; timesteps with less
                 history get NaN features.

    Raises ValueError when N exceeds the number of rows in the file.
    """
    # 1) load prices once and reuse the array for every window
    df_price = pd.read_csv(price_file, sep=r"\s+", header=None)
    n_rows, n_inst = df_price.shape
    if N > n_rows:
        raise ValueError(f"N={N} exceeds available rows={n_rows}")
    prices = df_price.to_numpy()

    # 2) precompute true regimes for each instrument
    #    (indexed by timestep below, so each must cover at least N entries)
    true_regs = {
        inst: plot_all_regimes_long(end_point=N + 10, plot_graph=False, inst=inst)
        for inst in range(n_inst)
    }

    feature_names = (
        "ma", "ema", "slope_vol", "macd", "kalman",
        "fib", "psar", "zscore", "wret",
    )

    # 3) iterate timesteps, slicing each window from the in-memory prices
    records = []
    for t in range(N):
        if t < win_len - 1:
            # not enough history yet, fill with NaNs
            feats_t = np.full((n_inst, len(feature_names)), np.nan)
        else:
            window = prices[t - win_len + 1 : t + 1].T      # (n_inst, win_len)
            feats_t = compute_regime_features_window(window)

        for inst in range(n_inst):
            row = {"inst": inst, "time": t}
            row.update(zip(feature_names, feats_t[inst]))
            row["true_regime"] = true_regs[inst][t]
            records.append(row)

    # 4) build DataFrame & save
    df = pd.DataFrame.from_records(records)
    df = df.sort_values(["inst", "time"]).reset_index(drop=True)
    df.to_csv(output_csv, index=False)
    print(f"Wrote {len(df)} rows to {output_csv}")




# In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# ─── YOUR SAVED Bi-LSTM MODEL CLASS & LOADING ───────────────────────────────
class RegimeBiLSTM(nn.Module):
    """Bidirectional LSTM classifier over a sequence of feature vectors.

    The stacked bidirectional LSTM reads a batch-first sequence; the output
    at the last sequence position is passed through a linear layer that
    produces one logit per class.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        lstm_config = {
            "input_size": input_size,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "batch_first": True,
            "dropout": dropout,
            "bidirectional": True,
        }
        self.lstm = nn.LSTM(**lstm_config)
        # Forward and backward hidden states are concatenated, hence 2x width.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Map (batch, seq_len, input_size) to (batch, num_classes) logits."""
        sequence_out = self.lstm(x)[0]            # (batch, seq_len, 2*hidden)
        final_step = sequence_out[:, -1, :]       # (batch, 2*hidden)
        return self.fc(final_step)                # (batch, num_classes)


# ─── CONFIGURATION ─────────────────────────────────────────────────────────
PRICE_FILE   = "prices.txt"
MODEL_PATH   = "bilstm_self2.pth"
START        = 120   # first timestep to compute features for
END          = 750   # last timestep requested (clamped to the data below)
SEQ_LEN      = 20    # number of consecutive feature rows fed to the model
WIN_LEN      = 100   # bars of price history per feature window
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
FEAT_DIM     = 9
HIDDEN_SIZE  = 64
NUM_LAYERS   = 2
NUM_CLASSES  = 3     # 0=bear,1=neutral,2=bull

# ─── 1) LOAD MODEL ──────────────────────────────────────────────────────────
print(f"Loading model weights from {MODEL_PATH}...")
model = RegimeBiLSTM(FEAT_DIM, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES, dropout=0.2)
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.to(DEVICE).eval()
print("Model loaded. Starting inference run.\n")

# ─── 2) LOAD PRICES ─────────────────────────────────────────────────────────
prices_raw = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None).values
T, N_INST  = prices_raw.shape
print(f"Price data dimensions: T={T}, instruments={N_INST}\n")

# ─── 3) RUN INFERENCE STREAM ────────────────────────────────────────────────
if START < WIN_LEN - 1:
    raise ValueError(f"START={START} leaves fewer than {WIN_LEN} bars of history.")

# Valid row indices are 0..T-1; clamp so the run does not fail on its final
# steps when END points past the data.
last_t = min(END, T - 1)
if last_t < END:
    print(f"END={END} is beyond the last available row; stopping at t={last_t}.")
if last_t < START:
    raise ValueError(f"START={START} is beyond the last available row ({T - 1}).")

cache = []                                   # last SEQ_LEN feature matrices
predictions = {inst: [] for inst in range(N_INST)}

with torch.no_grad():
    total = last_t - START + 1
    for step, t in enumerate(range(START, last_t + 1), 1):
        # Slice the window from the prices already in memory instead of
        # re-parsing the price file on every step.
        window  = prices_raw[t - WIN_LEN + 1 : t + 1].T   # (N_INST,WIN_LEN)
        feats_t = compute_regime_features_window(window)  # (N_INST,9)

        cache.append(feats_t)
        if len(cache) < SEQ_LEN:
            print(f"[{step}/{total}] warming cache {len(cache)}/{SEQ_LEN}", end="\r")
            continue
        if len(cache) > SEQ_LEN:
            cache.pop(0)

        stacked = np.stack(cache, axis=0)               # (SEQ_LEN,N_INST,9)
        seqs    = np.transpose(stacked, (1,0,2))        # (N_INST,SEQ_LEN,9)
        Xb      = torch.from_numpy(seqs).float().to(DEVICE)

        logits  = model(Xb)                             # (N_INST,3)
        preds   = logits.argmax(dim=1).cpu().numpy()    # (N_INST,)

        for inst in range(N_INST):
            predictions[inst].append((t, int(preds[inst])))

        print(f"[{step}/{total}] t={t:4d}", end="\r")
    print("\nInference complete!\n")


# ─── 4) MINIMUM‐RUN SMOOTHER ─────────────────────────────────────────────────
def smooth_min_run(raw: np.ndarray, L: int=4) -> np.ndarray:
    """
    Enforce that you only flip to a new label if you see it run >= L times.
    Otherwise you stay in the previous label.

    raw : 1-D sequence of labels (array or list).
    L   : minimum run length a new label needs before it is accepted.

    The first run is always accepted whatever its length.  A run that is long
    enough is relabelled from its first element.  Returns an array of the same
    length as `raw`; an empty input gives an empty output.
    """
    raw = np.asarray(raw)
    if raw.size == 0:
        return raw.copy()

    sm = np.empty_like(raw)
    # Boundaries of the raw runs; `stops` is exclusive.
    changes = np.flatnonzero(raw[:-1] != raw[1:])
    starts  = np.concatenate(([0], changes + 1))
    stops   = np.concatenate((changes + 1, [len(raw)]))

    # Seeding with the first label means the first run never counts as a flip.
    curr = raw[0]
    for s, e in zip(starts, stops):
        lbl = raw[s]
        if lbl != curr and e - s >= L:
            curr = lbl
        sm[s:e] = curr

    return sm

#For plotting logic below -> May need to comment out
def get_segments(label_seq):
    """Turn 1D label array into runs: [(start,end,label),…].

    `start` and `end` are inclusive positions of each run of equal labels.
    Lists are accepted as well as arrays; an empty input gives an empty list.
    """
    labels = np.asarray(label_seq)
    if labels.size == 0:
        return []
    # A run ends wherever a label differs from the one that follows it.
    changes = np.flatnonzero(labels[:-1] != labels[1:])
    starts  = np.concatenate(([0], changes + 1))
    ends    = np.concatenate((changes, [len(labels) - 1]))
    return list(zip(starts, ends, labels[starts]))


# `true_cmap` is not used in the code visible here; it is kept in case other
# cells reference it.
true_cmap = ListedColormap(["#ffcccc","#f0f0f0","#ccffcc"])
pred_cmap = ListedColormap(["#ff6666","#b0b0b0","#66cc66"])

# Only bear (0) and bull (2) runs are shaded; neutral (1) stays unshaded.
shade_colors = {0: pred_cmap.colors[0], 2: pred_cmap.colors[2]}

for inst in range(N_INST):
    print(f"Plotting inst {inst+1}/{N_INST}…")
    if not predictions[inst]:
        # Nothing was predicted for this instrument (e.g. the run was shorter
        # than the model's sequence length), so there is nothing to plot.
        print(f"No predictions for inst {inst}; skipping.")
        continue

    times, raw_labs = zip(*predictions[inst])
    times       = np.array(times)
    raw_labs    = np.array(raw_labs)
    labs        = smooth_min_run(raw_labs, L=4)
    price_slice = prices_raw[times, inst]
    x = np.arange(len(times))

    fig, ax = plt.subplots(figsize=(12,4))
    for s, e, lbl in get_segments(labs):
        color = shade_colors.get(int(lbl))
        if color is not None:
            ax.axvspan(x[s], x[e], color=color, alpha=0.3, lw=0)

    ax.plot(x, price_slice, "k-", label="Price")
    ax.set_title(f"Instrument {inst:02d} — Smoothed preds {START}→{END}")
    ax.set_xlabel("Index in window")
    ax.set_ylabel("Price")
    ax.legend()
    plt.tight_layout()
    plt.show()

print("All plots done.")
