In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

def find_local_extrema(series: np.ndarray):
    """Locate the strict interior local maxima and minima of ``series``.

    A sample is a peak when it is strictly greater than both neighbours and
    a trough when it is strictly smaller than both.  The first and last
    samples are never classified, and a sample that ties with a neighbour
    (a flat run) is neither a peak nor a trough.

    Returns:
        ``(peaks, troughs)``: two lists of integer indices in ascending order.
    """
    peaks = []
    troughs = []
    for mid in range(1, len(series) - 1):
        left, here, right = series[mid - 1], series[mid], series[mid + 1]
        if left < here > right:
            peaks.append(mid)
        elif left > here < right:
            troughs.append(mid)
    return peaks, troughs

def detect_head_and_shoulders(series: np.ndarray, shoulder_tol: float = 0.03):
    """Find head-and-shoulders formations among consecutive local peaks.

    Every run of three consecutive peaks from ``find_local_extrema`` is
    treated as a candidate (left shoulder, head, right shoulder).  A
    candidate is kept when:

    * at least one trough lies strictly between each pair of adjacent peaks;
    * the head is strictly higher than both shoulders;
    * the shoulders differ by at most ``shoulder_tol``, measured as a
      fraction of the head's height;
    * the lowest trough on each side is strictly below its shoulder.

    Args:
        series: price values, indexable by integer position.
        shoulder_tol: maximum allowed ``|left - right| / head``.

    Returns:
        A list of ``(i1, v1, i2, v2, i3)`` index tuples: left shoulder,
        left neckline trough, head, right neckline trough, right shoulder.
    """
    peaks, troughs = find_local_extrema(series)
    found = []
    # Slide over every run of three consecutive peaks.
    for left, head, right in zip(peaks, peaks[1:], peaks[2:]):
        left_valleys = [t for t in troughs if left < t < head]
        right_valleys = [t for t in troughs if head < t < right]
        if not (left_valleys and right_valleys):
            continue

        # Deepest trough on each side of the head.
        neck_left = min(left_valleys, key=lambda t: series[t])
        neck_right = min(right_valleys, key=lambda t: series[t])
        h_left, h_head, h_right = series[left], series[head], series[right]

        head_dominates = h_head > h_left and h_head > h_right
        if not head_dominates:
            continue
        # Shoulder mismatch is measured relative to the head's height.
        if abs(h_left - h_right) / h_head > shoulder_tol:
            continue
        if series[neck_left] >= h_left or series[neck_right] >= h_right:
            continue

        found.append((left, neck_left, head, neck_right, right))
    return found

def backtest_hs(series: np.ndarray, patterns: list):
    """
    Simulate one short trade per head-and-shoulders pattern.

    For each pattern tuple (i1, v1, i2, v2, i3):
      - the short is opened at bar i3+1, at that bar's price; patterns whose
        entry bar would fall past the end of the series are skipped
      - the profit target is the lower of price[v1] and price[v2]
      - the stop loss is price[i2] (the head)
      - bars after the entry bar are scanned in order; the trade closes on
        the first bar at or below the target ("TP") or at or above the stop
        ("SL"), with the target checked first
      - if neither level is reached the trade closes on the last bar ("EOD")

    Returns a list of dicts with keys entry_idx, entry_price, exit_idx,
    exit_price, reason, pnl (entry minus exit) and return (pnl / entry).
    """
    n_bars = len(series)
    results = []
    for _i1, v1, i2, v2, i3 in patterns:
        start = i3 + 1
        if start >= n_bars:
            continue  # pattern completes on the final bar: nothing to trade

        entry = series[start]
        take_profit = min(series[v1], series[v2])
        stop = series[i2]

        # Default outcome: still open on the final bar.
        outcome = (n_bars - 1, series[n_bars - 1], "EOD")
        for bar in range(start + 1, n_bars):
            px = series[bar]
            if px <= take_profit:
                outcome = (bar, px, "TP")
                break
            if px >= stop:
                outcome = (bar, px, "SL")
                break

        closed_at, closed_px, why = outcome
        profit = entry - closed_px  # short position: gain when price falls
        results.append({
            "entry_idx": start,
            "entry_price": entry,
            "exit_idx": closed_at,
            "exit_price": closed_px,
            "reason": why,
            "pnl": profit,
            "return": profit / entry,
        })
    return results

def main():
    """Backtest the head-and-shoulders short on every instrument in prices.txt.

    prices.txt is located by walking upward from the current working
    directory (the directory itself, then each parent) and taking the first
    match, the same lookup the other cells in this notebook use.  The
    previous fixed ``cwd.parents[1]`` lookup only worked from one directory
    depth and raised ``IndexError`` from a shallow working directory.

    Only the first 800 rows are used.  For each column the function detects
    patterns, simulates the trades, prints a one-line summary, and finally
    prints an overall summary across all instruments.

    Returns:
        dict mapping each instrument (column label) to a dict with keys
        ``n_patterns``, ``n_trades``, ``total_pnl`` and ``win_rate``
        (``nan`` when the instrument produced no trades).

    Raises:
        FileNotFoundError: if no prices.txt exists in the working directory
            or any of its parents.
    """
    # 1) locate and load prices
    cwd = Path.cwd()
    for folder in (cwd, *cwd.parents):
        candidate = folder / "prices.txt"
        if candidate.exists():
            prices_path = candidate
            break
    else:
        raise FileNotFoundError("prices.txt not found in this directory or any parent")
    df = pd.read_csv(prices_path, sep=r"\s+", header=None)

    # 2) only first 800 bars
    df = df.iloc[:800]

    all_results = {}
    overall_trades = []

    # 3) per instrument
    for inst in df.columns:
        series = df[inst].values
        patterns = detect_head_and_shoulders(series)
        trades = backtest_hs(series, patterns)
        overall_trades.extend(trades)

        # summary stats; a zero-PnL trade is not counted as a win
        total_pnl = sum(t["pnl"] for t in trades)
        wins = sum(1 for t in trades if t["pnl"] > 0)
        win_rate = wins / len(trades) if trades else np.nan

        all_results[inst] = {
            "n_patterns": len(patterns),
            "n_trades":   len(trades),
            "total_pnl":  total_pnl,
            "win_rate":   win_rate
        }

        # print per-instrument
        print(f"Instrument {inst:2d} | "
              f"Patterns: {len(patterns):2d} | "
              f"Trades: {len(trades):2d} | "
              f"Total PnL: {total_pnl:8.2f} | "
              f"Win rate: {win_rate:.0%}")

    # 4) overall summary
    if overall_trades:
        total_pnl = sum(t["pnl"] for t in overall_trades)
        wins = sum(1 for t in overall_trades if t["pnl"] > 0)
        total = len(overall_trades)
        print("\n=== Overall ===")
        print(f"Trades: {total} | Total PnL: {total_pnl:.2f} | Win rate: {wins/total:.0%}")
    else:
        print("No trades were generated.")

    # Previously built and discarded; returning it lets callers inspect results.
    return all_results

if __name__ == "__main__":
    main()


In [None]:
# The head-and-shoulders strategy above performs poorly in practice; ignore that code.

In [None]:
#!/usr/bin/env python
"""
Bayesian Optimization of MA‐crossover parameters using prices.txt.

Uses scikit‐optimize’s gp_minimize to find the (short_w,long_w)
that maximizes annualized Sharpe on instrument 0.
"""

import numpy as np
import pandas as pd
from skopt import gp_minimize
from skopt.space import Integer
from skopt.utils import use_named_args
from pathlib import Path

# --------------------------------------------------------------------
# 1) Locate and load prices.txt
# Walk upward from the current working directory (the directory itself,
# then each parent) and take the first prices.txt found.  The for/else
# raises only when no level contained the file.
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found in this directory or any parent")

print(f"Loading prices.txt from: {prices_path}")
# Whitespace-separated table without a header row.
# presumably one row per day and one column per instrument — TODO confirm against the data file
df = pd.read_csv(prices_path, sep=r"\s+", header=None)

# we'll optimize on the first instrument (column 0)
prices      = df.iloc[:, 0]
# first difference of log prices; dropna() removes the undefined first entry
log_returns = pd.Series(np.log(prices)).diff().dropna()

# --------------------------------------------------------------------
# 2) Define backtest function returning annualized Sharpe
def backtest_ma(short_w: int, long_w: int) -> float:
    """Return the annualized Sharpe ratio of a moving-average crossover.

    Works on the module-level ``prices`` and ``log_returns``.  Two rolling
    means of log price are compared: the strategy is long (+1) while the
    ``short_w`` mean is above the ``long_w`` mean and short (-1) otherwise.
    Each position is applied to the following bar's log return.

    Args:
        short_w: window length of the fast moving average.
        long_w: window length of the slow moving average.

    Returns:
        mean / standard deviation of the strategy's per-bar log returns,
        scaled by sqrt(252).
    """
    log_price = pd.Series(np.log(prices))
    # min_periods=1 means both averages are defined from the very first bar.
    fast = log_price.rolling(short_w, min_periods=1).mean()
    slow = log_price.rolling(long_w, min_periods=1).mean()

    direction = pd.Series(np.where(fast > slow, 1.0, -1.0), index=log_price.index)

    # Lag the signal by one bar so a bar's return is earned with the
    # position decided on the previous bar, then align on the return index.
    held = direction.shift(1).loc[log_returns.index]
    strat_ret = held * log_returns

    per_bar_sharpe = strat_ret.mean() / strat_ret.std()
    # annualize assuming 252 trading days
    return per_bar_sharpe * np.sqrt(252)

# --------------------------------------------------------------------
# 3) Define the search space and objective (minimize −Sharpe)
# Integer search ranges: fast window 5..50, slow window 50..200.
space = [Integer(5, 50, name="short_w"), Integer(50, 200, name="long_w")]


@use_named_args(space)
def objective(short_w, long_w):
    """Objective handed to gp_minimize: the negated annualized Sharpe.

    The optimizer minimizes, so maximizing Sharpe means minimizing its
    negative.
    """
    sharpe = backtest_ma(short_w, long_w)
    return -sharpe

# --------------------------------------------------------------------
# 4) Run the Bayesian optimization
# 30 evaluations of `objective` over `space`; random_state is fixed so
# repeated runs explore the same sequence of candidates.
res = gp_minimize(
    func=objective,
    dimensions=space,
    acq_func="EI",    # Expected Improvement
    n_calls=30,
    random_state=42,
)

# --------------------------------------------------------------------
# 5) Display results
# res.x holds the best point in the order the dimensions were declared
# (short_w, long_w); res.fun is the minimized value, i.e. the negative Sharpe.
best_short, best_long = res.x
best_sharpe = -res.fun

print("=== Bayesian Optimization Results ===")
print(f"Best fast MA window  : {best_short}")
print(f"Best slow MA window  : {best_long}")
print(f"Max annualized Sharpe: {best_sharpe:.2f}")


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# PARAMETERS — adjust as needed
N_DAYS         = 750    # how many rows (days) to plot
N_INSTRUMENTS  = 11     # how many columns (instruments) to use
LINREG_WINDOW  = 5     # length of rolling linear regression
SMA_WINDOW     = 200    # length of simple moving average

# Locate prices.txt: check the working directory, then each parent in turn,
# and keep the first match.  The for/else raises when nothing was found.
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found in this directory or any parent")

print(f"Loading prices.txt from: {prices_path}")
# Whitespace-separated table with no header row, trimmed to the first
# N_DAYS rows and first N_INSTRUMENTS columns.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# 2) For each instrument, compute linreg, SMA, simulate positions, and plot
for inst in df.columns:
    prices = df[inst].values
    n = len(prices)

    # 2a) rolling linear regression (fits on last LINREG_WINDOW days)
    # linreg[t] is the fitted line's value at the newest bar of the window;
    # it stays NaN for the first LINREG_WINDOW - 1 bars.
    linreg = np.full(n, np.nan, dtype=float)
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    model = LinearRegression()
    for t in range(LINREG_WINDOW - 1, n):
        y = prices[t - LINREG_WINDOW + 1 : t + 1]
        model.fit(X_master, y)
        linreg[t] = model.predict([[LINREG_WINDOW - 1]])[0]

    # 2b) rolling SMA (min_periods=1: averages whatever history exists early on)
    sma = pd.Series(prices).rolling(window=SMA_WINDOW, min_periods=1).mean().values

    # 2c) simulate your entry/exit logic
    # State machine, one decision per bar, using that bar's own price:
    #   flat  -> long  when price is below the regression value and above the SMA
    #   flat  -> short when price is above the regression value and below the SMA
    #   long  -> flat  when price rises above the regression value
    #   short -> flat  when price falls below the regression value
    # Bars without a regression value leave the state unchanged.
    pos = np.zeros(n, dtype=int)   # 0=flat, +1=long, -1=short
    current = 0
    for t in range(n):
        p, lr, ma = prices[t], linreg[t], sma[t]
        if not np.isnan(lr):
            if current == 0:
                if p < lr and p > ma:
                    current = +1
                elif p > lr and p < ma:
                    current = -1
            elif current == +1 and p > lr:
                current = 0
            elif current == -1 and p < lr:
                current = 0
        pos[t] = current

    # 3) Plot: price line with the full price range shaded where a position is held
    fig, ax = plt.subplots(figsize=(12, 4))
    days = np.arange(n)

    ax.plot(days, prices, color="black", label="Price")
    # green for long
    ax.fill_between(
        days, prices.min(), prices.max(),
        where=pos == +1,
        facecolor="green",
        alpha=0.3,
        interpolate=True
    )
    # red for short
    ax.fill_between(
        days, prices.min(), prices.max(),
        where=pos == -1,
        facecolor="red",
        alpha=0.3,
        interpolate=True
    )

    ax.set_title(f"Instrument {inst} — Price with Long (green) & Short (red)")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# PARAMETERS — adjust as needed
N_DAYS         = 750    # how many rows (days) to plot
N_INSTRUMENTS  = 11    # how many columns (instruments) to use
LINREG_WINDOW  = 5      # length of rolling linear regression (also reused as the rolling-volatility window)
SMA_WINDOW     = 200    # length of simple moving average
VOL_MULTIPLIER = 0.5    # exit if one-day move exceeds this × σ

# find prices.txt: check the working directory, then each parent in turn,
# and keep the first match; the for/else raises when nothing was found
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

print(f"Loading prices.txt from: {prices_path}")
# Whitespace-separated table with no header row, trimmed to the first
# N_DAYS rows and first N_INSTRUMENTS columns.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# Per instrument: regression/SMA signals, position simulation with an extra
# volatility-shock exit, then one chart.
for inst in df.columns:
    prices = df[inst].values
    n      = len(prices)

    # -- 1) rolling linear regression --
    # linreg[t] is the fitted line's value at the newest bar of the window;
    # it stays NaN for the first LINREG_WINDOW - 1 bars.
    linreg   = np.full(n, np.nan, dtype=float)
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    model    = LinearRegression()

    for t in range(LINREG_WINDOW - 1, n):
        y = prices[t - LINREG_WINDOW + 1 : t + 1]
        model.fit(X_master, y)
        linreg[t] = model.predict([[LINREG_WINDOW - 1]])[0]

    # -- 2) rolling SMA (min_periods=1: averages whatever history exists early on) --
    sma = pd.Series(prices).rolling(window=SMA_WINDOW, min_periods=1).mean().values

    # -- 3) compute returns & rolling vol for shock exits --
    # Simple one-bar returns (first bar set to 0) and their rolling standard
    # deviation over LINREG_WINDOW bars; the window includes the current bar.
    rets        = pd.Series(prices).pct_change().fillna(0).values
    rolling_vol = pd.Series(rets).rolling(window=LINREG_WINDOW, min_periods=1).std().values

    # -- 4) simulate positions with shock exit --
    pos     = np.zeros(n, dtype=int)   # 0=flat, +1=long, -1=short
    current = 0

    for t in range(n):
        p, lr, ma = prices[t], linreg[t], sma[t]
        # is today's move a "shock" exit?
        # shock_exit: a drop larger than VOL_MULTIPLIER x rolling vol (closes longs)
        # shock_exit_short: a rise larger than VOL_MULTIPLIER x rolling vol (closes shorts)
        shock_exit      = (rets[t] < -VOL_MULTIPLIER * rolling_vol[t])
        shock_exit_short= (rets[t] >  VOL_MULTIPLIER * rolling_vol[t])

        if not np.isnan(lr):
            if current == 0:
                # enter long below the regression value but above the SMA;
                # enter short above the regression value but below the SMA
                if p < lr and p > ma:
                    current = +1
                elif p > lr and p < ma:
                    current = -1

            elif current == +1:
                # exit long on linreg cross OR shock drop
                if p > lr or shock_exit:
                    current = 0

            elif current == -1:
                # exit short on linreg cross OR shock rally
                if p < lr or shock_exit_short:
                    current = 0

        pos[t] = current

    # -- 5) plot: price line with the full price range shaded where a position is held --
    fig, ax = plt.subplots(figsize=(12, 4))
    days = np.arange(n)

    ax.plot(days, prices, color="black", label="Price")
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == +1, facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == -1, facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Instrument {inst} — Price with Long (green) & Short (red)")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# PARAMETERS — adjust as needed
N_DAYS          = 750    # how many rows (days) to plot
N_INSTRUMENTS   = 11      # how many columns (instruments) to use
LINREG_WINDOW   = 5      # length of rolling linear regression
SMA_WINDOW      = 200    # length of simple moving average
TRAIL_PCT       = 0.001   # trailing stop distance as a fraction of the running extreme (0.001 = 0.1%)

# find prices.txt: check the working directory, then each parent in turn,
# and keep the first match; the for/else raises when nothing was found
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

print(f"Loading prices.txt from: {prices_path}")
# Whitespace-separated table with no header row, trimmed to the first
# N_DAYS rows and first N_INSTRUMENTS columns.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# Per instrument: regression/SMA signals, position simulation with a
# trailing stop, then one chart.
for inst in df.columns:
    prices = df[inst].values
    n      = len(prices)

    # 1) rolling linear regression
    # linreg[t] is the fitted line's value at the newest bar of the window;
    # it stays NaN for the first LINREG_WINDOW - 1 bars.
    linreg   = np.full(n, np.nan, dtype=float)
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    model    = LinearRegression()

    for t in range(LINREG_WINDOW - 1, n):
        y = prices[t - LINREG_WINDOW + 1 : t + 1]
        model.fit(X_master, y)
        linreg[t] = model.predict([[LINREG_WINDOW - 1]])[0]

    # 2) rolling SMA (min_periods=1: averages whatever history exists early on)
    sma = pd.Series(prices).rolling(window=SMA_WINDOW, min_periods=1).mean().values

    # 3) simulate positions with trailing stop-loss
    # high_since_entry / low_since_entry track the best price seen while a
    # long / short is open; each is seeded with the entry price on entry.
    pos               = np.zeros(n, dtype=int)  # 0=flat, +1=long, -1=short
    current           = 0
    high_since_entry  = np.nan
    low_since_entry   = np.nan

    for t in range(n):
        p, lr, ma = prices[t], linreg[t], sma[t]

        if not np.isnan(lr):
            if current == 0:
                # entry logic: long below the regression value but above the
                # SMA; short above the regression value but below the SMA
                if p < lr and p > ma:
                    current          = +1
                    high_since_entry = p
                elif p > lr and p < ma:
                    current         = -1
                    low_since_entry = p

            elif current == +1:
                # update running high
                high_since_entry = max(high_since_entry, p)
                # exit on linreg cross or trailing stop
                # (price has fallen TRAIL_PCT below the running high)
                if p > lr or p <= high_since_entry * (1 - TRAIL_PCT):
                    current = 0

            elif current == -1:
                # update running low
                low_since_entry = min(low_since_entry, p)
                # exit on linreg cross or trailing stop
                # (price has risen TRAIL_PCT above the running low)
                if p < lr or p >= low_since_entry * (1 + TRAIL_PCT):
                    current = 0

        pos[t] = current

    # 4) plot: price line with the full price range shaded where a position is held
    fig, ax = plt.subplots(figsize=(12, 4))
    days = np.arange(n)

    ax.plot(days, prices, color="black", label="Price")
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == +1, facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == -1, facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Instrument {inst} — Price with Long (green) & Short (red)")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# PARAMETERS — adjust as needed
N_DAYS         = 750    # how many rows (days) to plot
N_INSTRUMENTS  =11      # how many columns (instruments) to use
LINREG_WINDOW  = 5      # length of rolling linear regression
EMA_SPAN       = 100    # span for exponential moving average

# find prices.txt: check the working directory, then each parent in turn,
# and keep the first match; the for/else raises when nothing was found
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found in this directory or any parent")

print(f"Loading prices.txt from: {prices_path}")
# Whitespace-separated table with no header row, trimmed to the first
# N_DAYS rows and first N_INSTRUMENTS columns.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# Per instrument: regression/EMA signals, position simulation, then one chart.
for inst in df.columns:
    prices = df[inst].values
    n      = len(prices)

    # 1) rolling linear regression
    # linreg[t] is the fitted line's value at the newest bar of the window;
    # it stays NaN for the first LINREG_WINDOW - 1 bars.
    linreg   = np.full(n, np.nan, dtype=float)
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    model    = LinearRegression()

    for t in range(LINREG_WINDOW - 1, n):
        y = prices[t - LINREG_WINDOW + 1 : t + 1]
        model.fit(X_master, y)
        linreg[t] = model.predict([[LINREG_WINDOW - 1]])[0]

    # 2) EMA of price (recursive form, adjust=False)
    ema = pd.Series(prices).ewm(span=EMA_SPAN, adjust=False).mean().values

    # 3) simulate your entry/exit logic
    # State machine, one decision per bar, using that bar's own price:
    #   flat  -> long  when price is below the regression value and above the EMA
    #   flat  -> short when price is above the regression value and below the EMA
    #   long  -> flat  when price rises above the regression value
    #   short -> flat  when price falls below the regression value
    pos     = np.zeros(n, dtype=int)   # 0=flat, +1=long, -1=short
    current = 0

    for t in range(n):
        p, lr, e = prices[t], linreg[t], ema[t]
        if not np.isnan(lr):
            if current == 0:
                if p < lr and p > e:
                    current = +1
                elif p > lr and p < e:
                    current = -1
            elif current == +1 and p > lr:
                current = 0
            elif current == -1 and p < lr:
                current = 0
        pos[t] = current

    # 4) Plot: price line with the full price range shaded where a position is held
    fig, ax = plt.subplots(figsize=(12, 4))
    days = np.arange(n)

    ax.plot(days, prices, color="black", label="Price")
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == +1, facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == -1, facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Instrument {inst} — Price with Long (green) & Short (red)")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# PARAMETERS — tweak to taste
N_DAYS        = 750    # days to plot
N_INSTRUMENTS = 50     # instruments (columns)
LINREG_WINDOW = 5      # lin-reg window length
EMA_SPAN      = 100    # EMA span
MIN_RUN       = 20     # width (in days) of the median filter used to suppress short regimes

# locate prices.txt: check the working directory, then each parent in turn,
# and keep the first match; the for/else raises when nothing was found
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

# load & trim: whitespace-separated, no header row, first N_DAYS rows and
# first N_INSTRUMENTS columns
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# Per instrument: raw regression/EMA positions, forward-filled into
# continuous regimes, median-smoothed, then charted.
for inst in df.columns:
    prices = df[inst].values
    n      = len(prices)

    # 1) rolling lin-reg
    # linreg[t] is the fitted line's value at the newest bar of the window;
    # it stays NaN for the first LINREG_WINDOW - 1 bars.
    linreg   = np.full(n, np.nan, dtype=float)
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    lr_model = LinearRegression()
    for t in range(LINREG_WINDOW - 1, n):
        y = prices[t - LINREG_WINDOW + 1 : t + 1]
        lr_model.fit(X_master, y)
        linreg[t] = lr_model.predict([[LINREG_WINDOW - 1]])[0]

    # 2) EMA of price (recursive form, adjust=False)
    ema = pd.Series(prices).ewm(span=EMA_SPAN, adjust=False).mean().values

    # 3) build raw pos + fallback of flats
    # Raw state machine: enter long below the regression value but above the
    # EMA, enter short above the regression value but below the EMA, and go
    # flat again when price crosses back over the regression value.
    pos     = np.zeros(n, dtype=int)
    current = 0
    for t in range(n):
        p, lr, e = prices[t], linreg[t], ema[t]
        if not np.isnan(lr):
            if current == 0:
                if p < lr and p > e:
                    current = +1
                elif p > lr and p < e:
                    current = -1
            elif current == +1 and p > lr:
                current = 0
            elif current == -1 and p < lr:
                current = 0
        pos[t] = current

    # fallback: fill flats (0) with last non-zero
    # (bars before the first non-zero position remain 0)
    pos_fallback = pos.copy()
    last = 0
    for t in range(n):
        if pos_fallback[t] == 0:
            pos_fallback[t] = last
        else:
            last = pos_fallback[t]

    # 4) smooth the regime series with a centered rolling median of width
    #    MIN_RUN, then round back to integers
    #    NOTE(review): a median filter of width W generally suppresses runs
    #    shorter than about W/2, not W; also, with an even window the median
    #    can be +/-0.5, which rounds to 0 (flat) — confirm this is intended
    pos_clean = (
        pd.Series(pos_fallback)
          .rolling(window=MIN_RUN, center=True, min_periods=1)
          .median()
          .round()
          .astype(int)
          .values
    )

    # 5) plot: price line with the full price range shaded by smoothed regime
    days = np.arange(n)
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot( days, prices, color="black", label="Price" )
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos_clean == +1, facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos_clean == -1, facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Instrument {inst} — Regimes (min run = {MIN_RUN} days)")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# --- CONFIG ---
NUM_INST  = 5    # how many instruments (columns) to plot
WINDOW    = 20   # length of each regression window
EWM_SPAN  = 10   # span for exponential weighting (alpha = 2/(span+1))

# --- find prices.txt ---
# Check the working directory, then each parent in turn, and keep the first
# match; the for/else raises when nothing was found.
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

# --- load data ---
# Whitespace-separated table with no header row; all rows are kept.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:, :NUM_INST]  # take first NUM_INST columns
n  = df.shape[0]            # number of rows, shared by every instrument loop below

# ─── helper: EWM‐weighted regression slope ──────────────────────────────────
def ewm_slope(arr: np.ndarray, span: int) -> float:
    """Slope of an exponentially weighted least-squares line through ``arr``.

    The samples are regressed on their positions 0..len(arr)-1.  Sample
    weights decay geometrically with age by a factor of ``1 - alpha`` per
    step, where ``alpha = 2 / (span + 1)``: the newest sample has weight 1
    and the oldest ``(1 - alpha) ** (len(arr) - 1)``.

    Args:
        arr: window of values, oldest first.
        span: exponential-weighting span.

    Returns:
        The fitted slope (change in value per position step).
    """
    count = len(arr)
    positions = np.arange(count, dtype=float)
    decay = 1 - 2.0 / (span + 1.0)
    # age 0 is the newest sample, age count-1 the oldest
    ages = count - 1 - positions
    sample_w = decay ** ages
    fitted = LinearRegression().fit(
        positions.reshape(-1, 1), arr, sample_weight=sample_w
    )
    return fitted.coef_[0]

# --- loop instruments & plot ---
for inst in df.columns:
    prices = df[inst].values
    pos     = np.zeros(n, dtype=int)  # +1=long, -1=short

    # compute position each day
    # The window is the trailing WINDOW bars, or every bar so far while fewer
    # than WINDOW exist.  Always in the market: long on a strictly positive
    # weighted slope, short otherwise (a slope of exactly 0 counts as short).
    for t in range(n):
        w = WINDOW if t+1 >= WINDOW else t+1
        windowed = prices[t-w+1 : t+1]
        slope = ewm_slope(windowed, EWM_SPAN)
        pos[t] = +1 if slope > 0 else -1

    # plot: price line with the full price range shaded by position
    days = np.arange(n)
    fig, ax = plt.subplots(figsize=(12, 4))

    ax.plot(days, prices, color="black", label="Price")
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == +1, facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days, prices.min(), prices.max(),
                    where=pos == -1, facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Instrument {inst} — Long/Short via {WINDOW}-day EWM‐Reg")
    ax.set_xlim(0, n-1)
    ax.set_xlabel("Day")
    ax.set_ylabel("Price")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# --- CONFIG ---
NUM_INST = 5     # how many instruments (columns) to plot
WINDOW   = 5     # window size for the normalized linear regression
T_START  = 120   # first day to plot (inclusive)
T_END    = 250   # last day to plot  (exclusive)

# --- find prices.txt ---
# Check the working directory, then each parent in turn, and keep the first
# match; the for/else raises when nothing was found.
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

# --- load data ---
# Whitespace-separated table with no header row; all rows are kept.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:, :NUM_INST]  # take first NUM_INST columns
n  = df.shape[0]            # number of rows, shared by every instrument loop below

# ─── helper: normalized‐regression slope ────────────────────────────────────
def norm_slope(arr: np.ndarray) -> float:
    """Slope of a least-squares line fitted after min-max scaling both axes.

    Positions 0..len(arr)-1 and the values in ``arr`` are each shifted to
    start at 0 and divided by their range, so both lie in [0, 1] before the
    fit.  A zero range (a single sample, or a constant window) is replaced
    by 1 to avoid dividing by zero.

    Returns:
        The fitted slope in the rescaled coordinates.
    """
    steps = np.arange(len(arr), dtype=float)

    x_shifted = steps - steps.min()
    x_unit = x_shifted / (np.ptp(steps) or 1)

    y_shifted = arr - arr.min()
    y_unit = y_shifted / (np.ptp(arr) or 1)

    model = LinearRegression()
    model.fit(x_unit.reshape(-1, 1), y_unit)
    return model.coef_[0]

# --- loop instruments & plot ---
for inst in df.columns:
    prices = df[inst].values
    pos     = np.zeros(n, dtype=int)  # +1=long, -1=short

    # compute pos each day
    # The window is the trailing WINDOW bars, or every bar so far while fewer
    # than WINDOW exist.  Always in the market: long on a strictly positive
    # slope, short otherwise (a slope of exactly 0 counts as short).
    for t in range(n):
        w = WINDOW if t+1 >= WINDOW else t+1
        windowed = prices[t-w+1 : t+1]
        pos[t] = +1 if norm_slope(windowed) > 0 else -1

    # restrict to [T_START, T_END)
    # (positions were still computed over the full history above)
    days_slice   = np.arange(n)[T_START:T_END]
    prices_slice = prices[T_START:T_END]
    pos_slice    = pos[T_START:T_END]

    # plot: price line with the slice's price range shaded by position
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(days_slice, prices_slice, color="black", label="Price")
    ax.fill_between(days_slice,
                    prices_slice.min(), prices_slice.max(),
                    where=pos_slice == +1,
                    facecolor="green", alpha=0.3,
                    interpolate=True)
    ax.fill_between(days_slice,
                    prices_slice.min(), prices_slice.max(),
                    where=pos_slice == -1,
                    facecolor="red", alpha=0.3,
                    interpolate=True)

    ax.set_title(f"Inst {inst} — Long/Short via {WINDOW}-day Norm-Reg (t={T_START}–{T_END})")
    ax.set_xlim(T_START, T_END-1)
    ax.set_xlabel("Day")
    ax.set_ylabel("Price")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# --- CONFIG ---
NUM_INST = 5    # how many instruments (columns) to plot
WINDOW   = 10    # window size for the normalized linear regression
T1       = 100  # first timestep to plot (inclusive)
T2       = 300  # last  timestep to plot (exclusive)

# --- find prices.txt ---
# Check the working directory, then each parent in turn, and keep the first
# match; the for/else raises when nothing was found.
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

# --- load data ---
# Whitespace-separated table with no header row; all rows are kept.
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:, :NUM_INST]  # take first NUM_INST columns
n  = df.shape[0]            # number of rows, shared by every instrument loop below

# ─── helper: normalized‐regression slope ────────────────────────────────────
def norm_slope(arr: np.ndarray) -> float:
    """Slope of a least-squares line fitted after min-max scaling both axes.

    Positions 0..len(arr)-1 and the values in ``arr`` are each shifted to
    start at 0 and divided by their range, so both lie in [0, 1] before the
    fit.  A zero range (a single sample, or a constant window) is replaced
    by 1 to avoid dividing by zero.

    Returns:
        The fitted slope in the rescaled coordinates.
    """
    steps = np.arange(len(arr), dtype=float)

    x_shifted = steps - steps.min()
    x_unit = x_shifted / (np.ptp(steps) or 1)

    y_shifted = arr - arr.min()
    y_unit = y_shifted / (np.ptp(arr) or 1)

    model = LinearRegression()
    model.fit(x_unit.reshape(-1, 1), y_unit)
    return model.coef_[0]

# --- loop instruments & plot ---
for inst in df.columns:
    prices = df[inst].values

    # 1) compute slope at each day
    # The window is the trailing WINDOW bars, or every bar so far while fewer
    # than WINDOW exist.
    slopes = np.zeros(n, dtype=float)
    for t in range(n):
        w        = WINDOW if t+1 >= WINDOW else t+1
        windowed = prices[t-w+1 : t+1]
        slopes[t] = norm_slope(windowed)

    # 2) build colour map arrays
    # Positive and negative slopes are scaled separately by their own largest
    # magnitude over the whole series (falling back to 1.0 when that is 0),
    # so each colormap uses its full range.  Non-negative slopes are drawn
    # with 'Reds', negative slopes with 'Blues'.
    pos_slopes = np.clip(slopes, 0, None)
    neg_slopes = np.clip(-slopes, 0, None)
    max_pos    = pos_slopes.max() or 1.0
    max_neg    = neg_slopes.max() or 1.0
    cmap_pos   = plt.get_cmap('Reds')
    cmap_neg   = plt.get_cmap('Blues')

    colors = []
    for s in slopes:
        if s >= 0:
            colors.append(cmap_pos(s / max_pos))
        else:
            colors.append(cmap_neg(-s / max_neg))
    colors = np.array(colors)

    # 3) slice for plotting: keep only timesteps in [T1, T2)
    days         = np.arange(n)
    days_slice   = days[T1:T2]
    prices_slice = prices[T1:T2]
    colors_slice = colors[T1:T2]

    # 4) plot background heat-map bars + price line
    fig, ax = plt.subplots(figsize=(12, 4))

    # one full-height, unit-width bar per day, coloured by that day's slope
    bar_height = prices_slice.max() - prices_slice.min()
    ax.bar(days_slice,
           bar_height,
           bottom=prices_slice.min(),
           color=colors_slice,
           width=1.0,
           align='edge',
           edgecolor='none',
           alpha=0.5)

    ax.plot(days_slice, prices_slice, color="black", label="Price")

    ax.set_title(
        f"Instrument {inst} — t={T1} to {T2} Heat‐map by Slope (window={WINDOW})"
    )
    ax.set_xlim(T1, T2-1)
    ax.set_xlabel("Day")
    ax.set_ylabel("Price")
    ax.legend(loc="lower left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression

# PARAMETERS — tweak to taste
N_DAYS        = 120    # days to plot
N_INSTRUMENTS = 5     # instruments (columns)
LINREG_WINDOW = 20      # lin-reg window length
EMA_SPAN      = 100    # EMA span
MIN_RUN       = 20     # width (in days) of the median filter used to suppress short regimes

# locate prices.txt: check the working directory, then each parent in turn,
# and keep the first match; the for/else raises when nothing was found
cwd = Path.cwd()
for folder in (cwd, *cwd.parents):
    if (folder / "prices.txt").exists():
        prices_path = folder / "prices.txt"
        break
else:
    raise FileNotFoundError("prices.txt not found")

# load & trim: whitespace-separated, no header row, first N_DAYS rows and
# first N_INSTRUMENTS columns
df = pd.read_csv(prices_path, sep=r"\s+", header=None)
df = df.iloc[:N_DAYS, :N_INSTRUMENTS]

# helper: compute normalized‐slope
def norm_slope(arr: np.ndarray) -> float:
    """Return the least-squares slope of *arr* after min-max scaling both axes.

    The sample index and the values are each rescaled to [0, 1]; a zero
    value range is treated as 1, so a flat window maps to all zeros.  The
    ordinary-least-squares slope of value against index is then computed in
    closed form, cov(x, y) / var(x).  That is the coefficient a one-feature
    linear regression with intercept produces, without constructing and
    fitting an estimator on every call.

    Windows with fewer than two points carry no trend and yield 0.0.
    NaNs in *arr* propagate to a NaN result.
    """
    y = np.asarray(arr, dtype=float)
    m = y.size
    if m < 2:
        return 0.0
    # Index 0..m-1 scaled to [0, 1].
    x = np.arange(m, dtype=float) / (m - 1)
    y = (y - y.min()) / (np.ptp(y) or 1)
    x_centered = x - x.mean()
    return float(x_centered @ (y - y.mean()) / (x_centered @ x_centered))

for inst in df.columns:
    prices = df[inst].values
    n = len(prices)
    days = np.arange(n)

    # 1) Rolling linear regression: fit every trailing LINREG_WINDOW-day
    #    window and keep the fitted value at the window's last day.  Days
    #    before the first full window stay NaN.
    X_master = np.arange(LINREG_WINDOW).reshape(-1, 1)
    lr_model = LinearRegression()
    linreg = np.full(n, np.nan, dtype=float)
    for stop in range(LINREG_WINDOW, n + 1):
        lr_model.fit(X_master, prices[stop - LINREG_WINDOW : stop])
        linreg[stop - 1] = lr_model.predict([[LINREG_WINDOW - 1]])[0]

    # 2) Exponential moving average of the raw prices.
    ema = pd.Series(prices).ewm(span=EMA_SPAN, adjust=False).mean().values

    # 3) Raw regime state machine.  From flat: +1 when price sits between
    #    the EMA (below) and the regression value (above), -1 for the
    #    mirrored case.  A +1 regime ends when price rises above the
    #    regression value, a -1 regime when it falls below it.  While the
    #    regression value is still NaN the state is simply carried.
    pos = np.zeros(n, dtype=int)
    current = 0
    for t, (p, lr, e) in enumerate(zip(prices, linreg, ema)):
        if np.isnan(lr):
            pos[t] = current
            continue
        if current == 0:
            if e < p < lr:
                current = +1
            elif lr < p < e:
                current = -1
        elif (current == +1 and p > lr) or (current == -1 and p < lr):
            current = 0
        pos[t] = current

    # 4) Fallback fill: a flat day inherits the most recent non-flat regime;
    #    flat days before the first regime remain 0.
    pos_fb = (
        pd.Series(np.where(pos == 0, np.nan, pos))
          .ffill()
          .fillna(0)
          .astype(int)
          .values
    )

    # 5) Smooth the regimes with a centred rolling median of MIN_RUN days.
    rolling_median = (
        pd.Series(pos_fb)
          .rolling(window=MIN_RUN, center=True, min_periods=1)
          .median()
    )
    pos_clean = rolling_median.round().astype(int).values

    # 6) Normalized slope of the trailing window ending at each day; the
    #    window is shorter than LINREG_WINDOW for the first few days.
    slopes = np.array(
        [norm_slope(prices[max(0, t - LINREG_WINDOW + 1) : t + 1]) for t in range(n)],
        dtype=float,
    )

    # Diverging colormap centred on zero: red = negative slope,
    # yellow = around zero, green = positive slope.
    cmap = plt.get_cmap("RdYlGn")
    max_abs = np.abs(slopes).max() or 1.0
    norm = plt.Normalize(-max_abs, max_abs)

    # 7) Plot.
    fig, ax = plt.subplots(figsize=(12, 4))
    price_lo, price_hi = prices.min(), prices.max()

    # Full-height background bars, one per day, coloured by slope.
    ax.bar(
        days,
        price_hi - price_lo,
        bottom=price_lo,
        color=cmap(norm(slopes)),
        width=1.0,
        align="edge",
        edgecolor="none",
        alpha=0.5,
    )

    # Regime outlines: green box for +1, red box for -1.
    for regime, outline in ((+1, "green"), (-1, "red")):
        ax.fill_between(
            days, price_lo, price_hi,
            where=pos_clean == regime,
            facecolor="none", edgecolor=outline, linewidth=2, alpha=0.6,
        )

    ax.plot(days, prices, color="black", label="Price")

    # Colourbar keyed to the same normalisation as the background bars.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax, pad=0.02)
    cbar.set_label("Normalized LR slope")

    ax.set_title(f"Instrument {inst} — Regimes + Slope Heat‐map")
    ax.set_xlim(0, n)
    ax.set_ylabel("Price")
    ax.set_xlabel("Day")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit

# ─── CONFIG ────────────────────────────────────────────────────────────────
NUM_INST = 50          # leading columns (instruments) of prices.txt to model
EMA_ALPHA = 0.0095     # alpha of the EMA that smooths each price series
HORIZON = 30           # the label compares the EMA this many days ahead
SHORT_WIN = 5          # window (days) of the fast normalized slope
LONG_WIN = 30          # window (days) of the slow normalized slope
VOL_WIN = 30           # window (days) of the rolling return volatility
N_ESTIMATORS = 100     # number of trees in each random forest
MAX_FEAT = "sqrt"      # max_features setting of each random forest
RS = 42                # random_state of each random forest
N_SPLITS = 5           # number of TimeSeriesSplit folds

# ─── find prices.txt (working directory first, then each ancestor) ─────────
cwd = Path.cwd()
hits = [
    folder / "prices.txt"
    for folder in (cwd, *cwd.parents)
    if (folder / "prices.txt").exists()
]
if not hits:
    raise FileNotFoundError("prices.txt not found")
prices_path = hits[0]

# ─── load price matrix, keeping only the first NUM_INST columns ────────────
df_prices = pd.read_csv(prices_path, sep=r"\s+", header=None).iloc[:, :NUM_INST]
n_days, n_inst = df_prices.shape

# ─── helper: normalized slope ──────────────────────────────────────────────
def norm_slope(arr: np.ndarray) -> float:
    """Slope of the least-squares line through min-max-normalized *arr*.

    Both the sample index and the values are rescaled to [0, 1] (a zero
    value range is replaced by 1, so a constant window becomes all zeros).
    The slope is evaluated directly as cov(x, y) / var(x), the same number
    a single-feature linear regression with intercept returns, so no
    estimator object has to be created and fitted per window.

    Fewer than two points define no trend and give 0.0; NaNs in *arr*
    propagate to a NaN result.
    """
    values = np.asarray(arr, dtype=float)
    m = values.size
    if m < 2:
        return 0.0
    x = np.arange(m, dtype=float) / (m - 1)          # 0..m-1 scaled to [0, 1]
    y = (values - values.min()) / (np.ptp(values) or 1)
    dx = x - x.mean()
    dy = y - y.mean()
    return float((dx @ dy) / (dx @ dx))

# ─── loop instruments ──────────────────────────────────────────────────────
for inst in df_prices.columns:
    prices = df_prices[inst]

    # 1) EMA-smooth the entire series and take its one-day returns.
    smooth = prices.ewm(alpha=EMA_ALPHA, adjust=False).mean()
    rets = smooth.pct_change().fillna(0)

    # 2) Build the feature/label table.
    #    Start late enough that every look-back window (fast slope, slow
    #    slope, volatility) lies fully inside the series; with a smaller
    #    start a slice would begin at a negative index and silently select
    #    the wrong rows.  Stop HORIZON days early so each label exists.
    first_t = max(SHORT_WIN, LONG_WIN, VOL_WIN, HORIZON)
    records = []
    for t in range(first_t, n_days - HORIZON):
        slope_short = norm_slope(smooth.iloc[t - SHORT_WIN + 1 : t + 1].values)
        slope_long = norm_slope(smooth.iloc[t - LONG_WIN + 1 : t + 1].values)
        records.append({
            "t": t,
            "slope_short": slope_short,
            "slope_long": slope_long,
            "slope_diff": slope_long - slope_short,
            "vol30": rets.iloc[t - VOL_WIN + 1 : t + 1].std(),
            "ret1": rets.iloc[t],
            # label: is the smoothed price higher HORIZON days later?
            "label": int(smooth.iloc[t + HORIZON] > smooth.iloc[t]),
        })

    df_feat = pd.DataFrame.from_records(records).dropna()
    if len(df_feat) <= N_SPLITS:
        # TimeSeriesSplit needs more samples than folds.
        print(f"\n=== Instrument {inst} ===")
        print(f"Only {len(df_feat)} usable rows, skipping.")
        continue

    feature_cols = ["slope_short", "slope_long", "slope_diff", "vol30", "ret1"]
    X = df_feat[feature_cols]
    y = df_feat["label"]

    # 3) Time-series CV.  The fold models are scored on their held-out
    #    block only, so no out-of-bag estimate is requested for them.
    accs = []
    for tr_idx, te_idx in TimeSeriesSplit(n_splits=N_SPLITS).split(X):
        rf = RandomForestClassifier(
            n_estimators=N_ESTIMATORS,
            max_features=MAX_FEAT,
            random_state=RS,
        )
        rf.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        accs.append(accuracy_score(y.iloc[te_idx], rf.predict(X.iloc[te_idx])))

    # 4) Final fit on all rows; the out-of-bag score is reported below.
    rf_final = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        max_features=MAX_FEAT,
        oob_score=True,
        random_state=RS,
    )
    rf_final.fit(X, y)
    oob = rf_final.oob_score_
    preds = rf_final.predict(X)   # in-sample predictions, reused for the plot

    print(f"\n=== Instrument {inst} ===")
    print("CV accuracies:", ", ".join(f"{a:.3f}" for a in accs),
          f"→ mean {np.mean(accs):.3f}")
    print(f"OOB accuracy: {oob:.3f}")
    # NOTE: this report is computed on the rows the model was fitted on.
    # A fixed two-name target list raises when the label column contains a
    # single class, so the names are derived from the classes present.
    classes = sorted(y.unique())
    if len(classes) > 1:
        names = ["Down" if c == 0 else "Up" for c in classes]
        print(classification_report(y, preds, labels=classes, target_names=names))
    else:
        print(f"Only one class present for Inst {inst} (class {classes[0]}), "
              "skipping classification_report.")

    # 5) Plot the smoothed price with the predicted direction shaded.
    days = df_feat["t"].values

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(days, smooth.iloc[days], color="black", label="Smoothed Price")
    ax.fill_between(days,
                    smooth.min(), smooth.max(),
                    where=preds==1,
                    facecolor="green", alpha=0.3,
                    interpolate=True,
                    label="Predict Up")
    ax.fill_between(days,
                    smooth.min(), smooth.max(),
                    where=preds==0,
                    facecolor="red", alpha=0.3,
                    interpolate=True,
                    label="Predict Down")

    ax.set_title(f"Instrument {inst} — 30-day RF Predictions (CV mean {np.mean(accs):.3f}, OOB {oob:.3f})")
    ax.set_xlabel("Day")
    ax.set_ylabel("Price (EMA)")
    ax.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
import time
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

# ─── CONFIG ────────────────────────────────────────────────────────────────
NUM_INST = 50            # leading columns of prices.txt that are modelled
EMA_ALPHA = 0.0095       # alpha of the EMA that smooths each price series
HORIZON = 30             # the label compares the EMA this many days ahead
SHORT_WIN = 5            # window (days) of the fast normalized slope
LONG_WIN = 30            # window (days) of the slow normalized slope
VOL_WIN = 30             # window (days) of the rolling return volatility
N_ESTIMATORS = 100       # number of trees in each random forest
MAX_FEAT = "sqrt"        # max_features setting of each random forest
RS = 42                  # random_state of each random forest
N_SPLITS = 5             # number of TimeSeriesSplit folds
ENTER_THRESH = 0.6       # long if P(up) > this, short if P(up) < 1 - this
EXIT_THRESH = 0.5        # close long if P(up) < this, short if P(up) > 1 - this
STOP_LOSS_PCT = 0.03     # adverse move vs. entry price that forces an exit
RISK_TARGET = 0.01       # numerator of the position size (risk_target / vol)
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)   # created relative to the working directory

# ─── locate prices.txt (working directory first, then each ancestor) ───────
cwd = Path.cwd()
PRICES_PATH = next(
    (
        candidate
        for candidate in (base / "prices.txt" for base in (cwd, *cwd.parents))
        if candidate.exists()
    ),
    None,
)
if PRICES_PATH is None:
    raise FileNotFoundError("prices.txt not found")

# ─── HELPERS ────────────────────────────────────────────────────────────────
def norm_slope(arr: np.ndarray) -> float:
    """Min–max normalize indices & values and return the least-squares slope.

    The index axis and the values are each scaled to [0, 1]; a zero value
    range is replaced by 1 so a constant window becomes all zeros.  The
    slope is computed in closed form as cov(x, y) / var(x) — the
    coefficient of a one-feature linear regression with intercept —
    instead of fitting an estimator object on every call.

    Returns 0.0 for windows with fewer than two points.  NaNs in *arr*
    propagate to a NaN result.
    """
    y = np.asarray(arr, dtype=float)
    m = y.size
    if m < 2:
        return 0.0
    x = np.arange(m, dtype=float) / (m - 1)   # 0..m-1 scaled to [0, 1]
    y = (y - y.min()) / (np.ptp(y) or 1)
    x_dev = x - x.mean()
    return float(np.dot(x_dev, y - y.mean()) / np.dot(x_dev, x_dev))

def calc_features(smooth: pd.Series) -> pd.DataFrame:
    """Build the feature/label table for an EMA-smoothed price series.

    One row is produced per day ``t`` for which every look-back window
    (SHORT_WIN, LONG_WIN, VOL_WIN) fits inside the series and the price
    HORIZON days ahead exists.  Columns:

      t           positional index of the day in *smooth*
      slope_fast  normalized slope over the last SHORT_WIN days
      slope_slow  normalized slope over the last LONG_WIN days
      slope_diff  slope_slow - slope_fast
      volatility  standard deviation of the last VOL_WIN one-day returns
      ret1        one-day return at t
      label       1 if smooth[t + HORIZON] > smooth[t], else 0

    Rows containing NaN are dropped and the index is reset.  A series that
    is too short yields an empty frame that still has these columns.
    """
    columns = ["t", "slope_fast", "slope_slow", "slope_diff",
               "volatility", "ret1", "label"]
    rets = smooth.pct_change().fillna(0)
    n_days = len(smooth)
    # Every window must start at a non-negative position, otherwise the
    # slice would wrap around and pick the wrong rows.
    first_t = max(SHORT_WIN, LONG_WIN, VOL_WIN, HORIZON)

    records = []
    for t in range(first_t, n_days - HORIZON):
        # Each slope is fitted once and reused for slope_diff.
        slope_fast = norm_slope(smooth.iloc[t - SHORT_WIN + 1 : t + 1].values)
        slope_slow = norm_slope(smooth.iloc[t - LONG_WIN + 1 : t + 1].values)
        records.append({
            "t":           t,
            "slope_fast":  slope_fast,
            "slope_slow":  slope_slow,
            "slope_diff":  slope_slow - slope_fast,
            "volatility":  rets.iloc[t - VOL_WIN + 1 : t + 1].std(),
            "ret1":        rets.iloc[t],
            "label":       int(smooth.iloc[t + HORIZON] > smooth.iloc[t]),
        })
    return (
        pd.DataFrame.from_records(records, columns=columns)
          .dropna()
          .reset_index(drop=True)
    )

def position_size(vol: float, risk_target: float = RISK_TARGET) -> float:
    """Return the position size as a fraction of equity: ``risk_target / vol``.

    Whenever ``vol > 0`` is false (zero, negative or NaN volatility) the
    size is 0.0.
    """
    if vol > 0:
        return float(risk_target / vol)
    return 0.0

# ─── MODEL PIPELINE ────────────────────────────────────────────────────────
def retrain_and_save():
    """Fit one random forest per instrument and store it under MODEL_DIR.

    For each of the first NUM_INST columns of the price file the series is
    EMA-smoothed, turned into a feature/label table by ``calc_features``,
    scored with an N_SPLITS-fold ``TimeSeriesSplit`` and finally refitted
    on all rows.  The refitted model is written to
    ``MODEL_DIR / "rf_model_inst<inst>.joblib"`` and the mean CV accuracy
    plus the final model's out-of-bag accuracy are printed.
    """
    feature_cols = ["slope_fast", "slope_slow", "slope_diff", "volatility", "ret1"]
    df_prices = pd.read_csv(PRICES_PATH, sep=r"\s+", header=None).iloc[:, :NUM_INST]

    for inst in df_prices.columns:
        smooth = df_prices[inst].ewm(alpha=EMA_ALPHA, adjust=False).mean()
        df_feat = calc_features(smooth)
        X = df_feat[feature_cols]
        y = df_feat["label"]

        # Time-series CV.  Fold models are only scored on their held-out
        # block, so they skip the out-of-bag estimate that was previously
        # computed and thrown away; the fitted trees are unaffected.
        accs = []
        for train_idx, test_idx in TimeSeriesSplit(n_splits=N_SPLITS).split(X):
            rf = RandomForestClassifier(
                n_estimators=N_ESTIMATORS,
                max_features=MAX_FEAT,
                random_state=RS,
            )
            rf.fit(X.iloc[train_idx], y.iloc[train_idx])
            accs.append(accuracy_score(y.iloc[test_idx], rf.predict(X.iloc[test_idx])))

        # Final fit on every row; its out-of-bag score is reported below.
        model = RandomForestClassifier(
            n_estimators=N_ESTIMATORS,
            max_features=MAX_FEAT,
            oob_score=True,
            random_state=RS,
        )
        model.fit(X, y)
        joblib.dump(model, MODEL_DIR / f"rf_model_inst{inst}.joblib")
        print(f"Inst {inst}: CV mean acc={np.mean(accs):.3f}, OOB acc={model.oob_score_:.3f}")


def live_signal_and_trade():
    """Print an entry/exit decision per instrument from the latest bar.

    For each instrument the EMA-smoothed series is rebuilt, the five model
    features are computed on the windows ending at the LAST available day,
    and the saved forest's P(up) drives a small state machine:

      - flat: enter long if P(up) > ENTER_THRESH, enter short if
        P(up) < 1 - ENTER_THRESH;
      - in a position: exit when P(up) crosses EXIT_THRESH against the
        position, or when the smoothed price has moved STOP_LOSS_PCT
        against the entry price.

    The open position of each instrument is kept between calls in the
    function attribute ``state_<inst>`` as ``(side, entry_price)`` or None.

    The features are intentionally not read from ``calc_features``: that
    table stops HORIZON days before the end of the data (later rows have no
    label yet), so its last row describes a day HORIZON bars in the past
    rather than today.
    """
    feature_cols = ["slope_fast", "slope_slow", "slope_diff", "volatility", "ret1"]
    min_history = max(SHORT_WIN, LONG_WIN, VOL_WIN)
    df_prices = pd.read_csv(PRICES_PATH, sep=r"\s+", header=None).iloc[:, :NUM_INST]

    for inst in df_prices.columns:
        smooth = df_prices[inst].ewm(alpha=EMA_ALPHA, adjust=False).mean()
        if len(smooth) < min_history:
            print(f"[Inst {inst}] not enough history for a signal, skipping")
            continue

        # Same feature definitions as the training table, evaluated on the
        # windows that end at the final row.
        rets = smooth.pct_change().fillna(0)
        slope_fast = norm_slope(smooth.iloc[-SHORT_WIN:].values)
        slope_slow = norm_slope(smooth.iloc[-LONG_WIN:].values)
        vol = rets.iloc[-VOL_WIN:].std()
        if pd.isna(vol):
            print(f"[Inst {inst}] volatility undefined, skipping")
            continue
        X_today = pd.DataFrame(
            [{
                "slope_fast": slope_fast,
                "slope_slow": slope_slow,
                "slope_diff": slope_slow - slope_fast,
                "volatility": vol,
                "ret1": rets.iloc[-1],
            }],
            columns=feature_cols,
        )

        model = joblib.load(MODEL_DIR / f"rf_model_inst{inst}.joblib")
        # A forest trained on a single class exposes one probability
        # column, so look the "up" class (1) up instead of assuming it is
        # column 1.
        classes = list(model.classes_)
        proba = model.predict_proba(X_today)[0]
        p_up = proba[classes.index(1)] if 1 in classes else 0.0

        size = position_size(vol)
        # NOTE(review): entry and stop levels use the smoothed price, not
        # the raw price — confirm this is intended.
        price = smooth.iloc[-1]

        state_attr = f"state_{inst}"
        state = getattr(live_signal_and_trade, state_attr, None)

        if state is None:
            if p_up > ENTER_THRESH:
                print(f"[Inst {inst}] ENTER LONG size={size:.2f}")
                setattr(live_signal_and_trade, state_attr, ("long", price))
            elif p_up < (1 - ENTER_THRESH):
                print(f"[Inst {inst}] ENTER SHORT size={size:.2f}")
                setattr(live_signal_and_trade, state_attr, ("short", price))
            continue

        pos_side, entry_price = state
        if pos_side == "long":
            exit_conf = p_up < EXIT_THRESH
            stop_hit = price <= entry_price * (1 - STOP_LOSS_PCT)
        else:
            exit_conf = p_up > 1 - EXIT_THRESH
            stop_hit = price >= entry_price * (1 + STOP_LOSS_PCT)
        if exit_conf or stop_hit:
            print(f"[Inst {inst}] EXIT {pos_side.upper()}")
            setattr(live_signal_and_trade, state_attr, None)


def plot_predictions():
    """Plot each instrument's saved-model predictions in two stacked panels.

    Top panel: the EMA-smoothed price over the days that have a feature
    row, shaded green where the model predicts 1 and red where it
    predicts 0.  Bottom panel: the raw price over all days, shaded by a
    position series that is +1/-1 on predicted days, carries the last
    value forward over days without a prediction and is 0 before the first
    prediction.
    """
    feature_cols = ["slope_fast", "slope_slow", "slope_diff", "volatility", "ret1"]
    df_prices = pd.read_csv(PRICES_PATH, sep=r"\s+", header=None).iloc[:, :NUM_INST]

    for inst in df_prices.columns:
        raw = df_prices[inst]
        smooth = raw.ewm(alpha=EMA_ALPHA, adjust=False).mean()
        df_feat = calc_features(smooth)
        model = joblib.load(MODEL_DIR / f"rf_model_inst{inst}.joblib")
        preds = model.predict(df_feat[feature_cols])
        days = df_feat["t"].values

        # Position on the raw-price axis: NaN where there is no prediction,
        # then forward-filled, with the leading gap set to 0.
        prices = raw.values
        pos = np.full(len(prices), np.nan)
        pos[days] = np.where(preds == 1, 1, -1)
        pos = pd.Series(pos).ffill().fillna(0).values

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        # Top panel: predictions over the smoothed price.
        ema_lo, ema_hi = smooth.min(), smooth.max()
        ax1.plot(days, smooth.iloc[days], color="black", label="Price (EMA)")
        for cls, shade, tag in ((1, "green", "Predict Up"), (0, "red", "Predict Down")):
            ax1.fill_between(
                days, ema_lo, ema_hi,
                where=preds == cls,
                facecolor=shade, alpha=0.3,
                interpolate=True,
                label=tag,
            )
        ax1.set_ylabel("Price (EMA)")
        ax1.legend(loc="upper left")

        # Bottom panel: positions over the raw price.
        all_days = np.arange(len(prices))
        raw_lo, raw_hi = prices.min(), prices.max()
        ax2.plot(all_days, prices, color="black", label="Raw Price")
        for side, shade in ((1, "green"), (-1, "red")):
            ax2.fill_between(
                all_days, raw_lo, raw_hi,
                where=pos == side,
                facecolor=shade, alpha=0.3,
                interpolate=True,
            )
        ax2.set_ylabel("Raw Price")
        ax2.set_xlabel("Day")
        ax2.legend(loc="upper left")

        plt.suptitle(f"Instrument {inst} — 30-day RF Predictions & Raw Price")
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

# ─── MAIN ──────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments while the pipeline runs.
    start = time.perf_counter()
    retrain_and_save()
    live_signal_and_trade()
    plot_predictions()
    print(f"Total runtime: {time.perf_counter()-start:.1f}s")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ─── CONFIG ────────────────────────────────────────────────────────────────
NUM_INST = 50            # leading columns of prices.txt that are modelled
EMA_ALPHA = 0.0095       # alpha of the EMA that smooths each price series
HORIZON = 5              # the label compares the EMA this many days ahead
SHORT_WIN = 5            # window (days) of the fast normalized slope
LONG_WIN = 30            # window (days) of the slow normalized slope
VOL_WIN = 30             # window (days) of the rolling return volatility
TRAIN_START = 0          # first day index used for training (inclusive)
TRAIN_END = 600          # end of the training window (exclusive)
TEST_START = 600         # first day index used for testing (inclusive)
TEST_END = 1000          # end of the test window (exclusive)
N_ESTIMATORS = 100       # number of trees in the random forest
MAX_FEAT = "sqrt"        # max_features setting of the random forest
RS = 42                  # random_state of the random forest
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)   # created relative to the working directory

# ─── locate prices.txt (working directory first, then each ancestor) ───────
cwd = Path.cwd()
search_dirs = [cwd, *cwd.parents]
PRICES_PATH = next(
    (d / "prices.txt" for d in search_dirs if (d / "prices.txt").exists()),
    None,
)
if PRICES_PATH is None:
    raise FileNotFoundError("prices.txt not found")

# ─── HELPERS ────────────────────────────────────────────────────────────────
def norm_slope(arr: np.ndarray) -> float:
    """Min–max normalize indices & values and return the least-squares slope.

    Index and values are each scaled to [0, 1]; a zero value range is
    replaced by 1 so a constant window becomes all zeros.  The slope is
    evaluated in closed form, cov(x, y) / var(x), which equals the
    coefficient of a one-feature linear regression with intercept but does
    not require fitting an estimator on every call.

    Returns 0.0 when *arr* has fewer than two points; NaNs in *arr*
    propagate to a NaN result.
    """
    vals = np.asarray(arr, dtype=float)
    count = vals.size
    if count < 2:
        return 0.0
    x = np.arange(count, dtype=float) / (count - 1)   # scaled to [0, 1]
    y = (vals - vals.min()) / (np.ptp(vals) or 1)
    xc = x - x.mean()
    yc = y - y.mean()
    return float(np.sum(xc * yc) / np.sum(xc * xc))


def calc_features(smooth: pd.Series) -> pd.DataFrame:
    """Compute features & labels for the EMA-smoothed series.

    One row is produced per day ``t`` for which every look-back window
    (SHORT_WIN, LONG_WIN, VOL_WIN) fits inside the series and the price
    HORIZON days ahead exists.  Columns:

      t           positional index of the day in *smooth*
      slope_fast  normalized slope over the last SHORT_WIN days
      slope_slow  normalized slope over the last LONG_WIN days
      slope_diff  slope_slow - slope_fast
      vol30       standard deviation of the last VOL_WIN one-day returns
      ret1        one-day return at t
      label       1 if smooth[t + HORIZON] > smooth[t], else 0

    Rows containing NaN are dropped and the index is reset.  A series that
    is too short yields an empty frame that still has these columns.
    """
    columns = ["t", "slope_fast", "slope_slow", "slope_diff",
               "vol30", "ret1", "label"]
    rets = smooth.pct_change().fillna(0)
    n = len(smooth)
    # Start only once every window begins at a non-negative position;
    # a negative slice start would wrap around and select the wrong rows.
    first_t = max(SHORT_WIN, LONG_WIN, VOL_WIN, HORIZON)

    records = []
    for t in range(first_t, n - HORIZON):
        # Fit each slope once and reuse it for slope_diff.
        fast = norm_slope(smooth.iloc[t - SHORT_WIN + 1 : t + 1].values)
        slow = norm_slope(smooth.iloc[t - LONG_WIN + 1 : t + 1].values)
        records.append({
            "t":          t,
            "slope_fast": fast,
            "slope_slow": slow,
            "slope_diff": slow - fast,
            "vol30":      rets.iloc[t - VOL_WIN + 1 : t + 1].std(),
            "ret1":       rets.iloc[t],
            "label":      int(smooth.iloc[t + HORIZON] > smooth.iloc[t]),
        })
    return pd.DataFrame(records, columns=columns).dropna().reset_index(drop=True)

# ─── TRAIN/TEST PIPELINE ───────────────────────────────────────────────────
def train_and_save():
    """Train one random forest per instrument on the train window and save it.

    For each of the first NUM_INST columns of the price file the series is
    EMA-smoothed and converted to a feature/label table.  Rows are split by
    their day index ``t``:

      - train: TRAIN_START <= t and t + HORIZON < TRAIN_END
      - test:  TEST_START  <= t < TEST_END

    The training window is shortened by HORIZON days because a label at day
    t is computed from the price at t + HORIZON; without the cut the last
    HORIZON training labels would be derived from prices at or beyond
    TRAIN_END.

    Train and test accuracy (and, when both classes occur in the test
    labels, a classification report) are printed, and the fitted model is
    written to ``MODEL_DIR / "rf_inst<inst>.joblib"``.  Instruments without
    any training rows are skipped with a message.
    """
    feature_cols = ["slope_fast", "slope_slow", "slope_diff", "vol30", "ret1"]
    df = pd.read_csv(PRICES_PATH, sep=r"\s+", header=None)
    df = df.iloc[:, :NUM_INST]

    for inst in df.columns:
        smooth = df[inst].ewm(alpha=EMA_ALPHA, adjust=False).mean()
        df_feat = calc_features(smooth)

        if df_feat.empty:
            print(f"Inst {inst}: series too short to build features, skipping.")
            continue

        # Select train/test rows by their day index.
        t = df_feat["t"]
        train_df = df_feat[(t >= TRAIN_START) & (t + HORIZON < TRAIN_END)]
        test_df = df_feat[(t >= TEST_START) & (t < TEST_END)]
        if train_df.empty:
            print(f"Inst {inst}: no training rows in [{TRAIN_START}, {TRAIN_END}), skipping.")
            continue

        X_train, y_train = train_df[feature_cols], train_df["label"]
        X_test, y_test = test_df[feature_cols], test_df["label"]

        rf = RandomForestClassifier(
            n_estimators=N_ESTIMATORS,
            max_features=MAX_FEAT,
            random_state=RS,
        )
        rf.fit(X_train, y_train)
        train_acc = accuracy_score(y_train, rf.predict(X_train))

        if test_df.empty:
            print(f"Inst {inst}: Train acc={train_acc:.3f}, "
                  f"no test rows in [{TEST_START}, {TEST_END}).")
        else:
            y_te = rf.predict(X_test)
            print(f"Inst {inst}: Train acc={train_acc:.3f}, Test acc={accuracy_score(y_test,y_te):.3f}")

            # classification_report needs names that match the classes
            # actually present; skip it when the test labels have one class.
            unique = sorted(y_test.unique())
            if len(unique) > 1:
                names = ["Down" if c==0 else "Up" for c in unique]
                print(classification_report(y_test, y_te, labels=unique, target_names=names))
            else:
                print(f"Only one class present in Test for Inst {inst} (class {unique[0]}), skipping classification_report.")

        # save model
        joblib.dump(rf, MODEL_DIR / f"rf_inst{inst}.joblib")

# ─── PLOTTING ON TEST WINDOW ────────────────────────────────────────────────
def plot_test_predictions():
    """Plot each instrument's saved-model predictions over the test window.

    Top panel: the EMA-smoothed price on the test days
    (TEST_START <= t < TEST_END), shaded green where the model predicts 1
    and red where it predicts 0.  Bottom panel: the raw price over all
    days, shaded by a position series that is +1/-1 on the test days and 0
    everywhere else.  Instruments without any test rows are skipped with a
    message.
    """
    feature_cols = ["slope_fast", "slope_slow", "slope_diff", "vol30", "ret1"]
    df = pd.read_csv(PRICES_PATH, sep=r"\s+", header=None)
    df = df.iloc[:, :NUM_INST]

    for inst in df.columns:
        series = df[inst]
        smooth = series.ewm(alpha=EMA_ALPHA, adjust=False).mean()
        df_feat = calc_features(smooth)

        test_df = df_feat if df_feat.empty else df_feat[
            (df_feat["t"] >= TEST_START) & (df_feat["t"] < TEST_END)
        ]
        if test_df.empty:
            print(f"Inst {inst}: no feature rows in test window "
                  f"t={TEST_START}-{TEST_END}, nothing to plot.")
            continue

        rf = joblib.load(MODEL_DIR / f"rf_inst{inst}.joblib")
        preds = rf.predict(test_df[feature_cols])
        days = test_df["t"].values

        # Position for the full series: +1/-1 on predicted days, 0 (flat)
        # on every other day.  The former forward-fill was dropped: on an
        # integer array initialised with zeros it had nothing to fill.
        prices = series.values
        pos = np.zeros(len(prices), dtype=int)
        pos[days] = np.where(preds == 1, 1, -1)

        # plot
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        ema_lo, ema_hi = smooth.min(), smooth.max()
        ax1.plot(days, smooth.iloc[days], color="black", label="EMA Price")
        ax1.fill_between(days, ema_lo, ema_hi, where=preds==1,
                         facecolor="green", alpha=0.3, label="Predict Up")
        ax1.fill_between(days, ema_lo, ema_hi, where=preds==0,
                         facecolor="red", alpha=0.3, label="Predict Down")
        ax1.set_ylabel("EMA Price")
        ax1.set_title(f"Inst {inst}: Predictions t={TEST_START}-{TEST_END}")
        # The artists carry labels, so show them.
        ax1.legend(loc="upper left")

        all_days = np.arange(len(prices))
        raw_lo, raw_hi = prices.min(), prices.max()
        ax2.plot(all_days, prices, color="black", label="Raw Price")
        ax2.fill_between(all_days, raw_lo, raw_hi, where=pos==1,
                         facecolor="green", alpha=0.3)
        ax2.fill_between(all_days, raw_lo, raw_hi, where=pos==-1,
                         facecolor="red", alpha=0.3)
        ax2.set_ylabel("Raw Price")
        ax2.set_xlabel("Day")
        ax2.legend(loc="upper left")

        plt.tight_layout()
        plt.show()

# ─── MAIN ──────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments while the pipeline runs.
    start = time.perf_counter()
    train_and_save()
    plot_test_predictions()
    print(f"Done in {time.perf_counter()-start:.1f}s")
