# In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected.  Every further PIP is the
    interior sample with the largest score relative to the segment bounded by
    its two neighbouring PIPs; on ties the earliest index wins.

    dist_measure: 1=x-distance from the midpoint of the enclosing segment,
                  2=Perpendicular distance to chord,
                  3=Vertical distance to chord
    Returns sorted list of indices for PIPs within data.  The list is shorter
    than n_pips when the series runs out of interior samples; it is [] for
    n_pips < 2 or empty data and [0] for a single-sample series.
    Raises ValueError when a sample has to be scored with an unknown
    dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # Both endpoints coincide: report the sample once instead of [0, 0].
        return [0]

    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # pips_x is kept sorted, so consecutive entries bound one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            # The chord length is the same for every sample in the segment.
            den = math.hypot(y2 - y1, x2 - x1)
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    # NOTE(review): this favours the sample next to an
                    # endpoint; confirm it is intended rather than the sum of
                    # Euclidean distances to both neighbouring PIPs.
                    dist = abs(i - (x1 + x2) / 2)
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i -
                              (x2 - x1) * data[i] +
                              x2*y1 - y2*x1)
                    dist = num / den if den else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # Strict comparison keeps the earliest sample on ties.
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # No interior samples left anywhere.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally min-max normalised PIP patterns over every sliding window.

    log_data is unpacked as a 2-D array (rows x time steps).  For each row,
    every window of window_size consecutive samples is reduced to its PIPs
    via find_pips; the PIP values are scaled to [0, 1] (all zeros when they
    are constant), rounded to 3 decimals and counted as a tuple key.
    Windows yielding fewer than two PIPs are ignored.
    """
    n_rows, n_cols = log_data.shape
    tally = Counter()
    for row in range(n_rows):
        row_values = log_data[row]
        # Enumerate windows by their start; the last one ends at n_cols.
        for start in range(0, n_cols - window_size + 1):
            segment = row_values[start:start + window_size]
            picked = find_pips(segment, n_pips, dist_measure)
            if len(picked) < 2:
                continue
            pip_vals = segment[picked]
            lo = pip_vals.min()
            hi = pip_vals.max()
            if hi != lo:
                scaled = (pip_vals - lo) / (hi - lo)
            else:
                scaled = np.zeros_like(pip_vals)
            key = tuple(np.round(scaled, 3))
            tally[key] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int):
    """
    Cluster pattern vectors with KMeans (random_state=0).

    Returns a tuple (fitted model, patterns as an ndarray, label per pattern).
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(matrix)
    return model, matrix, assignments


if __name__ == '__main__':
    # Experiment: learn PIP-shape clusters on the first 500 bars, tally how
    # the held-out bars fall into those clusters, plot the matches, and
    # compare against a plain EMA-crossover direction rule.

    # ─── Configuration ───────────────────────────────────────────────────────
    PRICE_FILE       = '../../prices.txt'  # (750 rows × 50 cols)
    WINDOW_SIZE      = 24
    N_PIPS           = 5
    DIST_MEASURE     = 2
    N_CLUSTERS       = 10
    CLUSTER_THRESHOLD= 0.30    # max distance to the centroid for a match

    # ─── Load & pre-process ──────────────────────────────────────────────────
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    log_prices = np.log(df.values.T)       # shape (50, 750)
    n_inst, n_t = log_prices.shape

    n_train    = 500
    train_data = log_prices[:, :n_train]
    test_data  = log_prices[:, n_train:]
    _, n_test  = test_data.shape

    # ─── 1) Train clusters on first 500 timesteps ───────────────────────────
    # KMeans is fitted on the distinct rounded patterns; the frequencies in
    # the Counter are not used as weights.
    train_patterns = extract_pip_patterns(train_data, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    train_vecs     = list(train_patterns.keys())
    kmeans, _, _   = cluster_patterns(train_vecs, N_CLUSTERS)
    centers        = kmeans.cluster_centers_

    # ─── 2) Single pass over test set ────────────────────────────────────────
    cluster_counts_test   = Counter()
    # Plain dict: the values are nested Up/Down tallies, not counts, so a
    # Counter would be misleading here.
    move_counts_test      = {cid: {'Up': 0, 'Down': 0}
                             for cid in range(N_CLUSTERS)}
    cluster_patterns_test = {cid: [] for cid in range(N_CLUSTERS)}
    matched_ends_inst0    = []

    for inst in range(n_inst):
        series = test_data[inst]
        # `end` is exclusive, so it must reach n_test to include the final
        # window, exactly as extract_pip_patterns does on the training data.
        for end in range(WINDOW_SIZE, n_test + 1):
            window   = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue

            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)

            # Nearest centroid, accepted only if it is close enough.
            lbl  = int(kmeans.predict([norm])[0])
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue

            cluster_counts_test[lbl] += 1
            cluster_patterns_test[lbl].append(tuple(np.round(norm, 3)))

            # Direction of the move from the second-to-last PIP to the last.
            move = 'Up' if vals[-1] - vals[-2] > 0 else 'Down'
            move_counts_test[lbl][move] += 1

            if inst == 0:
                matched_ends_inst0.append(end)

    # ─── 3) Print test-set cluster results ──────────────────────────────────
    print("\nTest Set: Cluster ID → Accumulated Frequency")
    for cid, freq in cluster_counts_test.most_common():
        print(f"Cluster {cid}: {freq}")

    print("\nTest Set: Cluster ID → Last-PIP Move Distribution")
    for cid, counts in move_counts_test.items():
        up, down = counts['Up'], counts['Down']
        if up + down > 0:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # ─── 4) Plot each test-set cluster patterns + centroid ──────────────────
    x = np.arange(N_PIPS)
    for cid, pats in cluster_patterns_test.items():
        if not pats:
            continue
        plt.figure(figsize=(6, 4))
        for pat in pats:
            plt.plot(x, pat, alpha=0.1)
        plt.plot(x, centers[cid], color='black', linewidth=2, label=f'Centroid {cid}')
        plt.title(f'Test Set Cluster {cid} (n={cluster_counts_test[cid]})')
        plt.xlabel('PIP Index')
        plt.ylabel('Normalized Value')
        plt.legend()
        plt.tight_layout()
        plt.show()

    # ─── 5) Plot instrument 0 price with shaded pattern windows ─────────────
    times = np.arange(n_train, n_train + n_test)
    prices0 = np.exp(test_data[0])
    plt.figure(figsize=(12, 4))
    plt.plot(times, prices0, label='Instrument 0 Price')
    for end in matched_ends_inst0:
        # Shift window bounds from test-local to global bar numbers.
        start = n_train + end - WINDOW_SIZE
        stop  = n_train + end
        plt.axvspan(start, stop, color='orange', alpha=0.1)
    plt.title('Instrument 0 Test-Set Price with Pattern Matches')
    plt.xlabel('Time Index')
    plt.ylabel('Price')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # ─── 6) EMA crossover rule & prediction vs actual ───────────────────────
    FAST_EMA = 12
    SLOW_EMA = 26

    ema_preds   = []
    ema_actuals = []

    for inst in range(n_inst):
        price_series = pd.Series(np.exp(test_data[inst]),
                                 index=np.arange(n_train, n_train + n_test))
        ema_f = price_series.ewm(span=FAST_EMA, adjust=False).mean()
        ema_s = price_series.ewm(span=SLOW_EMA, adjust=False).mean()

        # Skip the first max(FAST_EMA, SLOW_EMA) test bars as EMA warm-up.
        start_t = max(FAST_EMA, SLOW_EMA) + n_train
        for t in range(start_t, n_train + n_test):
            # Signal from bar t-1, outcome is the move from bar t-1 to bar t.
            pred   = 'Up'   if ema_f.loc[t-1] > ema_s.loc[t-1] else 'Down'
            actual = 'Up'   if price_series.loc[t] > price_series.loc[t-1] else 'Down'
            ema_preds.append(pred)
            ema_actuals.append(actual)

    ema_counts = Counter(zip(ema_preds, ema_actuals))
    print("\nEMA Crossover Predictions vs Actual Moves")
    for (p, a), cnt in ema_counts.items():
        print(f"Pred {p}, Actual {a}: {cnt}")


# In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    Selection starts with both endpoints and repeatedly adds the interior
    sample scoring highest against the segment between its neighbouring
    PIPs (earliest index on ties).  dist_measure picks the score:
    1 = x-distance from the segment midpoint, 2 = perpendicular distance to
    the chord, 3 = vertical distance to the chord.

    Returns sorted list of indices for PIPs within data; it may hold fewer
    than n_pips entries when the series is too short ([] for n_pips < 2 or
    empty data, [0] for one sample).
    Raises ValueError when a sample is scored with an unknown dist_measure.
    """
    size = len(data)
    if n_pips < 2 or size == 0:
        return []
    if size == 1:
        # First and last sample are the same point; do not list it twice.
        return [0]

    pips_x = [0, size - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # Consecutive entries of the sorted pips_x delimit one segment each.
        for x1, x2 in zip(pips_x, pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1+1, x2):
                if dist_measure == 1:
                    dist = abs(i - (x1 + x2)/2)
                elif dist_measure == 2:
                    num = abs((y2 - y1)*i - (x2 - x1)*data[i] + x2*y1 - y2*x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num/den if den else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1)*(i - x1)/(x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # Every segment is already exhausted.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x

def cluster_patterns(patterns: list, n_clusters: int):
    """Fit KMeans (random_state=0) on the pattern vectors and return the model."""
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    return model.fit(matrix)

if __name__ == '__main__':
    # Experiment: cluster windows on their first N_PIPS-1 PIPs, then check
    # whether an EMA crossover predicts the direction of the move into the
    # last PIP of each matched test window.

    # ─── Config ───────────────────────────────────────────────────────────────
    PRICE_FILE       = '../../prices.txt'  # (750 × 50)
    WINDOW_SIZE      = 24
    N_PIPS           = 5                   # total PIPs  → we'll use 4 for clustering
    DIST_MEASURE     = 2
    N_CLUSTERS       = 10
    CLUSTER_THRESHOLD= 0.30
    FAST_EMA, SLOW_EMA = 12, 26

    # ─── Load & prep ─────────────────────────────────────────────────────────
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    logp = np.log(df.values.T)              # shape (50, 750)
    n_inst, n_t = logp.shape

    # train/test split
    n_train    = 500
    train_data = logp[:, :n_train]
    test_data  = logp[:, n_train:]
    _, n_test  = test_data.shape

    # ─── 1) Build cluster centroids on first N_PIPS–1 PIPs ───────────────────
    train_vecs = []
    for inst in range(n_inst):
        series = train_data[inst]
        for t in range(WINDOW_SIZE, n_train):
            window   = series[t-WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < N_PIPS:
                continue
            # use only first N_PIPS-1 indices as features
            feats_idx = pips_idx[:-1]
            vals = window[feats_idx]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx - mn) if mx!=mn else np.zeros_like(vals)
            train_vecs.append(norm)

    kmeans = cluster_patterns(train_vecs, N_CLUSTERS)
    centers= kmeans.cluster_centers_

    # ─── 2) In-test evaluation: cluster on first N_PIPS-1, predict next Nth pip via EMA ─
    ema_confusion = Counter()
    for inst in range(n_inst):
        series = test_data[inst]
        # build a plain price series for EMA
        price_ser = pd.Series(np.exp(series), index=np.arange(n_train, n_train+n_test))
        ema_f = price_ser.ewm(span=FAST_EMA, adjust=False).mean()
        ema_s = price_ser.ewm(span=SLOW_EMA, adjust=False).mean()

        # slide windows
        for end in range(WINDOW_SIZE, n_test):
            window   = series[end-WINDOW_SIZE : end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < N_PIPS:
                continue

            # features = first N_PIPS-1 PIPs
            # NOTE(review): find_pips picks interior PIPs relative to the
            # window's last bar, so these features are not fully independent
            # of the target bar — confirm this is acceptable.
            feats_idx = pips_idx[:-1]
            feat_vals = window[feats_idx]
            mn, mx    = feat_vals.min(), feat_vals.max()
            feat_norm = (feat_vals - mn)/(mx - mn) if mx!=mn else np.zeros_like(feat_vals)

            # assign cluster
            lbl = kmeans.predict([feat_norm])[0]
            dist= np.linalg.norm(feat_norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue

            # EMA-based prediction.  The target (last PIP) is always the
            # window's final bar, i.e. global bar n_train+end-1.  Reading the
            # EMAs at that same bar would let the signal see the price it is
            # supposed to predict, so take them at the bar of the last
            # feature PIP instead.
            t_signal = n_train + (end - WINDOW_SIZE) + feats_idx[-1]
            ema_pred = 'Up' if ema_f.loc[t_signal] > ema_s.loc[t_signal] else 'Down'

            # actual move of the Nth pip inside the window
            prev_val = window[feats_idx[-1]]
            next_val = window[pips_idx[-1]]
            actual   = 'Up' if next_val > prev_val else 'Down'

            print(f"Cluster {lbl}, EMA_pred {ema_pred}, Actual {actual}")
            ema_confusion[(ema_pred, actual)] += 1

    # ─── 3) Print EMA confusion matrix ─────────────────────────────────────────
    print("\nEMA Crossover vs Next-PIP Move")
    total = sum(ema_confusion.values())
    for (pred, act), cnt in ema_confusion.items():
        print(f"Pred {pred}, Actual {act}: {cnt}")
    if total:
        hits = ema_confusion[('Up','Up')] + ema_confusion[('Down','Down')]
        print(f"Overall accuracy: {hits / total:.2%}")
    else:
        # No window passed the cluster threshold; avoid dividing by zero.
        print("Overall accuracy: n/a (no windows matched a cluster)")


# In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    Both endpoints are taken first; each further PIP is the interior sample
    with the highest score against the segment between its neighbouring PIPs
    (earliest index on ties).  dist_measure 1 scores by x-distance from the
    segment midpoint, 2 by perpendicular distance to the chord, and any
    other value by vertical distance to the chord.

    Returns the sorted PIP indices.  Fewer than n_pips are returned when the
    series has no interior samples left; [] for n_pips < 2 or empty data and
    [0] for a single sample.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # Avoid reporting the only sample twice as both endpoints.
        return [0]

    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist, max_idx = -1.0, None
        # pips_x stays sorted, so neighbours in the list bound a segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    dist = abs(i - (x1 + x2) / 2)
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den else 0
                else:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                if dist > max_dist:
                    max_dist, max_idx = dist, i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def compute_indicators(price: pd.Series) -> pd.DataFrame:
    """
    Build a wide DataFrame of technical indicators from a close-price series.

    Columns (all indexed like ``price``): SMA_*, EMA_*, PR_MA_*, MA_DIFF_*,
    RSI_*, SLOPE_*, ROC_*, STD_*, Z_*, PCTL_*, STREAK, MACD, MACD_SIG,
    MACD_HIST, PIVOT_H_*/PIVOT_L_* and BRK_H_*/BRK_L_*.  The longest
    lookback is 100 bars.

    Remaining NaNs are back-filled, then forward-filled, then set to 0, so
    the result contains no NaN.  Note that back-filling copies later values
    into the warm-up rows of a column.
    """
    df = pd.DataFrame(index=price.index)
    # ── SMAs & EMAs ─────────────────────
    sma_windows = [10, 12, 20, 25, 26, 50, 100]
    ema_windows = [12, 26, 50]
    for w in sma_windows:
        df[f"SMA_{w}"] = price.rolling(w, min_periods=1).mean()
    for w in ema_windows:
        df[f"EMA_{w}"] = price.ewm(span=w, adjust=False).mean()

    # ── price–MA diffs & MA–MA diffs ────
    for w in [12, 26, 50, 100]:
        df[f"PR_MA_{w}"] = price - df[f"SMA_{w}"]
    df["MA_DIFF_25_100"] = df["SMA_25"] - df["SMA_100"]
    df["MA_DIFF_12_26"]  = df["SMA_12"] - df["SMA_26"]

    # ── RSI ─────────────────────────────
    # Gains/losses do not depend on the window, so compute them once.
    delta = price.diff()
    up    = delta.clip(lower=0)
    down  = -delta.clip(upper=0)
    for w in [6, 9, 14, 21]:
        ma_up   = up.rolling(w, min_periods=1).mean()
        ma_down = down.rolling(w, min_periods=1).mean()
        rs  = ma_up / ma_down.replace(0, np.nan)
        rsi = 100 - (100/(1+rs))
        # Gains with no losses means maximum strength (RSI = 100).  Without
        # this the zero-loss rows stay NaN and are overwritten by the NaN
        # fill at the end.
        df[f"RSI_{w}"] = rsi.mask((ma_down == 0) & (ma_up > 0), 100.0)

    # ── SLOPE ───────────────────────────
    def _slope(y):
        # Least-squares slope of y against 0..len(y)-1.
        t = np.arange(len(y))
        m, _ = np.linalg.lstsq(np.vstack([t, np.ones_like(t)]).T, y, rcond=None)[0]
        return m
    for w in [10, 50, 75, 100]:
        df[f"SLOPE_{w}"] = price.rolling(w, min_periods=w).apply(_slope, raw=True)

    # ── ROC, STD, ZSCORE, PCTL ──────────
    for w in [5, 10, 20, 50]:
        df[f"ROC_{w}"] = price.pct_change(w)
    for w in [10, 20, 50]:
        df[f"STD_{w}"] = price.rolling(w, min_periods=1).std()
    for w in [20, 50]:
        mu = price.rolling(w, min_periods=1).mean()
        sd = price.rolling(w, min_periods=1).std().replace(0, np.nan)
        df[f"Z_{w}"] = (price - mu) / sd
    for w in [10, 20, 50]:
        # Position of the latest price inside the window's min-max range.
        df[f"PCTL_{w}"] = price.rolling(w, min_periods=1) \
                         .apply(lambda x: (x.iloc[-1]-x.min())/(x.max()-x.min())
                                if x.max()!=x.min() else 0, raw=False)

    # ── UP/DOWN STREAK ───────────────────
    # Signed run length: extends while the move direction repeats, otherwise
    # restarts at +1 / -1 (or 0 on an unchanged or undefined move).
    dif = price.diff()
    streak = [0]
    for i in range(1, len(price)):
        if dif.iat[i] > 0 and dif.iat[i-1] > 0:
            streak.append(streak[-1]+1)
        elif dif.iat[i] < 0 and dif.iat[i-1] < 0:
            streak.append(streak[-1]-1)
        else:
            streak.append(1 if dif.iat[i]>0 else -1 if dif.iat[i]<0 else 0)
    df["STREAK"] = streak

    # ── MACD ────────────────────────────
    macd_line   = df["EMA_12"] - df["EMA_26"]
    macd_signal = macd_line.ewm(span=9, adjust=False).mean()
    df["MACD"]      = macd_line
    df["MACD_SIG"]  = macd_signal
    df["MACD_HIST"] = macd_line - macd_signal

    # ── PIVOTS & BROKEN PIVOTS ─────────
    for w in [6, 10, 50, 100]:
        # shift(1) excludes the current bar from its own pivot.
        df[f"PIVOT_H_{w}"] = price.rolling(w, min_periods=1).max().shift(1)
        df[f"PIVOT_L_{w}"] = price.rolling(w, min_periods=1).min().shift(1)
        df[f"BRK_H_{w}"]   = (price > df[f"PIVOT_H_{w}"]).astype(int)
        df[f"BRK_L_{w}"]   = (price < df[f"PIVOT_L_{w}"]).astype(int)

    # ── CLEANUP ─────────────────────────
    # Backfill/forward-fill and finally fill zeros for any stragglers.
    # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    df = df.bfill().ffill().fillna(0)
    return df


if __name__ == '__main__':
    # Experiment: cluster PIP shapes on the training bars, fit one small
    # decision tree per cluster on technical indicators, and score next-bar
    # direction calls on the held-out bars.

    # ─── Config ───────────────────────────────────────────────────────────────
    PRICE_FILE        = '../../prices.txt'
    WINDOW_SIZE       = 24
    N_PIPS            = 5
    DIST_MEASURE      = 2
    N_CLUSTERS        = 10
    CLUSTER_THRESHOLD = 0.30
    FAST_EMA, SLOW_EMA= 12, 26     # not used below; kept so the config matches the other cells
    MIN_SAMPLES_LEAF  = 10
    MAX_TREE_DEPTH    = 2

    # ─── Load raw close data ─────────────────────────────────────────────────
    raw = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    raw.columns = [f"I{i}" for i in raw.columns]
    T, n_inst = raw.shape      # T=750, n_inst=50
    price_df   = raw.copy()

    # ─── Precompute all indicators per instrument ────────────────────────────
    print("🔧 Computing technical indicators…")
    feats_by_inst = {}
    for inst in price_df.columns:
        feats_by_inst[inst] = compute_indicators(price_df[inst])
    # Every instrument goes through the same function, so any frame gives
    # the feature names.
    feature_names = list(next(iter(feats_by_inst.values())).columns)
    print(" → done.\n")

    # ─── Split into train (0–499) & test (500–749) ──────────────────────────
    n_train   = 500
    n_test    = T - n_train     # was referenced below without being defined
    train_df  = np.log(price_df.iloc[:n_train].values).T    # shape (50,500)
    test_df   = np.log(price_df.iloc[n_train:].values).T    # shape (50,250)

    # ─── 1) Build PIP‐vectors & cluster on train set ─────────────────────────
    print("🔧 Extracting PIP‐patterns from training set…")
    train_vecs = []
    for inst_idx, inst in enumerate(price_df.columns):
        series = train_df[inst_idx]
        for t in range(WINDOW_SIZE, n_train):
            win = series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue
            vals = win[pips[:-1]]   # first N_PIPS-1 points
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            train_vecs.append(norm)
    print(f" → collected {len(train_vecs)} windows\n")

    print("🔧 Fitting KMeans…")
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
    kmeans.fit(train_vecs)
    centers = kmeans.cluster_centers_

    # quick bar‐chart of cluster sizes
    ct = Counter(kmeans.labels_)
    plt.figure(figsize=(5,3))
    plt.bar(ct.keys(), ct.values(), color='C0', alpha=0.7)
    plt.title("Train‐set PIP‐windows per cluster")
    plt.xlabel("Cluster"); plt.ylabel("Count")
    plt.tight_layout(); plt.show()

    # ─── 2) Gather per‐cluster training rows for Decision Trees ──────────────
    print("\n🔧 Building per‐cluster training datasets…")
    X_by_cluster = defaultdict(list)
    y_by_cluster = defaultdict(list)

    for inst_idx, inst_col in enumerate(price_df.columns):
        log_series = train_df[inst_idx]
        # Stop one bar early so that bar t (the label) is still in training.
        for t in range(WINDOW_SIZE, n_train-1):
            win = log_series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue

            # 1) cluster
            vals = win[pips[:-1]]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            lbl    = int(kmeans.predict([norm])[0])
            if np.linalg.norm(norm - centers[lbl]) > CLUSTER_THRESHOLD:
                continue

            # 2) features = precomputed at t-1
            feats = feats_by_inst[inst_col].iloc[t-1].values

            # 3) label = next‐bar Up/Down
            actual = 1 if log_series[t] > log_series[t-1] else 0

            X_by_cluster[lbl].append(feats)
            y_by_cluster[lbl].append(actual)

    # report cluster sizes
    for cid in range(N_CLUSTERS):
        print(f" Cluster {cid:>2d}: {len(y_by_cluster[cid])} samples")

    # ─── 3) Train a tiny Decision Tree per cluster ──────────────────────────
    print("\n🔧 Training Decision Trees per cluster…")
    models = {}
    for cid in sorted(X_by_cluster):
        Xs = X_by_cluster[cid]
        Ys = y_by_cluster[cid]
        if len(Ys) < MIN_SAMPLES_LEAF*2:
            print(f"  · skipping cluster {cid} (only {len(Ys)} samples)")
            continue
        tree = DecisionTreeClassifier(
            max_depth=MAX_TREE_DEPTH,
            min_samples_leaf=MIN_SAMPLES_LEAF,
            random_state=0
        )
        tree.fit(Xs, Ys)
        models[cid] = tree
        print(f"  · cluster {cid}: trained tree ({len(Ys)} samples)")
        print(export_text(tree, feature_names=feature_names))

    # ─── 4) Inference on the 250‐bar test set ───────────────────────────────
    print("\n🔧 Running test‐set inference…")
    y_true, y_pred = [], []

    for inst_idx, inst_col in enumerate(price_df.columns):
        log_series = test_df[inst_idx]
        for t in range(WINDOW_SIZE, n_test-1):
            win = log_series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue

            vals = win[pips[:-1]]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            lbl    = int(kmeans.predict([norm])[0])
            if (np.linalg.norm(norm - centers[lbl]) > CLUSTER_THRESHOLD
                    or lbl not in models):
                continue

            # Indicator frames cover the whole series, so shift the
            # test-local index by n_train.
            feats = feats_by_inst[inst_col].iloc[n_train + t - 1].values
            pred  = models[lbl].predict([feats])[0]
            actual= 1 if log_series[t] > log_series[t-1] else 0

            y_pred.append(pred)
            y_true.append(actual)
            print(f"[{inst_col} @ t={n_train+t}] Cluster {lbl} → "
                  f"Pred={'Up' if pred else 'Down'} / "
                  f"Actual={'Up' if actual else 'Down'}")

    # ─── 5) Summarize performance ───────────────────────────────────────────
    print("\n🎯 Final Test‐set Confusion Matrix")
    if y_true:
        # Fix the label set so the matrix is 2×2 even when only one class
        # shows up among the calls.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(cm, display_labels=["Down","Up"])
        disp.plot(cmap="Blues")
        plt.show()

        acc = np.mean(np.array(y_true) == np.array(y_pred))
        print(f"Overall accuracy: {acc:.2%} ({len(y_true)} calls)")
    else:
        print("No test window matched a trained cluster; nothing to score.")


# In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    Starts from the two endpoints and greedily adds the interior sample that
    scores highest against the segment between its neighbouring PIPs; ties
    resolve to the earliest index.  Scores: dist_measure 1 = x-distance from
    the segment midpoint, 2 = perpendicular distance to the chord, anything
    else = vertical distance to the chord.

    Returns the PIP indices in ascending order.  The result has fewer than
    n_pips entries when the series is too short: [] for n_pips < 2 or empty
    data, [0] for a one-sample series.
    """
    length = len(data)
    if n_pips < 2 or length == 0:
        return []
    if length == 1:
        # A lone sample is both endpoints; list it once.
        return [0]

    pips_x = [0, length - 1]
    for _ in range(2, n_pips):
        max_dist, max_idx = -1.0, None
        # The list is kept sorted: each adjacent pair is one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    dist = abs(i - (x1 + x2) / 2)
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i
                              - (x2 - x1) * data[i]
                              + x2*y1 - y2*x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den else 0
                else:
                    # vertical distance to the chord
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                if dist > max_dist:
                    max_dist, max_idx = dist, i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def compute_indicators(price: pd.Series) -> pd.DataFrame:
    """
    Build a wide DataFrame of technical indicators from a close-price series.

    Produces moving averages (SMA_*, EMA_*), price/MA spreads (PR_MA_*,
    MA_DIFF_*), RSI_*, rolling OLS SLOPE_*, ROC_*, STD_*, Z_*, PCTL_*,
    STREAK, MACD/MACD_SIG/MACD_HIST, and rolling pivots with break flags
    (PIVOT_H_*, PIVOT_L_*, BRK_H_*, BRK_L_*).  No lookback exceeds 100 bars.

    NaNs are back-filled, then forward-filled, then zeroed, so the returned
    frame is NaN-free.  Back-filling copies later values into the warm-up
    rows of a column.
    """
    df = pd.DataFrame(index=price.index)
    # 1) SMAs & EMAs
    sma_ws = [10, 12, 20, 25, 26, 50, 100]
    ema_ws = [12, 26, 50]
    for w in sma_ws:
        df[f"SMA_{w}"] = price.rolling(w, min_periods=1).mean()
    for w in ema_ws:
        df[f"EMA_{w}"] = price.ewm(span=w, adjust=False).mean()

    # 2) Price–MA & MA–MA diffs
    for w in [12, 26, 50, 100]:
        df[f"PR_MA_{w}"] = price - df[f"SMA_{w}"]
    df["MA_DIFF_25_100"] = df["SMA_25"] - df["SMA_100"]
    df["MA_DIFF_12_26"]  = df["SMA_12"] - df["SMA_26"]

    # 3) RSI
    # Per-bar gains and losses are window-independent; compute them once.
    delta  = price.diff()
    gains  = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    for w in [6, 9, 14, 21]:
        up    = gains.rolling(w, min_periods=1).mean()
        down  = losses.rolling(w, min_periods=1).mean()
        rs    = up / down.replace(0, np.nan)
        rsi   = 100 - 100 / (1 + rs)
        # A window with gains and zero losses is RSI = 100; otherwise the
        # NaN from the guarded division would be replaced by the fill below.
        df[f"RSI_{w}"] = rsi.mask((down == 0) & (up > 0), 100.0)

    # 4) SLOPE (OLS) for windows up to 100
    def _slope(arr):
        # Least-squares slope of arr against 0..len(arr)-1.
        t = np.arange(len(arr))
        m, _ = np.linalg.lstsq(np.vstack([t, np.ones_like(t)]).T,
                               arr, rcond=None)[0]
        return m
    for w in [10, 50, 75, 100]:
        df[f"SLOPE_{w}"] = price.rolling(w, min_periods=w).apply(
            _slope, raw=True)

    # 5) ROC, STD, Z, PCTL
    for w in [5, 10, 20, 50]:
        df[f"ROC_{w}"] = price.pct_change(w)
    for w in [10, 20, 50]:
        df[f"STD_{w}"] = price.rolling(w, min_periods=1).std()
    for w in [20, 50]:
        mu = price.rolling(w, min_periods=1).mean()
        sd = price.rolling(w, min_periods=1).std().replace(0, np.nan)
        df[f"Z_{w}"] = (price - mu) / sd
    for w in [10, 20, 50]:
        # Where the latest price sits inside the window's min-max range.
        df[f"PCTL_{w}"] = price.rolling(w, min_periods=1) \
            .apply(lambda x: (x.iloc[-1] - x.min()) /
                             (x.max() - x.min()) if x.max()!=x.min() else 0,
                   raw=False)

    # 6) Up/Down streak
    # Signed run length of consecutive same-direction moves; restarts at
    # +1 / -1 on a direction change and is 0 on a flat or undefined move.
    dif = price.diff()
    streak = [0]
    for i in range(1, len(price)):
        if dif.iat[i] > 0 and dif.iat[i-1] > 0:
            streak.append(streak[-1] + 1)
        elif dif.iat[i] < 0 and dif.iat[i-1] < 0:
            streak.append(streak[-1] - 1)
        else:
            streak.append(1 if dif.iat[i] > 0 else -1 if dif.iat[i] < 0 else 0)
    df["STREAK"] = streak

    # 7) MACD
    macd_line   = df["EMA_12"] - df["EMA_26"]
    macd_sig    = macd_line.ewm(span=9, adjust=False).mean()
    df["MACD"]      = macd_line
    df["MACD_SIG"]  = macd_sig
    df["MACD_HIST"] = macd_line - macd_sig

    # 8) Pivots + broken pivot flags
    for w in [6, 10, 50, 100]:
        # shift(1) keeps the current bar out of its own pivot level.
        df[f"PIVOT_H_{w}"] = price.rolling(w, min_periods=1).max().shift(1)
        df[f"PIVOT_L_{w}"] = price.rolling(w, min_periods=1).min().shift(1)
        df[f"BRK_H_{w}"]   = (price > df[f"PIVOT_H_{w}"]).astype(int)
        df[f"BRK_L_{w}"]   = (price < df[f"PIVOT_L_{w}"]).astype(int)

    # Fill NaNs (bfill()/ffill() replace the deprecated fillna(method=...)).
    df = df.bfill().ffill().fillna(0)
    return df


if __name__ == '__main__':
    # Experiment: one global gradient-boosting model predicting next-bar
    # direction from technical indicators plus the distances of the current
    # PIP shape to every KMeans centroid.

    # ─── Parameters ──────────────────────────────────────────────────────────
    PRICE_FILE        = '../../prices.txt'
    WINDOW_SIZE       = 24
    N_PIPS            = 5
    DIST_MEASURE      = 2
    N_CLUSTERS        = 10
    CLUSTER_THRESHOLD = 0.30      # not used below: all windows are kept
    FAST_EMA, SLOW_EMA= 12, 26    # not used below
    GBDT_PARAMS       = {
        "n_estimators": 200,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.7,
        "random_state": 0
    }

    # ─── Load closes & compute indicators ────────────────────────────────────
    raw = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    raw.columns = [f"I{i}" for i in raw.columns]
    price_df = raw.copy()                     # shape (750, n_inst)
    print("Computed indicators… this may take 10–20s")
    feats_by_inst = {
        inst: compute_indicators(price_df[inst])
        for inst in price_df.columns
    }
    # All frames come from the same function, so any of them names the
    # indicator columns.
    indicator_names = list(next(iter(feats_by_inst.values())).columns)
    print("→ Done.\n")

    # ─── Log‐prices & train/test split ───────────────────────────────────────
    log_vals = np.log(price_df.values)
    n_total, n_inst = log_vals.shape
    n_train = 500
    train_vals = log_vals[:n_train]           # shape (500, inst)
    test_vals  = log_vals[n_train:]           # shape (250, inst)

    # transpose for easier indexing: shape → (inst, time)
    train_T = train_vals.T
    test_T  = test_vals.T

    # ─── 1) PIP clustering on train windows ─────────────────────────────────
    print("Clustering PIP-shapes on first 500 bars…")
    train_patterns = []
    for ii in range(n_inst):
        series = train_T[ii]
        for t in range(WINDOW_SIZE, n_train):
            win  = series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue
            # Shape = first N_PIPS-1 PIP values, min-max scaled.
            vals = win[pips[:-1]]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            train_patterns.append(norm)

    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
    kmeans.fit(train_patterns)
    centers = kmeans.cluster_centers_
    print("→ KMeans fitted on", len(train_patterns), "patterns\n")

    # plot cluster‐sizes
    ct = Counter(kmeans.labels_)
    plt.figure(figsize=(5,3))
    plt.bar(ct.keys(), ct.values(), alpha=0.7)
    plt.title("Train-set windows per cluster")
    plt.xlabel("Cluster"); plt.ylabel("Count")
    plt.tight_layout(); plt.show()

    # ─── 2) Build global train set (features + distances → next-bar label) ──
    print("Building global training set…")
    X_train, y_train = [], []
    for ii, inst in enumerate(price_df.columns):
        series = train_T[ii]
        feats_df = feats_by_inst[inst]
        # Stop one bar early so the label bar t is still a training bar.
        for t in range(WINDOW_SIZE, n_train-1):
            win  = series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue

            # 1) cluster distances
            vals = win[pips[:-1]]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            dists  = np.linalg.norm(centers - norm, axis=1)

            # 2) indicators at the last in-window bar = t-1
            feats = feats_df.iloc[t-1].values

            # 3) label = next-bar up/down
            label = 1 if series[t] > series[t-1] else 0

            # 4) build row
            X_train.append(np.concatenate([feats, dists]))
            y_train.append(label)

    X_train = np.vstack(X_train)
    y_train = np.array(y_train)
    print(f"→ Training set size: {X_train.shape[0]} rows, {X_train.shape[1]} features\n")

    # ─── 3) Fit a global GradientBoostingClassifier ─────────────────────────
    print("Training GradientBoostingClassifier on global features…")
    model = GradientBoostingClassifier(**GBDT_PARAMS)
    model.fit(X_train, y_train)
    print("→ Done.\n")

    # feature importances
    fi = model.feature_importances_
    # Column order matches the rows built above: indicators first, then one
    # distance per centroid.
    feat_names = indicator_names + [f"DIST_C{c}" for c in range(N_CLUSTERS)]
    imp_idx = np.argsort(fi)[::-1][:20]  # top 20
    plt.figure(figsize=(6,4))
    plt.bar([feat_names[i] for i in imp_idx], fi[imp_idx], color='C1')
    plt.xticks(rotation=60, ha='right')
    plt.title("Top-20 Feature Importances")
    plt.tight_layout(); plt.show()

    # ─── 4) Build and evaluate on test set ──────────────────────────────────
    print("Building test set & running inference…")
    X_test, y_test = [], []
    for ii, inst in enumerate(price_df.columns):
        series = test_T[ii]
        feats_df = feats_by_inst[inst]
        for t in range(WINDOW_SIZE, n_total-n_train-1):
            win  = series[t-WINDOW_SIZE:t]
            pips = find_pips(win, N_PIPS, DIST_MEASURE)
            if len(pips) < N_PIPS:
                continue

            vals = win[pips[:-1]]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)
            dists  = np.linalg.norm(centers - norm, axis=1)

            # Indicator frames span the whole series: shift by n_train.
            feats = feats_df.iloc[n_train + t -1].values
            label = 1 if series[t] > series[t-1] else 0

            X_test.append(np.concatenate([feats, dists]))
            y_test.append(label)

    X_test = np.vstack(X_test)
    y_test = np.array(y_test)
    print(f"→ Test set size: {X_test.shape[0]} rows\n")

    print("Running test predictions…")
    y_pred = model.predict(X_test)

    # confusion matrix
    # Pin the label set so the matrix stays 2×2 (and matches the two display
    # labels) even if only one class occurs.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1], normalize=None)
    disp = ConfusionMatrixDisplay(cm, display_labels=["Down","Up"])
    disp.plot(cmap="Blues")
    plt.title("GBDT Global Model: Confusion Matrix")
    plt.show()

    acc = (y_pred == y_test).mean()
    print(f"Overall test accuracy: {acc:.2%}")


# In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    Starting from the two endpoints, the interior point lying farthest from
    the chord between its two neighbouring PIPs is added repeatedly until
    ``n_pips`` points are selected or no interior point is left.

    Args:
        data: 1D sequence of values, indexable by integer position.
        n_pips: Number of PIPs requested, endpoints included.
        dist_measure: 1 = horizontal offset from the midpoint of the two
            neighbouring PIPs, 2 = perpendicular distance to the chord,
            3 = vertical distance to the chord.

    Returns:
        Sorted positions within ``data``. The list is shorter than
        ``n_pips`` when the series has too few points and empty when
        ``n_pips < 2`` or ``data`` is empty.

    Raises:
        ValueError: If ``dist_measure`` is not 1, 2 or 3.
    """
    # Reject unknown measures instead of silently treating them as "vertical".
    if dist_measure not in (1, 2, 3):
        raise ValueError(f"Unknown dist_measure {dist_measure}")

    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # A single sample has only one distinct endpoint.
        return [0]

    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist, max_idx = -1.0, None
        coords = sorted((x, data[x]) for x in pips_x)
        for (x1, y1), (x2, y2) in zip(coords[:-1], coords[1:]):
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    dist = abs(i - (x1 + x2) / 2)
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den else 0
                else:  # dist_measure == 3, validated above
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                # Strict '>' keeps the earliest candidate on ties.
                if dist > max_dist:
                    max_dist, max_idx = dist, i
        if max_idx is None:
            # Every position is already a PIP; nothing left to add.
            break
        pips_x.append(max_idx)
    return sorted(pips_x)


def compute_indicators(price: pd.Series) -> pd.DataFrame:
    """
    Build technical indicators from a close-only price series.

    Columns are appended in a fixed order (SMA, EMA, price/MA diffs, RSI,
    OLS slope, ROC, STD, Z-score, percentile rank, streak, MACD, pivots and
    pivot breaks); callers that consume ``.values`` rely on that order.

    Remaining NaNs are back-filled, then forward-filled, then set to 0.
    Note that the back-fill copies the first valid value of a column into
    the rows before it, so those leading rows borrow later information.

    Args:
        price: Price series; the result shares its index.

    Returns:
        DataFrame of indicators with no NaN values.
    """
    df = pd.DataFrame(index=price.index)

    # 1) SMAs & EMAs
    for w in (10, 12, 20, 25, 26, 50, 100):
        df[f"SMA_{w}"] = price.rolling(w, min_periods=1).mean()
    for w in (12, 26, 50):
        df[f"EMA_{w}"] = price.ewm(span=w, adjust=False).mean()

    # 2) Price-MA and MA-MA diffs
    for w in (12, 26, 50, 100):
        df[f"PR_MA_{w}"] = price - df[f"SMA_{w}"]
    df["MA_DIFF_25_100"] = df["SMA_25"] - df["SMA_100"]
    df["MA_DIFF_12_26"] = df["SMA_12"] - df["SMA_26"]

    # 3) RSI (gains/losses are window-independent, so compute them once)
    delta = price.diff()
    gains = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    for w in (6, 9, 14, 21):
        up = gains.rolling(w, min_periods=1).mean()
        down = losses.rolling(w, min_periods=1).mean()
        rs = up / down.replace(0, np.nan)
        rsi = 100 - 100 / (1 + rs)
        # With gains but no losses RS is unbounded and RSI saturates at 100;
        # without this the value would be NaN and later filled arbitrarily.
        df[f"RSI_{w}"] = rsi.mask((down == 0) & (up > 0), 100.0)

    # 4) OLS slope of price against bar number over a full window
    def _slope(arr):
        t = np.arange(len(arr))
        m, _ = np.linalg.lstsq(np.vstack([t, np.ones_like(t)]).T, arr,
                               rcond=None)[0]
        return m
    for w in (10, 50, 75, 100):
        df[f"SLOPE_{w}"] = price.rolling(w, min_periods=w).apply(_slope, raw=True)

    # 5) ROC, STD, Z, PCTL
    for w in (5, 10, 20, 50):
        df[f"ROC_{w}"] = price.pct_change(w)
    for w in (10, 20, 50):
        df[f"STD_{w}"] = price.rolling(w, min_periods=1).std()
    for w in (20, 50):
        mu = price.rolling(w, min_periods=1).mean()
        sd = price.rolling(w, min_periods=1).std().replace(0, np.nan)
        df[f"Z_{w}"] = (price - mu) / sd
    for w in (10, 20, 50):
        # Position of the current price inside the window's [min, max] range;
        # 0 when the window is flat. Vectorised form of the per-window lambda.
        window = price.rolling(w, min_periods=1)
        lo, hi = window.min(), window.max()
        span = hi - lo
        df[f"PCTL_{w}"] = ((price - lo) / span).where(span != 0, 0)

    # 6) Up/Down streak: signed count of consecutive same-direction moves
    dif = price.diff().to_numpy()
    streak = [0] if len(dif) else []
    for prev, cur in zip(dif[:-1], dif[1:]):
        if cur > 0 and prev > 0:
            streak.append(streak[-1] + 1)
        elif cur < 0 and prev < 0:
            streak.append(streak[-1] - 1)
        else:
            streak.append(1 if cur > 0 else -1 if cur < 0 else 0)
    df["STREAK"] = streak

    # 7) MACD
    macd_line = df["EMA_12"] - df["EMA_26"]
    macd_sig = macd_line.ewm(span=9, adjust=False).mean()
    df["MACD"] = macd_line
    df["MACD_SIG"] = macd_sig
    df["MACD_HIST"] = macd_line - macd_sig

    # 8) Pivots & broken pivots (shift(1) excludes the current bar)
    for w in (6, 10, 50, 100):
        df[f"PIVOT_H_{w}"] = price.rolling(w, min_periods=1).max().shift(1)
        df[f"PIVOT_L_{w}"] = price.rolling(w, min_periods=1).min().shift(1)
        df[f"BRK_H_{w}"] = (price > df[f"PIVOT_H_{w}"]).astype(int)
        df[f"BRK_L_{w}"] = (price < df[f"PIVOT_L_{w}"]).astype(int)

    # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    return df.bfill().ffill().fillna(0)


if __name__ == '__main__':
    # Pipeline: cluster normalised PIP patterns of training windows, train one
    # gradient-boosting classifier per cluster on indicator features to
    # predict whether the next bar is up, then evaluate on the held-out bars.

    # ─── Parameters ──────────────────────────────────────────────────────────
    PRICE_FILE        = '../../prices.txt'
    WINDOW_SIZE       = 36
    N_PIPS            = 7
    DIST_MEASURE      = 2
    N_CLUSTERS        = 12
    CLUSTER_THRESHOLD = 0.30
    GBDT_PARAMS = {
        "n_estimators": 200,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.7,
        "random_state": 0
    }
    MIN_SAMPLES = 50   # minimum training rows per cluster to build a model

    def _window_pattern(win):
        """Min-max normalised values at the first N_PIPS-1 PIPs of `win`.

        Returns None when fewer than N_PIPS PIPs could be extracted; a flat
        pattern is mapped to all zeros.
        """
        pips = find_pips(win, N_PIPS, DIST_MEASURE)
        if len(pips) < N_PIPS:
            return None
        vals = win[pips[:-1]]
        mn, mx = vals.min(), vals.max()
        return (vals-mn)/(mx-mn) if mx!=mn else np.zeros_like(vals)

    # ─── 1) Load data & compute indicators ───────────────────────────────────
    raw = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    raw.columns = [f"I{i}" for i in raw.columns]
    price_df = raw.copy()
    print("💡 Computing indicators for each instrument…")
    feats_by_inst = {inst: compute_indicators(price_df[inst])
                     for inst in price_df.columns}
    print("→ Done.\n")

    # ─── 2) Log‐prices & split train/test ───────────────────────────────────
    log_vals = np.log(price_df.values)
    T, n_inst = log_vals.shape
    n_train = 500
    train_T = log_vals[:n_train].T   # shape (n_inst, n_train)
    test_T  = log_vals[n_train:].T   # shape (n_inst, T - n_train)

    # ─── 3) Cluster PIP‐patterns on training set ───────────────────────────
    train_patterns = []
    for ii in range(n_inst):
        series = train_T[ii]
        for t in range(WINDOW_SIZE, n_train):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is not None:
                train_patterns.append(norm)

    print(f"🔧 Clustering {len(train_patterns)} training windows into {N_CLUSTERS} clusters…")
    kmeans  = KMeans(n_clusters=N_CLUSTERS, random_state=0)
    kmeans.fit(train_patterns)
    centers = kmeans.cluster_centers_
    print("→ KMeans complete.\n")

    # plot cluster sizes
    ct = Counter(kmeans.labels_)
    plt.figure(figsize=(5,3))
    plt.bar(ct.keys(), ct.values(), alpha=0.7)
    plt.title("Train‐set windows per cluster")
    plt.xlabel("Cluster"); plt.ylabel("Count")
    plt.tight_layout(); plt.show()

    # ─── 4) Build per‐cluster training sets ────────────────────────────────
    X_train_by_c = defaultdict(list)
    y_train_by_c = defaultdict(list)

    print("🔧 Building per-cluster training sets…")
    for ii, inst in enumerate(price_df.columns):
        series = train_T[ii]
        feats_df = feats_by_inst[inst]
        for t in range(WINDOW_SIZE, n_train-1):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is None:
                continue

            # keep only windows close enough to their assigned centroid
            cid = int(kmeans.predict([norm])[0])
            if np.linalg.norm(norm - centers[cid]) > CLUSTER_THRESHOLD:
                continue

            # features at bar t-1, label = direction of bar t vs. bar t-1
            feats = feats_df.iloc[t-1].values
            label = 1 if series[t] > series[t-1] else 0

            X_train_by_c[cid].append(feats)
            y_train_by_c[cid].append(label)

    # report cluster training sizes
    for cid in range(N_CLUSTERS):
        print(f" Cluster {cid:>2d}: {len(y_train_by_c[cid])} samples")
    print()

    # ─── 5) Train one GBDT per cluster ────────────────────────────────────
    cluster_models = {}
    print("🔧 Training one model per cluster…")
    for cid in range(N_CLUSTERS):
        Xs = X_train_by_c[cid]
        Ys = y_train_by_c[cid]
        if len(Ys) < MIN_SAMPLES:
            print(f"  · skipping cluster {cid} ({len(Ys)} < {MIN_SAMPLES})")
            continue
        if len(set(Ys)) < 2:
            # a classifier cannot be fitted on a single class
            print(f"  · skipping cluster {cid} (only one class present)")
            continue
        model = GradientBoostingClassifier(**GBDT_PARAMS)
        model.fit(Xs, Ys)
        cluster_models[cid] = model
        print(f"  · trained cluster {cid} model on {len(Ys)} samples")

    # ─── 6) Inference on test set ─────────────────────────────────────────
    print("\n🔧 Running test‐set inference…")
    y_true, y_pred = [], []
    per_cluster_perf = defaultdict(lambda: [0,0])  # correct, total

    for ii, inst in enumerate(price_df.columns):
        series = test_T[ii]
        feats_df = feats_by_inst[inst]
        for t in range(WINDOW_SIZE, test_T.shape[1]-1):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is None:
                continue

            # cluster assignment; skip clusters without a model and windows
            # too far from their centroid (same filter as in training)
            cid = int(kmeans.predict([norm])[0])
            if (cid not in cluster_models or
                    np.linalg.norm(norm - centers[cid]) > CLUSTER_THRESHOLD):
                continue

            # t indexes the test slice; indicator rows use absolute positions
            feats = feats_df.iloc[n_train + t - 1].values
            pred  = cluster_models[cid].predict([feats])[0]
            actual= 1 if series[t] > series[t-1] else 0

            y_true.append(actual)
            y_pred.append(pred)
            per_cluster_perf[cid][1] += 1
            per_cluster_perf[cid][0] += (pred == actual)

            print(f"[{inst} @ t={n_train + t}] Cluster {cid} → "
                  f"Pred={'Up' if pred else 'Down'} / "
                  f"Actual={'Up' if actual else 'Down'}")

    # ─── 7) Summaries & plots ─────────────────────────────────────────────
    print("\n🎯 Global confusion matrix:")
    if not y_true:
        print("No test windows matched a trained cluster; nothing to evaluate.")
    else:
        # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs,
        # so it always matches the two display labels.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(cm, display_labels=["Down","Up"])
        disp.plot(cmap="Blues")
        plt.show()
        acc = np.mean(np.array(y_true)==np.array(y_pred))
        print(f"Overall accuracy: {acc:.2%} ({len(y_true)} predictions)\n")

        print("Per-cluster accuracy:")
        for cid in sorted(per_cluster_perf):
            corr, tot = per_cluster_perf[cid]
            print(f" Cluster {cid}: {corr}/{tot} = {corr/tot:.2%}")


In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from catboost import CatBoostClassifier


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """Select up to n_pips perceptually important points (PIPs) of a 1D series.

    The two endpoints are always chosen first; each further round adds the
    interior position with the largest distance to the chord spanned by the
    PIPs on either side of it.

    Args:
        data: 1D sequence of values, indexable by integer position.
        n_pips: Number of PIPs requested, endpoints included.
        dist_measure: 1 = horizontal offset from the midpoint of the two
            neighbouring PIPs, 2 = perpendicular distance to the chord,
            3 = vertical distance to the chord.

    Returns:
        Sorted positions within ``data``; fewer than ``n_pips`` when the
        series is too short, and empty when ``n_pips < 2`` or ``data`` is
        empty.

    Raises:
        ValueError: If ``dist_measure`` is not 1, 2 or 3.
    """
    # An unknown measure used to fall through to the vertical distance.
    if dist_measure not in (1, 2, 3):
        raise ValueError(f"Unknown dist_measure {dist_measure}")

    size = len(data)
    if n_pips < 2 or size == 0:
        return []
    if size == 1:
        # Both "endpoints" coincide; report the single position once.
        return [0]

    pips_x = [0, size - 1]
    for _ in range(2, n_pips):
        max_dist, max_idx = -1.0, None
        coords = sorted((x, data[x]) for x in pips_x)
        for (x1, y1), (x2, y2) in zip(coords[:-1], coords[1:]):
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    dist = abs(i - (x1 + x2) / 2)
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den else 0
                else:  # dist_measure == 3, guaranteed by the check above
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                # Ties resolve to the first candidate because '>' is strict.
                if dist > max_dist:
                    max_dist, max_idx = dist, i
        if max_idx is None:
            # No interior positions remain between any pair of PIPs.
            break
        pips_x.append(max_idx)
    return sorted(pips_x)


def compute_indicators(price: pd.Series) -> pd.DataFrame:
    """Build a table of technical indicators from a close-only price series.

    Columns are appended in a fixed order (SMA, EMA, RSI, interleaved
    ROC/STD, Z-score, MACD); callers that read ``.values`` depend on it.

    Remaining NaNs are back-filled, then forward-filled, then set to 0.
    The back-fill copies a column's first valid value into the rows before
    it, so those leading rows borrow later information.

    Args:
        price: Price series; the result shares its index.

    Returns:
        DataFrame of indicators with no NaN values.
    """
    df = pd.DataFrame(index=price.index)

    # SMAs & EMAs
    for w in (10, 20, 50, 100):
        df[f"SMA_{w}"] = price.rolling(w, min_periods=1).mean()
    for w in (12, 26, 50, 200):
        df[f"EMA_{w}"] = price.ewm(span=w, adjust=False).mean()

    # RSI (per-bar gains/losses do not depend on the window; compute once)
    delta = price.diff()
    gains = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    for w in (6, 14, 21):
        up = gains.rolling(w, min_periods=1).mean()
        down = losses.rolling(w, min_periods=1).mean()
        rs = up / down.replace(0, np.nan)
        rsi = 100 - 100 / (1 + rs)
        # Gains without any loss mean RSI = 100; leaving NaN here would let
        # the final fill substitute an unrelated neighbour (or 0).
        df[f"RSI_{w}"] = rsi.mask((down == 0) & (up > 0), 100.0)

    # ROC, STD, Z
    for w in (5, 10, 20, 50):
        df[f"ROC_{w}"] = price.pct_change(w)
        df[f"STD_{w}"] = price.rolling(w, min_periods=1).std()
    for w in (20, 50):
        mu = price.rolling(w, min_periods=1).mean()
        sd = price.rolling(w, min_periods=1).std().replace(0, np.nan)
        df[f"Z_{w}"] = (price - mu) / sd

    # MACD
    df["MACD"] = df["EMA_12"] - df["EMA_26"]
    df["MACD_SIG"] = df["MACD"].ewm(span=9, adjust=False).mean()
    df["MACD_HIST"] = df["MACD"] - df["MACD_SIG"]

    # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    return df.bfill().ffill().fillna(0)

if __name__ == '__main__':
    # Pipeline: cluster normalised PIP patterns of training windows, train one
    # CatBoost classifier per cluster on indicator features to predict the
    # direction of the next bar, then evaluate on the held-out bars.

    # Configuration
    PRICE_FILE = '../../prices.txt'
    WINDOW_SIZE = 24
    N_PIPS = 5
    DIST_MEASURE = 2
    N_CLUSTERS = 10
    CLUSTER_THRESHOLD = 0.3
    MIN_SAMPLES = 100  # minimum training rows per cluster to build a model

    def _window_pattern(win):
        """Min-max normalised values at the first N_PIPS-1 PIPs of `win`.

        Returns None when fewer than N_PIPS PIPs could be extracted; a flat
        pattern is mapped to all zeros.
        """
        pips = find_pips(win, N_PIPS, DIST_MEASURE)
        if len(pips) < N_PIPS:
            return None
        vals = win[pips[:-1]]
        mn, mx = vals.min(), vals.max()
        return (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)

    raw = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    raw.columns = [f"I{c}" for c in raw.columns]
    price_df = raw.copy()

    # Compute indicators
    print("Computing indicators for all instruments...")
    feats_by_inst = {inst: compute_indicators(price_df[inst]) for inst in price_df.columns}
    print("Done computing indicators.\n")

    # Log-transform and split
    log_vals = np.log(price_df.values)
    n_train = 500
    train_T = log_vals[:n_train].T
    test_T = log_vals[n_train:].T
    n_inst = train_T.shape[0]

    # 1) Cluster PIP patterns in training set
    print("Clustering PIP patterns on training data...")
    train_patterns = []
    for i in range(n_inst):
        series = train_T[i]
        for t in range(WINDOW_SIZE, n_train):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is not None:
                train_patterns.append(norm)
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(train_patterns)
    centers = kmeans.cluster_centers_
    print("Done clustering.\n")

    # 2) Build per-cluster training sets
    X_train_by_c = defaultdict(list)
    y_train_by_c = defaultdict(list)
    print("Building per-cluster training datasets...")
    for i, inst in enumerate(price_df.columns):
        series = train_T[i]
        feats_df = feats_by_inst[inst]
        for t in range(WINDOW_SIZE, n_train-1):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is None:
                continue
            # keep only windows close enough to their assigned centroid
            cid = int(kmeans.predict([norm])[0])
            if np.linalg.norm(norm - centers[cid]) > CLUSTER_THRESHOLD:
                continue
            # features at bar t-1, label = direction of bar t vs. bar t-1
            feats = feats_df.iloc[t-1].values
            label = 1 if series[t] > series[t-1] else 0
            X_train_by_c[cid].append(feats)
            y_train_by_c[cid].append(label)
    for cid in range(N_CLUSTERS):
        print(f"Cluster {cid}: {len(y_train_by_c[cid])} training samples")
    print()

    # 3) Train CatBoost per cluster
    print("Training CatBoost models per cluster...")
    cluster_models = {}
    for cid in range(N_CLUSTERS):
        Xs = X_train_by_c[cid]
        Ys = y_train_by_c[cid]
        if len(Ys) < MIN_SAMPLES:
            print(f"Skipping cluster {cid}, only {len(Ys)} samples")
            continue
        if len(set(Ys)) < 2:
            # a classifier cannot be fitted on a single class
            print(f"Skipping cluster {cid}, only one class present")
            continue
        model = CatBoostClassifier(
            iterations=200,
            learning_rate=0.05,
            depth=4,
            loss_function='Logloss',
            verbose=False
        )
        model.fit(Xs, Ys)
        cluster_models[cid] = model
        print(f"Trained CatBoost for cluster {cid} ({len(Ys)} samples)")
    print()

    # 4) Inference on test set
    y_true, y_pred = [], []
    perf_by_c = defaultdict(lambda: [0,0])  # correct, total
    print("Running per-cluster inference on test data...")
    for i, inst in enumerate(price_df.columns):
        series = test_T[i]
        feats_df = feats_by_inst[inst]
        for t in range(WINDOW_SIZE, test_T.shape[1]-1):
            norm = _window_pattern(series[t-WINDOW_SIZE:t])
            if norm is None:
                continue
            cid = int(kmeans.predict([norm])[0])
            # Apply the same centroid-distance filter as in training, so each
            # model is only asked about patterns like those it was fitted on.
            if (cid not in cluster_models or
                    np.linalg.norm(norm - centers[cid]) > CLUSTER_THRESHOLD):
                continue
            # t indexes the test slice; indicator rows use absolute positions
            feats = feats_df.iloc[n_train + t - 1].values
            pred = cluster_models[cid].predict([feats])[0]
            actual = 1 if series[t] > series[t-1] else 0
            y_true.append(actual)
            y_pred.append(pred)
            perf_by_c[cid][1] += 1
            perf_by_c[cid][0] += (pred == actual)
            print(f"Inst {inst} t={n_train+t}, Clust {cid}, Pred={'Up' if pred else 'Down'}, Actual={'Up' if actual else 'Down'}")

    # 5) Global confusion
    print("\nGlobal confusion matrix:")
    if not y_true:
        print("No test windows matched a trained cluster; nothing to evaluate.")
    else:
        # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs,
        # so it always matches the two display labels.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        disp = ConfusionMatrixDisplay(cm, display_labels=["Down","Up"])
        disp.plot(cmap='Blues')
        plt.show()
        acc = np.mean(np.array(y_true) == np.array(y_pred))
        print(f"Overall accuracy: {acc:.2%} ({len(y_true)} predictions)\n")

        print("Per-cluster accuracy:")
        for cid, (corr, tot) in perf_by_c.items():
            print(f"Cluster {cid}: {corr}/{tot} = {corr/tot:.2%}")


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Pick n_pips perceptually important points (PIPs) out of a 1D series.

    The first and last positions seed the selection; every further round
    adds the interior position farthest from the chord between the PIPs on
    either side of it, stopping early once no interior position is left.

    dist_measure: 1=horizontal offset from the midpoint of the neighbours,
                  2=perpendicular distance to chord,
                  3=vertical distance to chord
    Raises ValueError for any other dist_measure as soon as a candidate
    position is evaluated.
    Returns the selected positions within data, in ascending order
    (empty when n_pips < 2).
    """
    if n_pips < 2:
        return []

    selected = [0, len(data) - 1]
    while len(selected) < n_pips:
        best_dist = -1.0
        best_idx = None
        anchors = sorted((pos, data[pos]) for pos in selected)
        for (lx, ly), (rx, ry) in zip(anchors, anchors[1:]):
            for k in range(lx + 1, rx):
                if dist_measure == 1:
                    gap = abs(k - (lx + rx) / 2)
                elif dist_measure == 2:
                    numer = abs((ry - ly) * k -
                                (rx - lx) * data[k] +
                                rx*ly - ry*lx)
                    denom = math.hypot(ry - ly, rx - lx)
                    gap = numer / denom if denom else 0
                elif dist_measure == 3:
                    on_chord = ly + (ry - ly) * (k - lx) / (rx - lx)
                    gap = abs(data[k] - on_chord)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict comparison: the earliest candidate wins ties
                if gap > best_dist:
                    best_dist, best_idx = gap, k
        if best_idx is None:
            # all positions between existing PIPs are used up
            break
        selected.append(best_idx)

    selected.sort()
    return selected


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the min-max normalised PIP pattern of every sliding window.

    log_data is a 2D array with one series per row. Each window of
    window_size consecutive values is reduced to its PIPs, rescaled to
    [0, 1] (all zeros when the PIP values are equal), rounded to 3 decimals
    and counted as a tuple key.
    """
    n_rows, n_cols = log_data.shape
    tally = Counter()
    for series in log_data:
        # start runs over every position where a full window still fits
        for start in range(n_cols - window_size + 1):
            window = series[start:start + window_size]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            lo, hi = vals.min(), vals.max()
            if hi != lo:
                scaled = (vals - lo) / (hi - lo)
            else:
                scaled = np.zeros_like(vals)
            tally[tuple(np.round(scaled, 3))] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int):
    """
    Fit KMeans (random_state=0) on the given pattern vectors.

    Returns the fitted model, the patterns as an array, and the cluster
    label assigned to each pattern.
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(matrix)
    return model, matrix, model.labels_


if __name__ == '__main__':
    # Fit pattern clusters on the training bars, find which test windows fall
    # close to a centroid, then visualise where and how often that happens.

    # ─── Configuration ───────────────────────────────────────────────────────
    PRICE_FILE        = '../../prices.txt'
    WINDOW_SIZE       = 5
    N_PIPS            = 3
    DIST_MEASURE      = 2
    N_CLUSTERS        = 8
    CLUSTER_THRESHOLD = 0.35

    # ─── Load & pre-process ──────────────────────────────────────────────────
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    log_prices = np.log(df.values.T)          # one row per instrument
    n_inst, n_t = log_prices.shape

    n_train    = 500
    train_data, test_data = log_prices[:, :n_train], log_prices[:, n_train:]
    _, n_test  = test_data.shape

    # ─── 1) Train clusters on training data ──────────────────────────────────
    train_patterns = extract_pip_patterns(train_data, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    train_vecs     = list(train_patterns)     # distinct patterns only
    kmeans         = cluster_patterns(train_vecs, N_CLUSTERS)[0]
    centers        = kmeans.cluster_centers_

    # ─── 2) Identify cluster occurrences on test set per instrument ────────
    matched_ends = {inst: {cid: [] for cid in range(N_CLUSTERS)}
                    for inst in range(n_inst)}
    cluster_counts_test   = Counter()
    cluster_patterns_test = {cid: [] for cid in range(N_CLUSTERS)}

    for inst in range(n_inst):
        series = test_data[inst]
        for end in range(WINDOW_SIZE, n_test):
            window   = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue

            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            if mx != mn:
                norm = (vals - mn) / (mx - mn)
            else:
                norm = np.zeros_like(vals)

            # nearest centroid; ignore windows that are too far from it
            lbl  = kmeans.predict([norm])[0]
            dist = np.linalg.norm(norm - centers[lbl])
            if dist <= CLUSTER_THRESHOLD:
                cluster_counts_test[lbl] += 1
                cluster_patterns_test[lbl].append(tuple(np.round(norm, 3)))
                matched_ends[inst][lbl].append(end)

    # ─── 3) Plot price windows grouped by cluster first, then instrument ─────
    x_idx = np.arange(n_train, n_train + n_test)
    for cid in range(N_CLUSTERS):
        for inst in range(n_inst):
            ends = matched_ends[inst][cid]
            if ends:
                plt.figure(figsize=(12, 4))
                prices = np.exp(test_data[inst])
                plt.plot(x_idx, prices, linewidth=1.5, label=f'Instrument {inst} Price')
                for k, end in enumerate(ends):
                    # only the first span carries a legend entry
                    span_label = f'Cluster {cid} Window' if k == 0 else None
                    start = n_train + end - WINDOW_SIZE
                    stop  = n_train + end
                    plt.axvspan(start, stop, alpha=0.3, label=span_label)
                plt.title(f'Cluster {cid} - Instrument {inst} Occurrences (n={len(ends)})')
                plt.xlabel('Time Index')
                plt.ylabel('Price')
                plt.legend(loc='upper left')
                plt.tight_layout()
                plt.show()

    # ─── 4) Print test-set cluster frequencies ───────────────────────────────
    print("\nTest Set: Cluster ID → Accumulated Frequency")
    for cid, freq in cluster_counts_test.most_common():
        print(f"Cluster {cid}: {freq}")

    # ─── 5) Plot each test-set cluster patterns + centroid ─────────────────
    x = np.arange(N_PIPS)
    for cid, pats in cluster_patterns_test.items():
        if pats:
            plt.figure(figsize=(6, 4))
            for pat in pats:
                plt.plot(x, pat, alpha=0.1)
            plt.plot(x, centers[cid], linewidth=2, label=f'Centroid {cid}')
            plt.title(f'Test Set Cluster {cid} (n={cluster_counts_test[cid]})')
            plt.xlabel('PIP Index')
            plt.ylabel('Normalized Value')
            plt.legend()
            plt.tight_layout()
            plt.show()
