In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def find_pips(data: np.array, n_pips: int, dist_measure: int):
    """
    Pick up to n_pips perceptually important points (PIPs) from a 1D series.

    Both end points are always kept. On every pass the sample with the
    largest distance to the chord between its neighbouring PIPs is promoted
    to a PIP; the search ends early once no candidate sample is left.

    dist_measure: 1 = Euclidean (summed distance to both neighbouring PIPs),
    2 = perpendicular distance to the chord, any other value = vertical
    distance to the chord.

    Returns (xs, ys): the PIP indices in ascending order and the data values
    found at those indices.
    """
    pips_x = [0, len(data) - 1]
    pips_y = [data[0], data[-1]]

    for _ in range(2, n_pips):
        best_dist = -1.0
        best_idx = None
        best_pos = None

        # pips_x is kept in ascending order, so adjacent entries bound a segment
        for pos in range(1, len(pips_x)):
            x1, y1 = pips_x[pos - 1], pips_y[pos - 1]
            x2, y2 = pips_x[pos], pips_y[pos]
            run = x2 - x1
            rise = y2 - y1
            slope = rise / run if run != 0 else 0.0
            intercept = y1 - slope * x1

            # only samples strictly between two PIPs are candidates
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    d = np.hypot(i - x1, y0 - y1) + np.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    d = abs(slope * i + intercept - y0) / np.hypot(slope, 1)
                else:
                    d = abs(slope * i + intercept - y0)

                # strict comparison: the first sample reaching the maximum wins
                if d > best_dist:
                    best_dist, best_idx, best_pos = d, i, pos

        if best_idx is None:
            break
        # inserting in front of the right neighbour keeps both lists ordered
        pips_x.insert(best_pos, best_idx)
        pips_y.insert(best_pos, data[best_idx])

    return list(pips_x), list(pips_y)

# Disabled demo, kept as an inert string literal (it is never executed): it
# loads '../../prices.txt', runs find_pips on the last LAST_N points of the
# first three columns and plots each series together with its PIPs.
# NOTE(review): DIST_MEASURE = 24 is not one of the documented codes (1-3);
# the find_pips defined above treats any value other than 1 or 2 as the
# vertical distance.
'''
# ─── Load close-only price data ─────────────────────────────────────────────
prices = pd.read_csv(
    '../../prices.txt',
    delim_whitespace=True,
    header=None
)

# Parameters
N_PIPS = 5
DIST_MEASURE = 24
LAST_N = 50

# ─── Compute and plot for each instrument, slicing last LAST_N points ─────
for inst in prices.columns[:3]:
    series = prices[inst].values
    # take last LAST_N points
    slice_start = max(0, len(series) - LAST_N)
    slice_data = series[slice_start:]
    xs, ys = find_pips(slice_data, N_PIPS, DIST_MEASURE)
    # convert xs back to global indices if needed:
    global_xs = [slice_start + x for x in xs]

    plt.figure(figsize=(8, 3))
    plt.plot(range(slice_start, len(series)), slice_data, color="tab:blue", label=f"Inst {inst}")
    plt.scatter(global_xs, ys, color="tab:red", s=40, label="PIPs")
    plt.title(f"Instrument {inst}: Last {LAST_N} Points with PIPs")
    plt.xlabel("Time index")
    plt.ylabel("Price")
    plt.legend()
    plt.tight_layout()
    plt.show()
'''


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected. Each further PIP is the
    remaining sample with the largest distance to the chord joining the two
    PIPs that currently bracket it; the search stops early once no sample is
    left between the selected points.

    dist_measure:
        1 = Euclidean: summed straight-line distance to both bracketing PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices within data: [] when n_pips < 2 or
    data is empty, [0] when data holds a single sample.
    Raises ValueError if a sample has to be scored with an unknown dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # First and last sample coincide; report the index once, not twice.
        return [0]

    # Always include first and last points
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # examine each segment between neighbouring PIPs, left to right
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            chord_len = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    # Distance in the (index, value) plane to both segment ends;
                    # an index-only offset would ignore the values entirely.
                    dist = math.hypot(i - x1, y0 - y1) + math.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    num = abs(dy * i - dx * y0 + x2 * y1 - y2 * x1)
                    dist = num / chord_len if chord_len != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict '>' keeps the left-most sample on ties
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()  # keep neighbours adjacent for the next pass
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Slide a window of length window_size over each instrument's log-price series,
    extract PIP indices, min-max normalize the values at those indices, and
    count pattern frequencies.

    log_data: array shape (n_instruments, n_timesteps)
    Windows for which find_pips yields fewer than two PIPs are skipped.
    Returns Counter mapping normalized-PIP-value-tuples (rounded to 3 decimals) to counts.
    """
    n_inst, n_t = log_data.shape
    patterns = Counter()
    for inst in range(n_inst):
        series = log_data[inst]
        for end in range(window_size, n_t + 1):
            window = series[end - window_size:end]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                # e.g. n_pips < 2: there is no pattern to normalize, and
                # min()/max() of an empty selection would raise
                continue
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            if max_v - min_v != 0:
                norm = (vals - min_v) / (max_v - min_v)
            else:
                # flat pattern: avoid dividing by zero
                norm = np.zeros_like(vals)
            # rounding lets near-identical shapes share one Counter key
            pattern = tuple(np.round(norm, 3))
            patterns[pattern] += 1
    return patterns


def cluster_patterns(pattern_counts: Counter,
                     n_clusters: int,
                     threshold: float) -> Counter:
    """
    Group the normalized PIP patterns with KMeans and sum, per cluster, the
    frequencies of the patterns lying no farther than threshold from the
    centroid of the cluster they were assigned to.

    pattern_counts: Counter mapping pattern tuples to frequencies
    n_clusters: number of clusters for KMeans
    threshold: largest distance to the centroid that is still accumulated
    Returns Counter mapping cluster_id to total frequency.
    """
    features = np.array(list(pattern_counts.keys()))

    # Fit KMeans on the unique patterns (each pattern counts once here)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assigned = model.fit_predict(features)
    centroids = model.cluster_centers_

    # Add up the frequencies of the patterns close enough to their centroid
    totals = Counter()
    for row, (cluster_id, count) in enumerate(zip(assigned, pattern_counts.values())):
        if np.linalg.norm(features[row] - centroids[cluster_id]) <= threshold:
            totals[cluster_id] += count
    return totals


if __name__ == '__main__':
    # ---- Settings ----
    PRICE_FILE = '../../prices.txt'  # whitespace-delimited price table, one column per instrument
    WINDOW_SIZE = 100                # samples per sliding window
    N_PIPS = 8                       # PIPs extracted from each window
    DIST_MEASURE = 2                 # find_pips distance code (2 = perpendicular)
    N_CLUSTERS = 10                  # KMeans cluster count
    CLUSTER_THRESHOLD = 0.5          # largest centroid distance that is still counted

    # ---- Read the prices and insist on at least 50 instrument columns ----
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if not df.shape[1] >= 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")

    # ---- Log-prices, transposed so that each row is one instrument ----
    log_prices = np.log(df.values.T)

    # ---- Histogram of normalized PIP patterns over all sliding windows ----
    pattern_counts = extract_pip_patterns(
        log_prices,
        window_size=WINDOW_SIZE,
        n_pips=N_PIPS,
        dist_measure=DIST_MEASURE,
    )
    print(f"Found {len(pattern_counts)} unique PIP patterns.")

    # ---- Cluster the patterns and report the per-cluster totals ----
    cluster_counts = cluster_patterns(
        pattern_counts,
        n_clusters=N_CLUSTERS,
        threshold=CLUSTER_THRESHOLD,
    )
    print("\nCluster ID -> Accumulated Frequency (within threshold)")
    ranked = cluster_counts.most_common()
    for cid, freq in ranked:
        print(f"Cluster {cid}: {freq}")


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected. Each further PIP is the
    remaining sample with the largest distance to the chord joining the two
    PIPs that currently bracket it; the search stops early once no sample is
    left between the selected points.

    dist_measure:
        1 = Euclidean: summed straight-line distance to both bracketing PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices within data: [] when n_pips < 2 or
    data is empty, [0] when data holds a single sample.
    Raises ValueError if a sample has to be scored with an unknown dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # First and last sample coincide; report the index once, not twice.
        return [0]

    # Always include first and last points
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # examine each segment between neighbouring PIPs, left to right
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            chord_len = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    # Distance in the (index, value) plane to both segment ends;
                    # an index-only offset would ignore the values entirely.
                    dist = math.hypot(i - x1, y0 - y1) + math.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    num = abs(dy * i - dx * y0 + x2 * y1 - y2 * x1)
                    dist = num / chord_len if chord_len != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict '>' keeps the left-most sample on ties
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()  # keep neighbours adjacent for the next pass
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Slide a window of length window_size over each instrument's log-price series,
    extract PIP indices, min-max normalize the values at those indices, and
    count pattern frequencies.

    log_data: array shape (n_instruments, n_timesteps)
    Windows for which find_pips yields fewer than two PIPs are skipped.
    Returns Counter mapping normalized-PIP-value-tuples (rounded to 3 decimals) to counts.
    """
    n_inst, n_t = log_data.shape
    patterns = Counter()
    for inst in range(n_inst):
        series = log_data[inst]
        for end in range(window_size, n_t + 1):
            window = series[end - window_size:end]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                # e.g. n_pips < 2: there is no pattern to normalize, and
                # min()/max() of an empty selection would raise
                continue
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            if max_v - min_v != 0:
                norm = (vals - min_v) / (max_v - min_v)
            else:
                # flat pattern: avoid dividing by zero
                norm = np.zeros_like(vals)
            # rounding lets near-identical shapes share one Counter key
            pattern = tuple(np.round(norm, 3))
            patterns[pattern] += 1
    return patterns


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit KMeans (random_state=0) with n_clusters clusters on the given patterns.

    Returns (fitted model, patterns converted to an array, cluster label of
    each pattern in input order).
    """
    features = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(features)
    return model, features, assignments


if __name__ == '__main__':
    # ---- Settings ----
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited price table, one column per instrument
    WINDOW_SIZE = 100                    # samples per sliding window
    N_PIPS = 8                           # PIPs extracted from each window
    DIST_MEASURE = 2                     # find_pips distance code (2 = perpendicular)
    N_CLUSTERS = 10                      # KMeans cluster count
    CLUSTER_THRESHOLD = 0.5              # largest centroid distance that is still counted

    # ---- Read the prices, then switch to log-prices (one row per instrument) ----
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if not df.shape[1] >= 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    log_prices = np.log(df.values.T)

    # ---- 1) Histogram of normalized PIP patterns ----
    pattern_counts = extract_pip_patterns(log_prices, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    patterns = list(pattern_counts)
    freqs = [pattern_counts[p] for p in patterns]
    print(f"Found {len(patterns)} unique PIP patterns.")

    # ---- 2) KMeans over the unique patterns ----
    kmeans, data, labels = cluster_patterns(patterns, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # ---- 3) Per-cluster totals, counting only patterns near their centroid ----
    cluster_counts = Counter()
    pattern_to_cluster = {}
    for pos, pat in enumerate(patterns):
        lbl = labels[pos]
        if np.linalg.norm(data[pos] - centers[lbl]) <= CLUSTER_THRESHOLD:
            pattern_to_cluster[pat] = lbl
            cluster_counts[lbl] += freqs[pos]

    print("\nCluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    # ---- 4) Direction of the move between the last two PIPs, per cluster ----
    # NOTE(review): the window ending at the final timestep is not visited here
    # (start stops short of n_t - WINDOW_SIZE) - confirm this is intended.
    move_counts = {cid: {'Up': 0, 'Down': 0} for cid in cluster_counts}
    n_inst, n_t = log_prices.shape
    for series in log_prices:
        for start in range(n_t - WINDOW_SIZE):
            window = series[start:start + WINDOW_SIZE]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            # rebuild the pattern key the same way the histogram did
            vals = window[pips_idx]
            lo, hi = vals.min(), vals.max()
            if hi - lo != 0:
                norm = (vals - lo) / (hi - lo)
            else:
                norm = np.zeros_like(vals)
            cid = pattern_to_cluster.get(tuple(np.round(norm, 3)))
            if cid is None:
                # pattern unknown or too far from its centroid
                continue
            # indices are relative to the window start
            last_step = window[pips_idx[-1]] - window[pips_idx[-2]]
            move = 'Up' if last_step > 0 else 'Down'
            move_counts[cid][move] += 1

    print("\nCluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts.items():
        print(f"Cluster {cid}: Up = {counts['Up']}, Down = {counts['Down']}")


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected. Each further PIP is the
    remaining sample with the largest distance to the chord joining the two
    PIPs that currently bracket it; the search stops early once no sample is
    left between the selected points.

    dist_measure:
        1 = Euclidean: summed straight-line distance to both bracketing PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices within data: [] when n_pips < 2 or
    data is empty, [0] when data holds a single sample.
    Raises ValueError if a sample has to be scored with an unknown dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # First and last sample coincide; report the index once, not twice.
        return [0]

    # Always include first and last points
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # examine each segment between neighbouring PIPs, left to right
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            chord_len = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    # Distance in the (index, value) plane to both segment ends;
                    # an index-only offset would ignore the values entirely.
                    dist = math.hypot(i - x1, y0 - y1) + math.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    num = abs(dy * i - dx * y0 + x2 * y1 - y2 * x1)
                    dist = num / chord_len if chord_len != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict '>' keeps the left-most sample on ties
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()  # keep neighbours adjacent for the next pass
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Slide a window of length window_size over each instrument's log-price series,
    extract PIP indices, min-max normalize the values at those indices, and
    count pattern frequencies.

    log_data: array shape (n_instruments, n_timesteps)
    Windows for which find_pips yields fewer than two PIPs are skipped.
    Returns Counter mapping normalized-PIP-value-tuples (rounded to 3 decimals) to counts.
    """
    n_inst, n_t = log_data.shape
    patterns = Counter()
    for inst in range(n_inst):
        series = log_data[inst]
        for end in range(window_size, n_t + 1):
            window = series[end - window_size:end]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                # e.g. n_pips < 2: there is no pattern to normalize, and
                # min()/max() of an empty selection would raise
                continue
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            if max_v - min_v != 0:
                norm = (vals - min_v) / (max_v - min_v)
            else:
                # flat pattern: avoid dividing by zero
                norm = np.zeros_like(vals)
            # rounding lets near-identical shapes share one Counter key
            pattern = tuple(np.round(norm, 3))
            patterns[pattern] += 1
    return patterns


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit KMeans (random_state=0) with n_clusters clusters on the given patterns.

    Returns (fitted model, patterns converted to an array, cluster label of
    each pattern in input order).
    """
    features = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(features)
    return model, features, assignments


if __name__ == '__main__':
    # ---- Settings ----
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited price table, one column per instrument
    WINDOW_SIZE = 12                     # samples per sliding window
    N_PIPS = 3                           # PIPs extracted from each window
    DIST_MEASURE = 2                     # find_pips distance code (2 = perpendicular)
    N_CLUSTERS = 10                      # KMeans cluster count
    CLUSTER_THRESHOLD = 0.9              # largest centroid distance that is still counted

    # ---- Read the prices, then switch to log-prices (one row per instrument) ----
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if not df.shape[1] >= 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    log_prices = np.log(df.values.T)

    # ---- 1) Histogram of normalized PIP patterns ----
    pattern_counts = extract_pip_patterns(log_prices, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    patterns = list(pattern_counts)
    freqs = [pattern_counts[p] for p in patterns]
    print(f"Found {len(patterns)} unique PIP patterns.")

    # ---- 2) KMeans over the unique patterns ----
    kmeans, data, labels = cluster_patterns(patterns, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # ---- 3) Per-cluster totals, counting only patterns near their centroid ----
    cluster_counts = Counter()
    pattern_to_cluster = {}
    for pos, pat in enumerate(patterns):
        lbl = labels[pos]
        if np.linalg.norm(data[pos] - centers[lbl]) <= CLUSTER_THRESHOLD:
            pattern_to_cluster[pat] = lbl
            cluster_counts[lbl] += freqs[pos]

    print("\nCluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    # ---- 4) Direction of the move between the last two PIPs, per cluster ----
    # NOTE(review): the window ending at the final timestep is not visited here
    # (start stops short of n_t - WINDOW_SIZE) - confirm this is intended.
    move_counts = {cid: {'Up': 0, 'Down': 0} for cid in cluster_counts}
    n_inst, n_t = log_prices.shape
    for series in log_prices:
        for start in range(n_t - WINDOW_SIZE):
            window = series[start:start + WINDOW_SIZE]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            # rebuild the pattern key the same way the histogram did
            vals = window[pips_idx]
            lo, hi = vals.min(), vals.max()
            if hi - lo != 0:
                norm = (vals - lo) / (hi - lo)
            else:
                norm = np.zeros_like(vals)
            cid = pattern_to_cluster.get(tuple(np.round(norm, 3)))
            if cid is None:
                # pattern unknown or too far from its centroid
                continue
            last_step = window[pips_idx[-1]] - window[pips_idx[-2]]
            move = 'Up' if last_step > 0 else 'Down'
            move_counts[cid][move] += 1

    print("\nCluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts.items():
        print(f"Cluster {cid}: Up = {counts['Up']}, Down = {counts['Down']}")

    # ---- 5) All patterns on one figure, grouped cluster by cluster ----
    x = np.arange(N_PIPS)
    plt.figure(figsize=(10, 6))
    for cid in range(N_CLUSTERS):
        members = data[labels == cid]
        for rank, vec in enumerate(members):
            if rank == 0:
                # only the first line of a cluster carries the legend entry
                plt.plot(x, vec, label=f"Cluster {cid}", alpha=0.3)
            else:
                plt.plot(x, vec, alpha=0.1)
    plt.title('Normalized PIP Patterns by Cluster')
    plt.xlabel('PIP Index')
    plt.ylabel('Normalized Value')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected. Each further PIP is the
    remaining sample with the largest distance to the chord joining the two
    PIPs that currently bracket it; the search stops early once no sample is
    left between the selected points.

    dist_measure:
        1 = Euclidean: summed straight-line distance to both bracketing PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices within data: [] when n_pips < 2 or
    data is empty, [0] when data holds a single sample.
    Raises ValueError if a sample has to be scored with an unknown dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # First and last sample coincide; report the index once, not twice.
        return [0]

    # Always include first and last points
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # examine each segment between neighbouring PIPs, left to right
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            chord_len = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    # Distance in the (index, value) plane to both segment ends;
                    # an index-only offset would ignore the values entirely.
                    dist = math.hypot(i - x1, y0 - y1) + math.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    num = abs(dy * i - dx * y0 + x2 * y1 - y2 * x1)
                    dist = num / chord_len if chord_len != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict '>' keeps the left-most sample on ties
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()  # keep neighbours adjacent for the next pass
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Slide a window of length window_size over each instrument's log-price series,
    extract PIP indices, min-max normalize the values at those indices, and
    count pattern frequencies.

    log_data: array shape (n_instruments, n_timesteps)
    Windows for which find_pips yields fewer than two PIPs are skipped.
    Returns Counter mapping normalized-PIP-value-tuples (rounded to 3 decimals) to counts.
    """
    n_inst, n_t = log_data.shape
    patterns = Counter()
    for inst in range(n_inst):
        series = log_data[inst]
        for end in range(window_size, n_t + 1):
            window = series[end - window_size:end]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                # e.g. n_pips < 2: there is no pattern to normalize, and
                # min()/max() of an empty selection would raise
                continue
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            if max_v - min_v != 0:
                norm = (vals - min_v) / (max_v - min_v)
            else:
                # flat pattern: avoid dividing by zero
                norm = np.zeros_like(vals)
            # rounding lets near-identical shapes share one Counter key
            pattern = tuple(np.round(norm, 3))
            patterns[pattern] += 1
    return patterns


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit KMeans (random_state=0) with n_clusters clusters on the given patterns.

    Returns (fitted model, patterns converted to an array, cluster label of
    each pattern in input order).
    """
    features = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(features)
    return model, features, assignments


if __name__ == '__main__':
    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited price table, one column per instrument
    WINDOW_SIZE = 50                     # sliding window length
    N_PIPS = 5                           # number of PIPs per window
    DIST_MEASURE = 2                     # distance measure for PIPs (2 = perpendicular)
    N_CLUSTERS = 10                      # number of clusters to form
    CLUSTER_THRESHOLD = 0.95             # max distance to centroid to include in accumulation

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if df.shape[1] < 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    log_prices = np.log(df.values.T)     # transposed: one row per instrument

    # 1) Extract PIP patterns
    pattern_counts = extract_pip_patterns(log_prices, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    patterns = list(pattern_counts.keys())
    freqs = list(pattern_counts.values())
    print(f"Found {len(patterns)} unique PIP patterns.")

    # 2) Cluster patterns
    kmeans, data, labels = cluster_patterns(patterns, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # 3) Accumulate cluster frequencies (within threshold)
    cluster_counts = Counter()
    pattern_to_cluster = {}
    for pat, vec, lbl, cnt in zip(patterns, data, labels, freqs):
        dist = np.linalg.norm(vec - centers[lbl])
        if dist <= CLUSTER_THRESHOLD:
            cluster_counts[lbl] += cnt
            pattern_to_cluster[pat] = lbl

    print("\nCluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    # 4) Evaluate within-cluster PIP-based last move
    move_counts = {cid: {'Up': 0, 'Down': 0} for cid in cluster_counts}
    n_inst, n_t = log_prices.shape
    for inst in range(n_inst):
        series = log_prices[inst]
        for end in range(WINDOW_SIZE, n_t):
            window = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            # last two PIP indices (relative to window start)
            idx_prev, idx_last = pips_idx[-2], pips_idx[-1]
            val_prev, val_last = window[idx_prev], window[idx_last]
            # rebuild the pattern key the same way step 1 did
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            norm = (vals - min_v) / (max_v - min_v) if max_v - min_v != 0 else np.zeros_like(vals)
            pat = tuple(np.round(norm, 3))
            if pat in pattern_to_cluster:
                cid = pattern_to_cluster[pat]
                move = 'Up' if val_last - val_prev > 0 else 'Down'
                move_counts[cid][move] += 1

    print("\nCluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts.items():
        print(f"Cluster {cid}: Up = {counts['Up']}, Down = {counts['Down']}")

    # 5) Plot each cluster on its own figure
    # Kept inside the main guard: it needs data, labels and centers from above.
    x = np.arange(N_PIPS)
    for cid in range(N_CLUSTERS):
        members = data[labels == cid]
        if len(members) == 0:
            # decide before opening a figure so no empty figure is left behind
            continue
        plt.figure(figsize=(6, 4))
        for vec in members:
            plt.plot(x, vec, alpha=0.3)
        # overlay cluster centroid prominently
        plt.plot(x, centers[cid], label=f"Centroid {cid}", linewidth=2)
        # n is the accumulated within-threshold frequency, not the line count
        plt.title(f'Cluster {cid} (n={cluster_counts.get(cid, 0)})')
        plt.xlabel('PIP Index')
        plt.ylabel('Normalized Value')
        plt.legend()
        plt.tight_layout()
        plt.show()


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always selected. Each further PIP is the
    remaining sample with the largest distance to the chord joining the two
    PIPs that currently bracket it; the search stops early once no sample is
    left between the selected points.

    dist_measure:
        1 = Euclidean: summed straight-line distance to both bracketing PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices within data: [] when n_pips < 2 or
    data is empty, [0] when data holds a single sample.
    Raises ValueError if a sample has to be scored with an unknown dist_measure.
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # First and last sample coincide; report the index once, not twice.
        return [0]

    # Always include first and last points
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # examine each segment between neighbouring PIPs, left to right
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            chord_len = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    # Distance in the (index, value) plane to both segment ends;
                    # an index-only offset would ignore the values entirely.
                    dist = math.hypot(i - x1, y0 - y1) + math.hypot(i - x2, y0 - y2)
                elif dist_measure == 2:
                    num = abs(dy * i - dx * y0 + x2 * y1 - y2 * x1)
                    dist = num / chord_len if chord_len != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # strict '>' keeps the left-most sample on ties
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            break
        pips_x.append(max_idx)
        pips_x.sort()  # keep neighbours adjacent for the next pass
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the min-max normalized PIP patterns of every sliding window.

    Each row of log_data is scanned with a window of window_size samples; the
    values at the window's PIP indices are scaled to [0, 1], rounded to three
    decimals and counted as a tuple. Windows yielding fewer than two PIPs are
    skipped.
    """
    n_inst, n_t = log_data.shape
    tally = Counter()
    for series in log_data:
        for start in range(n_t - window_size + 1):
            window = series[start:start + window_size]
            pips_idx = find_pips(window, n_pips, dist_measure)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            min_v = vals.min()
            max_v = vals.max()
            if max_v != min_v:
                norm = (vals - min_v) / (max_v - min_v)
            else:
                # flat pattern: avoid dividing by zero
                norm = np.zeros_like(vals)
            key = tuple(np.round(norm, 3))
            tally[key] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit KMeans (random_state=0) with n_clusters clusters on the given patterns.

    Returns (fitted model, patterns converted to an array, cluster label of
    each pattern in input order).
    """
    features = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(features)
    return model, features, assignments


if __name__ == '__main__':
    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited price table, one column per instrument
    WINDOW_SIZE = 24                    # sliding window length
    N_PIPS = 5                          # number of PIPs per window
    DIST_MEASURE = 2                    # distance measure for PIPs (2 = perpendicular)
    N_CLUSTERS = 10                     # number of clusters to form
    CLUSTER_THRESHOLD = 0.95            # max distance to centroid to include in accumulation

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if df.shape[1] < 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    log_prices = np.log(df.values.T)     # transposed: one row per instrument

    # Split train/test along the time axis
    n_train = 500
    train_data = log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    n_inst, n_test = test_data.shape

    # 1) Train clusters on first 500 timesteps
    train_patterns = extract_pip_patterns(train_data, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    train_vecs = list(train_patterns.keys())
    train_freqs = list(train_patterns.values())
    kmeans, train_array, train_labels = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # 2) Evaluate test set: assign and accumulate
    cluster_counts_test = Counter()
    cluster_patterns_test = {cid: [] for cid in range(N_CLUSTERS)}
    # Plain dict: the values are nested Up/Down tallies, not counts, so a
    # Counter would only mislead (its counting helpers cannot handle dicts).
    move_counts_test = {cid: {'Up': 0, 'Down': 0} for cid in range(N_CLUSTERS)}
    for inst in range(n_inst):
        series = test_data[inst]
        for end in range(WINDOW_SIZE, n_test):
            window = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            min_v, max_v = vals.min(), vals.max()
            norm = (vals - min_v) / (max_v - min_v) if max_v != min_v else np.zeros_like(vals)
            # nearest trained centroid for this window's (unrounded) pattern
            lbl = kmeans.predict([norm])[0]
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue
            cluster_counts_test[lbl] += 1
            cluster_patterns_test[lbl].append(tuple(np.round(norm, 3)))
            # direction of the step between the last two PIPs of the window
            move = 'Up' if vals[-1] - vals[-2] > 0 else 'Down'
            move_counts_test[lbl][move] += 1

    # 3) Print test results
    print("\nTest Set: Cluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts_test.most_common():
        print(f"Cluster {cid}: {freq}")
    print("\nTest Set: Cluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts_test.items():
        up, down = counts['Up'], counts['Down']
        if up + down > 0:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # 4) Plot each test-set cluster overlayed patterns
    x = np.arange(N_PIPS)
    for cid, pats in cluster_patterns_test.items():
        if not pats:
            continue
        plt.figure(figsize=(6, 4))
        for pat in pats:
            plt.plot(x, pat, alpha=0.1)
        plt.plot(x, centers[cid], color='black', linewidth=2, label=f'Centroid {cid}')
        plt.title(f'Test Set Cluster {cid} (n={cluster_counts_test[cid]})')
        plt.xlabel('PIP Index')
        plt.ylabel('Normalized Value')
        plt.legend()
        plt.tight_layout()
        plt.show()

In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always PIPs. Each further PIP is the
    not-yet-selected sample with the largest distance to the segment joining
    the two PIPs that bound it (ties go to the earliest sample).

    dist_measure:
        1 = Euclidean: sum of straight-line distances to both bounding PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices into data. Fewer than n_pips
    indices are returned when no interior samples are left to pick; an empty
    series yields [] and a one-sample series yields [0].
    Raises ValueError for an unknown dist_measure (detected when the first
    candidate sample is scored).
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # A single sample cannot provide two distinct end points.
        return [0]
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # pips_x is kept sorted, so consecutive entries bound one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    # Total straight-line distance from the candidate to both
                    # bounding PIPs (uses the sample value, not only its index).
                    dist = (math.hypot(i - x1, data[i] - y1)
                            + math.hypot(x2 - i, y2 - data[i]))
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # No interior samples remain between any pair of PIPs.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the normalized PIP shape of every sliding window in every row.

    Each row of log_data is scanned with windows of window_size consecutive
    samples. The PIP values of a window are min-max scaled to [0, 1] (all
    zeros when they are constant), rounded to 3 decimals and used as the key
    whose count is incremented.
    """
    n_rows, n_steps = log_data.shape
    tally = Counter()
    for row in range(n_rows):
        series = log_data[row]
        for start in range(n_steps - window_size + 1):
            window = series[start:start + window_size]
            pip_positions = find_pips(window, n_pips, dist_measure)
            if len(pip_positions) >= 2:
                vals = window[pip_positions]
                lo, hi = vals.min(), vals.max()
                if hi != lo:
                    scaled = (vals - lo) / (hi - lo)
                else:
                    scaled = np.zeros_like(vals)
                tally[tuple(np.round(scaled, 3))] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit K-means (random_state=0) on the given pattern vectors.

    Returns the fitted model, the patterns as an array, and the cluster label
    assigned to each pattern.
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(matrix)
    return model, matrix, assignments


if __name__ == '__main__':
    # Script entry point: fit K-means on PIP patterns from the training span,
    # then label sliding windows of the test span and report/plot the matches.

    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited txt (750 rows × 50 cols)
    WINDOW_SIZE = 24                    # sliding window length
    N_PIPS = 5                          # number of PIPs per window
    DIST_MEASURE = 2                    # distance measure for PIPs
    N_CLUSTERS = 10                     # number of clusters to form
    CLUSTER_THRESHOLD = 0.95            # max distance to centroid to include in accumulation

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if df.shape[1] < 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    # Transpose so that each row is one instrument's series.
    log_prices = np.log(df.values.T)     # shape (50, 750)

    # Split train/test
    n_train = 500
    train_data = log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    n_inst, n_test = test_data.shape

    # 1) Train clusters on first 500 timesteps
    train_patterns = extract_pip_patterns(train_data, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    # Only the distinct patterns are clustered; their counts are not used as weights.
    train_vecs = list(train_patterns.keys())
    kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # 2) Evaluate test set: assign, accumulate, and record matches for instrument 0
    matched_ends_inst0 = []
    cluster_counts_test = Counter()
    cluster_patterns_test = {cid: [] for cid in range(N_CLUSTERS)}
    # Counter is used here only as a mapping of per-cluster {'Up', 'Down'} tallies.
    move_counts_test = Counter({cid: {'Up': 0, 'Down': 0} for cid in range(N_CLUSTERS)})

    for inst in range(n_inst):
        series = test_data[inst]
        # NOTE(review): `end` stops at n_test - 1, so the window ending at the last
        # test bar (end == n_test) is not evaluated, unlike extract_pip_patterns.
        for end in range(WINDOW_SIZE, n_test):
            window = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            # Min-max scale the PIP values to [0, 1]; constant windows map to all zeros.
            min_v, max_v = vals.min(), vals.max()
            norm = (vals - min_v) / (max_v - min_v) if max_v != min_v else np.zeros_like(vals)
            lbl = kmeans.predict([norm])[0]
            # Reject windows that are too far from their nearest centroid.
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue
            cluster_counts_test[lbl] += 1
            cluster_patterns_test[lbl].append(tuple(np.round(norm, 3)))
            # Direction of the segment between the last two PIPs; a flat segment counts as 'Down'.
            move = 'Up' if vals[-1] - vals[-2] > 0 else 'Down'
            move_counts_test[lbl][move] += 1
            if inst == 0:
                matched_ends_inst0.append(end)

    # 3) Print test results
    print("\nTest Set: Cluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts_test.most_common():
        print(f"Cluster {cid}: {freq}")
    print("\nTest Set: Cluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts_test.items():
        up, down = counts['Up'], counts['Down']
        if up + down > 0:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # 4) Plot each test-set cluster overlayed patterns
    x = np.arange(N_PIPS)
    for cid, pats in cluster_patterns_test.items():
        if not pats:
            continue
        plt.figure(figsize=(6, 4))
        for pat in pats:
            plt.plot(x, pat, alpha=0.1)
        plt.plot(x, centers[cid], color='black', linewidth=2, label=f'Centroid {cid}')
        plt.title(f'Test Set Cluster {cid} (n={cluster_counts_test[cid]})')
        plt.xlabel('PIP Index')
        plt.ylabel('Normalized Value')
        plt.legend()
        plt.tight_layout()
        plt.show()

    # 5) Plot instrument 0 price with shaded pattern windows
    plt.figure(figsize=(12, 4))
    # Test-span positions are offset by n_train to give indices into the full series.
    times = np.arange(n_train, n_train + n_test)
    prices0 = np.exp(test_data[0])
    plt.plot(times, prices0, label='Instrument 0 Price')
    for end in matched_ends_inst0:
        start = n_train + end - WINDOW_SIZE
        stop = n_train + end
        plt.axvspan(start, stop, color='orange', alpha=0.3)
    plt.title('Instrument 0 Test-Set Price with Pattern Matches')
    plt.xlabel('Time Index')
    plt.ylabel('Price')
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always PIPs. Each further PIP is the
    not-yet-selected sample with the largest distance to the segment joining
    the two PIPs that bound it (ties go to the earliest sample).

    dist_measure:
        1 = Euclidean: sum of straight-line distances to both bounding PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices into data. Fewer than n_pips
    indices are returned when no interior samples are left to pick; an empty
    series yields [] and a one-sample series yields [0].
    Raises ValueError for an unknown dist_measure (detected when the first
    candidate sample is scored).
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # A single sample cannot provide two distinct end points.
        return [0]
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # pips_x is kept sorted, so consecutive entries bound one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    # Total straight-line distance from the candidate to both
                    # bounding PIPs (uses the sample value, not only its index).
                    dist = (math.hypot(i - x1, data[i] - y1)
                            + math.hypot(x2 - i, y2 - data[i]))
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # No interior samples remain between any pair of PIPs.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the normalized PIP shape of every sliding window in every row.

    Each row of log_data is scanned with windows of window_size consecutive
    samples. The PIP values of a window are min-max scaled to [0, 1] (all
    zeros when they are constant), rounded to 3 decimals and used as the key
    whose count is incremented.
    """
    n_rows, n_steps = log_data.shape
    tally = Counter()
    for row in range(n_rows):
        series = log_data[row]
        for start in range(n_steps - window_size + 1):
            window = series[start:start + window_size]
            pip_positions = find_pips(window, n_pips, dist_measure)
            if len(pip_positions) >= 2:
                vals = window[pip_positions]
                lo, hi = vals.min(), vals.max()
                if hi != lo:
                    scaled = (vals - lo) / (hi - lo)
                else:
                    scaled = np.zeros_like(vals)
                tally[tuple(np.round(scaled, 3))] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit K-means (random_state=0) on the given pattern vectors.

    Returns the fitted model, the patterns as an array, and the cluster label
    assigned to each pattern.
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(matrix)
    return model, matrix, assignments


if __name__ == '__main__':
    # Script entry point: fit K-means on PIP patterns from the training span,
    # then label sliding windows of the test span and report/plot the matches.

    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited txt (750 rows × 50 cols)
    WINDOW_SIZE = 24                    # sliding window length
    N_PIPS = 5                          # number of PIPs per window
    DIST_MEASURE = 2                    # distance measure for PIPs
    N_CLUSTERS = 10                     # number of clusters to form
    CLUSTER_THRESHOLD = 0.3            # max distance to centroid to include in accumulation

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    if df.shape[1] < 50:
        raise ValueError(f"Expected at least 50 instruments, found {df.shape[1]}")
    # Transpose so that each row is one instrument's series.
    log_prices = np.log(df.values.T)     # shape (50, 750)

    # Split train/test
    n_train = 500
    train_data = log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    n_inst, n_test = test_data.shape

    # 1) Train clusters on first 500 timesteps
    train_patterns = extract_pip_patterns(train_data, WINDOW_SIZE, N_PIPS, DIST_MEASURE)
    # Only the distinct patterns are clustered; their counts are not used as weights.
    train_vecs = list(train_patterns.keys())
    kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # 2) Evaluate test set: assign, accumulate, and record matches for instrument 0
    matched_ends_inst0 = []
    cluster_counts_test = Counter()
    cluster_patterns_test = {cid: [] for cid in range(N_CLUSTERS)}
    # Counter is used here only as a mapping of per-cluster {'Up', 'Down'} tallies.
    move_counts_test = Counter({cid: {'Up': 0, 'Down': 0} for cid in range(N_CLUSTERS)})

    for inst in range(n_inst):
        series = test_data[inst]
        # NOTE(review): `end` stops at n_test - 1, so the window ending at the last
        # test bar (end == n_test) is not evaluated, unlike extract_pip_patterns.
        for end in range(WINDOW_SIZE, n_test):
            window = series[end - WINDOW_SIZE:end]
            pips_idx = find_pips(window, N_PIPS, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            # Min-max scale the PIP values to [0, 1]; constant windows map to all zeros.
            min_v, max_v = vals.min(), vals.max()
            norm = (vals - min_v) / (max_v - min_v) if max_v != min_v else np.zeros_like(vals)
            lbl = kmeans.predict([norm])[0]
            # Reject windows that are too far from their nearest centroid.
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue
            cluster_counts_test[lbl] += 1
            cluster_patterns_test[lbl].append(tuple(np.round(norm, 3)))
            # Direction of the segment between the last two PIPs; a flat segment counts as 'Down'.
            move = 'Up' if vals[-1] - vals[-2] > 0 else 'Down'
            move_counts_test[lbl][move] += 1
            if inst == 0:
                matched_ends_inst0.append(end)

    # 3) Print test results
    print("\nTest Set: Cluster ID -> Accumulated Frequency")
    for cid, freq in cluster_counts_test.most_common():
        print(f"Cluster {cid}: {freq}")
    print("\nTest Set: Cluster ID -> Last PIP Move Distribution")
    for cid, counts in move_counts_test.items():
        up, down = counts['Up'], counts['Down']
        if up + down > 0:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # 4) Plot each test-set cluster overlayed patterns
    x = np.arange(N_PIPS)
    for cid, pats in cluster_patterns_test.items():
        if not pats:
            continue
        plt.figure(figsize=(6, 4))
        for pat in pats:
            plt.plot(x, pat, alpha=0.1)
        plt.plot(x, centers[cid], color='black', linewidth=2, label=f'Centroid {cid}')
        plt.title(f'Test Set Cluster {cid} (n={cluster_counts_test[cid]})')
        plt.xlabel('PIP Index')
        plt.ylabel('Normalized Value')
        plt.legend()
        plt.tight_layout()
        plt.show()

# --- New cell: custom range plot ---
# Uses names left in the session by the run above: n_train, n_test, log_prices,
# test_data, matched_ends_inst0 and WINDOW_SIZE.

# 6) Plot instrument 0 price over a specific timestep range and shade matching windows
custom_start = 500
custom_end   = 520

# full time index and price for instrument 0
times_full      = np.arange(n_train + n_test)
prices0_full    = np.exp(log_prices[0])  # back to price scale

# select the slice (both bounds inclusive)
mask = (times_full >= custom_start) & (times_full <= custom_end)
plt.figure(figsize=(10, 4))
plt.plot(times_full[mask], prices0_full[mask], label=f'Price [{custom_start}:{custom_end}]')
# shade only those pattern windows that intersect this slice
for end in matched_ends_inst0:
    # matched ends are test-span positions; shift by n_train to full-series indices
    start_idx = n_train + end - WINDOW_SIZE
    stop_idx  = n_train + end
    # check for overlap with custom slice
    if stop_idx < custom_start or start_idx > custom_end:
        continue
    # clip the shaded span to the plotted range
    span_start = max(start_idx, custom_start)
    span_stop  = min(stop_idx,  custom_end)
    plt.axvspan(span_start, span_stop, color='orange', alpha=0.3)

plt.title(f'Instrument 0 Price and Shaded Pattern Matches (t={custom_start}..{custom_end})')
plt.xlabel('Time Index')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()

# 5) Plot instrument 0 price with shaded pattern windows
plt.figure(figsize=(12, 4))
times = np.arange(n_train, n_train + n_test)
# Row 0 is plotted so that the curve matches the label, the title and
# matched_ends_inst0 (previously row 1 was drawn under instrument-0 matches).
prices0 = np.exp(test_data[0])
plt.plot(times, prices0, label='Instrument 0 Price')
for end in matched_ends_inst0:
    start = n_train + end - WINDOW_SIZE
    stop = n_train + end
    plt.axvspan(start, stop, color='orange', alpha=0.1)
plt.title('Instrument 0 Test-Set Price with Pattern Matches')
plt.xlabel('Time Index')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always PIPs. Each further PIP is the
    not-yet-selected sample with the largest distance to the segment joining
    the two PIPs that bound it (ties go to the earliest sample).

    dist_measure:
        1 = Euclidean: sum of straight-line distances to both bounding PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices into data. Fewer than n_pips
    indices are returned when no interior samples are left to pick; an empty
    series yields [] and a one-sample series yields [0].
    Raises ValueError for an unknown dist_measure (detected when the first
    candidate sample is scored).
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # A single sample cannot provide two distinct end points.
        return [0]
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # pips_x is kept sorted, so consecutive entries bound one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    # Total straight-line distance from the candidate to both
                    # bounding PIPs (uses the sample value, not only its index).
                    dist = (math.hypot(i - x1, data[i] - y1)
                            + math.hypot(x2 - i, y2 - data[i]))
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # No interior samples remain between any pair of PIPs.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the normalized PIP shape of every sliding window in every row.

    Each row of log_data is scanned with windows of window_size consecutive
    samples. The PIP values of a window are min-max scaled to [0, 1] (all
    zeros when they are constant), rounded to 3 decimals and used as the key
    whose count is incremented.
    """
    n_rows, n_steps = log_data.shape
    tally = Counter()
    for row in range(n_rows):
        series = log_data[row]
        for start in range(n_steps - window_size + 1):
            window = series[start:start + window_size]
            pip_positions = find_pips(window, n_pips, dist_measure)
            if len(pip_positions) >= 2:
                vals = window[pip_positions]
                lo, hi = vals.min(), vals.max()
                if hi != lo:
                    scaled = (vals - lo) / (hi - lo)
                else:
                    scaled = np.zeros_like(vals)
                tally[tuple(np.round(scaled, 3))] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit K-means (random_state=0) on the given pattern vectors.

    Returns the fitted model, the patterns as an array, and the cluster label
    assigned to each pattern.
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(matrix)
    return model, matrix, assignments


# Configuration, data loading and the train/test split.
# NOTE(review): only these statements are inside the `__main__` guard; the
# training code that follows is at module level and relies on the names set here.
if __name__ == '__main__':
    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited txt (750 rows × 50 cols)
    WINDOW_SIZE = 36                    # sliding window length
    N_PIPS = 8                          # total PIPs (we use N_PIPS-1 for feature)
    DIST_MEASURE = 2                    # distance measure for PIPs
    N_CLUSTERS = 10                     # number of clusters to form
    CLUSTER_THRESHOLD = 0.30            # max distance to centroid for assignment

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    # Transpose so that each row is one instrument's series.
    log_prices = np.log(df.values.T)   # shape (50, 750)
    n_inst, n_t = log_prices.shape

    # Split train/test
    n_train = 500
    train_data = log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    _, n_test = test_data.shape

# 1) Train clusters on first 500 timesteps using N_PIPS-1 features (avoiding lookahead)
# Extract only N_PIPS-1 PIPs from each train window
train_patterns = Counter()
for inst in range(train_data.shape[0]):
    series = train_data[inst]
    # t is the bar right after the window; the window itself covers [t - WINDOW_SIZE, t).
    for t in range(WINDOW_SIZE, train_data.shape[1]):
        window_past = series[t - WINDOW_SIZE : t]
        pips_idx = find_pips(window_past, N_PIPS - 1, DIST_MEASURE)
        if len(pips_idx) < 2:
            continue
        vals = window_past[pips_idx]
        # Min-max scale the PIP values to [0, 1]; constant windows map to all zeros.
        mn, mx = vals.min(), vals.max()
        norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
        train_patterns[tuple(np.round(norm,3))] += 1
# Only the distinct patterns are clustered; their counts are not used as weights.
train_vecs = list(train_patterns.keys())
kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
centers = kmeans.cluster_centers_

# 2) Evaluate test set without look-ahead
# Uses names set earlier in this cell: test_data, n_inst, n_test, n_train,
# kmeans, centers and the configuration constants.
cluster_counts = Counter()
move_counts = {cid: {'Up': 0, 'Down': 0} for cid in range(N_CLUSTERS)}
# Normalized member patterns per cluster, needed for the overlay plots below.
cluster_members = {cid: [] for cid in range(N_CLUSTERS)}
# Window end positions (test-span index t) of instrument 0's accepted windows.
inst0_match_ends = []

for inst in range(n_inst):
    series = test_data[inst]
    for t in range(WINDOW_SIZE, n_test):
        # (a) build past window excluding bar t
        window_past = series[t - WINDOW_SIZE : t]
        # (b) extract N_PIPS-1 PIPs from past data
        pips_idx = find_pips(window_past, N_PIPS - 1, DIST_MEASURE)
        if len(pips_idx) < 2:
            continue
        vals = window_past[pips_idx]
        # normalize to [0, 1]; constant windows map to all zeros
        mn, mx = vals.min(), vals.max()
        norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
        # (c) assign cluster and reject windows too far from the centroid
        lbl = kmeans.predict([norm])[0]
        dist = np.linalg.norm(norm - centers[lbl])
        if dist > CLUSTER_THRESHOLD:
            continue
        # record cluster hit
        cluster_counts[lbl] += 1
        cluster_members[lbl].append(tuple(np.round(norm, 3)))
        if inst == 0:
            inst0_match_ends.append(t)
        # (d) observe next bar at t and compare to the last bar of the window;
        #     an unchanged price counts as 'Down'
        price_prev = series[t - 1]
        price_next = series[t]
        move = 'Up' if price_next > price_prev else 'Down'
        move_counts[lbl][move] += 1

# 3) Print results on test set
print("\nTest Set: Cluster ID -> Frequency")
for cid, freq in cluster_counts.most_common():
    print(f"Cluster {cid}: {freq}")

print("\nTest Set: Cluster ID -> Move Distribution")
# sort by cluster ID to keep it consistent
for cid in sorted(move_counts):
    up = move_counts[cid]['Up']
    down = move_counts[cid]['Down']
    # only print clusters that actually had hits
    if up + down > 0:
        print(f"Cluster {cid}: Up = {up}, Down = {down}")

# 4) Visualize raw test-series with shaded hits for instrument 0
times = np.arange(n_train, n_train + n_test)
prices0 = np.exp(test_data[0])
plt.figure(figsize=(12, 4))
plt.plot(times, prices0, label='Instrument 0 Price')
# Reuse the matches recorded in the evaluation loop instead of recomputing them.
for t in inst0_match_ends:
    start = n_train + t - WINDOW_SIZE
    stop  = n_train + t
    plt.axvspan(start, stop, color='orange', alpha=0.2)
plt.title('Instrument 0 Test-Set Price with Causal Pattern Matches')
plt.xlabel('Time Index')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()

# ——— Now plot each cluster's patterns + centroid ———
# Iterate the collected member patterns (lists of tuples), not the hit counts.
for cid, patterns in cluster_members.items():
    if not patterns:
        continue

    plt.figure(figsize=(6, 4))
    # overlay all member patterns
    for pat in patterns:
        plt.plot(range(len(pat)), pat, alpha=0.2)
    # bold centroid
    plt.plot(range(len(centers[cid])),
             centers[cid],
             color='black',
             linewidth=2,
             label=f"Centroid {cid}")
    plt.title(f"Test Set Cluster {cid} (n={len(patterns)})")
    plt.xlabel("PIP Index")
    plt.ylabel("Normalized Value")
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The first and last samples are always PIPs. Each further PIP is the
    not-yet-selected sample with the largest distance to the segment joining
    the two PIPs that bound it (ties go to the earliest sample).

    dist_measure:
        1 = Euclidean: sum of straight-line distances to both bounding PIPs
        2 = Perpendicular distance to the chord
        3 = Vertical distance to the chord

    Returns the sorted list of PIP indices into data. Fewer than n_pips
    indices are returned when no interior samples are left to pick; an empty
    series yields [] and a one-sample series yields [0].
    Raises ValueError for an unknown dist_measure (detected when the first
    candidate sample is scored).
    """
    n_points = len(data)
    if n_pips < 2 or n_points == 0:
        return []
    if n_points == 1:
        # A single sample cannot provide two distinct end points.
        return [0]
    pips_x = [0, n_points - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        # pips_x is kept sorted, so consecutive entries bound one segment.
        for x1, x2 in zip(pips_x[:-1], pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            for i in range(x1 + 1, x2):
                if dist_measure == 1:
                    # Total straight-line distance from the candidate to both
                    # bounding PIPs (uses the sample value, not only its index).
                    dist = (math.hypot(i - x1, data[i] - y1)
                            + math.hypot(x2 - i, y2 - data[i]))
                elif dist_measure == 2:
                    num = abs((y2 - y1) * i - (x2 - x1) * data[i] + x2 * y1 - y2 * x1)
                    den = math.hypot(y2 - y1, x2 - x1)
                    dist = num / den if den != 0 else 0
                elif dist_measure == 3:
                    interp = y1 + (y2 - y1) * (i - x1) / (x2 - x1)
                    dist = abs(data[i] - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # No interior samples remain between any pair of PIPs.
            break
        pips_x.append(max_idx)
        pips_x.sort()
    return pips_x


def extract_pip_patterns(log_data: np.ndarray,
                         window_size: int,
                         n_pips: int,
                         dist_measure: int = 2) -> Counter:
    """
    Tally the normalized PIP shape of every sliding window in every row.

    Each row of log_data is scanned with windows of window_size consecutive
    samples. The PIP values of a window are min-max scaled to [0, 1] (all
    zeros when they are constant), rounded to 3 decimals and used as the key
    whose count is incremented.
    """
    n_rows, n_steps = log_data.shape
    tally = Counter()
    for row in range(n_rows):
        series = log_data[row]
        for start in range(n_steps - window_size + 1):
            window = series[start:start + window_size]
            pip_positions = find_pips(window, n_pips, dist_measure)
            if len(pip_positions) >= 2:
                vals = window[pip_positions]
                lo, hi = vals.min(), vals.max()
                if hi != lo:
                    scaled = (vals - lo) / (hi - lo)
                else:
                    scaled = np.zeros_like(vals)
                tally[tuple(np.round(scaled, 3))] += 1
    return tally


def cluster_patterns(patterns: list, n_clusters: int) -> tuple[KMeans, np.ndarray, np.ndarray]:
    """
    Fit K-means (random_state=0) on the given pattern vectors.

    Returns the fitted model, the patterns as an array, and the cluster label
    assigned to each pattern.
    """
    matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(matrix)
    return model, matrix, assignments


if __name__ == '__main__':
    # Script entry point: cluster causal PIP patterns (N_PIPS-1 points taken
    # from the bars before t) on the training span, then measure on the test
    # span how the bar following each matched window moved.
    # The whole pipeline is indented under this guard; previously the training
    # section was dedented to column 0, which made the following indented
    # statements an IndentationError.

    # Configuration
    PRICE_FILE = '../../prices.txt'      # whitespace-delimited txt (750 rows × 50 cols)
    WINDOW_SIZE = 50                    # sliding window length
    N_PIPS = 5                          # total PIPs (we use N_PIPS-1 for feature)
    DIST_MEASURE = 2                    # distance measure for PIPs
    N_CLUSTERS = 10                     # number of clusters to form
    CLUSTER_THRESHOLD = 0.95            # max distance to centroid for assignment

    # Load and log-transform data
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    # Transpose so that each row is one instrument's series.
    log_prices = np.log(df.values.T)   # shape (50, 750)
    n_inst, n_t = log_prices.shape

    # Split train/test
    n_train = 500
    train_data = log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    _, n_test = test_data.shape

    # 1) Train clusters on first 500 timesteps using N_PIPS-1 features (avoiding lookahead)
    # Extract only N_PIPS-1 PIPs from each train window
    train_patterns = Counter()
    for inst in range(train_data.shape[0]):
        series = train_data[inst]
        # t is the bar right after the window; the window covers [t - WINDOW_SIZE, t).
        for t in range(WINDOW_SIZE, train_data.shape[1]):
            window_past = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window_past, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window_past[pips_idx]
            # Min-max scale to [0, 1]; constant windows map to all zeros.
            mn, mx = vals.min(), vals.max()
            norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
            train_patterns[tuple(np.round(norm,3))] += 1
    # Only the distinct patterns are clustered; their counts are not used as weights.
    train_vecs = list(train_patterns.keys())
    kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # 2) Evaluate test set without look-ahead
    cluster_counts = Counter()
    # Counter is used here only as a mapping of per-cluster {'Up', 'Down'} tallies.
    move_counts = Counter({cid: {'Up': 0, 'Down': 0} for cid in range(N_CLUSTERS)})

    for inst in range(n_inst):
        series = test_data[inst]
        for t in range(WINDOW_SIZE, n_test):
            # (a) build past window excluding bar t
            window_past = series[t - WINDOW_SIZE : t]
            # (b) extract N_PIPS-1 PIPs from past data
            pips_idx = find_pips(window_past, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window_past[pips_idx]
            # normalize
            mn, mx = vals.min(), vals.max()
            norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
            # (c) assign cluster
            lbl = kmeans.predict([norm])[0]
            dist = np.linalg.norm(norm - centers[lbl][:N_PIPS-1])
            if dist > CLUSTER_THRESHOLD:
                continue
            # record cluster hit
            cluster_counts[lbl] += 1
            # (d) observe next bar at t and compare to the last bar of the window;
            #     an unchanged price counts as 'Down'
            price_prev = series[t - 1]
            price_next = series[t]
            move = 'Up' if price_next > price_prev else 'Down'
            move_counts[lbl][move] += 1

    # 3) Print results on test set
    print("\nTest Set: Cluster ID -> Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    print("\nTest Set: Cluster ID -> Move Distribution")
    for cid, cnts in move_counts.items():
        up, down = cnts['Up'], cnts['Down']
        if up + down:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # 4) Visualize raw test-series with shaded hits for instrument 0
    # Test-span positions are offset by n_train to give indices into the full series.
    times = np.arange(n_train, n_train + n_test)
    prices0 = np.exp(test_data[0])
    plt.figure(figsize=(12, 4))
    plt.plot(times, prices0, label='Instrument 0 Price')
    for t in range(WINDOW_SIZE, n_test):
        window_past = test_data[0][t - WINDOW_SIZE : t]
        pips_idx = find_pips(window_past, N_PIPS - 1, DIST_MEASURE)
        if len(pips_idx) < 2:
            continue
        vals = window_past[pips_idx]
        mn, mx = vals.min(), vals.max()
        norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
        lbl = kmeans.predict([norm])[0]
        dist = np.linalg.norm(norm - centers[lbl][:N_PIPS-1])
        if dist <= CLUSTER_THRESHOLD:
            start = n_train + t - WINDOW_SIZE
            stop  = n_train + t
            plt.axvspan(start, stop, color='orange', alpha=0.2)
    plt.title('Instrument 0 Test-Set Price with Causal Pattern Matches')
    plt.xlabel('Time Index')
    plt.ylabel('Price')
    plt.legend()
    plt.tight_layout()
    plt.show()



In [None]:
# --- Grid Search for Best Pattern Clusters ---
# For every (window, PIP count, K, threshold) combination: cluster the causal
# PIP patterns of the training span, assign test-span windows to clusters, and
# score each cluster by its directional edge |Up - Down| / total on the next bar.
# Uses names left in the session by earlier cells: n_inst, n_train, n_test,
# train_data, test_data, DIST_MEASURE, find_pips and cluster_patterns.
# NOTE(review): edges are measured on the same test span that is used to rank
# the combinations, so the top entries are optimistically biased.
window_sizes    = [10, 18, 36, 54]
n_pips_list     = [3, 5, 7, 9]
n_clusters_list = [8, 10, 12]
thresholds      = [0.25, 0.30, 0.35, 0.40]


def _normalized_pip_vector(win_past, n_feat):
    """Return the min-max scaled PIP values of win_past, or None if fewer than 2 PIPs were found."""
    idx = find_pips(win_past, n_feat, DIST_MEASURE)
    if len(idx) < 2:
        return None
    vals = win_past[idx]
    mn, mx = vals.min(), vals.max()
    return (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)


results = []  # (edge, total, win, pips, k, th, cid)
combo_counter = 0
for WIN in window_sizes:
    for PIPS in n_pips_list:
        # Feature extraction depends only on (WIN, PIPS), so it is done once
        # here instead of once per (K, TH) combination.

        # --- Train on causal (PIPS-1) features ---
        train_patterns = Counter()
        for inst in range(n_inst):
            series = train_data[inst]
            for t in range(WIN, n_train):
                vec = _normalized_pip_vector(series[t - WIN : t], PIPS - 1)
                if vec is not None:
                    train_patterns[tuple(np.round(vec, 3))] += 1
        train_vecs = list(train_patterns.keys())

        # --- Test features: (pattern vector, move of the bar after the window) ---
        test_samples = []
        for inst in range(n_inst):
            series = test_data[inst]
            for t in range(WIN, n_test):
                vec = _normalized_pip_vector(series[t - WIN : t], PIPS - 1)
                if vec is None:
                    continue
                mv = 'Up' if series[t] > series[t-1] else 'Down'
                test_samples.append((vec, mv))

        for K in n_clusters_list:
            # (cluster id, distance to centroid, move) per test sample; the fit
            # depends only on (WIN, PIPS, K), so it is computed on first use
            # and shared by all thresholds.
            assigned = None
            for TH in thresholds:
                combo_counter += 1
                print(f"\n[Combo {combo_counter}] WIN={WIN}, PIPS={PIPS}, K={K}, TH={TH} → training…")

                if len(train_vecs) < K:
                    print("  ↳ skipped (not enough unique patterns for given K)")
                    continue
                if assigned is None:
                    kmeans, _, _ = cluster_patterns(train_vecs, K)
                    centers = kmeans.cluster_centers_
                    assigned = []
                    for vec, mv in test_samples:
                        cid = kmeans.predict([vec])[0]
                        dist = np.linalg.norm(vec - centers[cid])
                        assigned.append((cid, dist, mv))

                # --- Test on unseen data: keep only samples within TH of their centroid ---
                move_cnt = {cid: {'Up': 0, 'Down': 0} for cid in range(K)}
                for cid, dist, mv in assigned:
                    if dist > TH:
                        continue
                    move_cnt[cid][mv] += 1

                # --- Record edges per cluster ---
                best_edge_combo = 0
                for cid in range(K):
                    up   = move_cnt[cid]['Up']
                    down = move_cnt[cid]['Down']
                    tot  = up + down
                    if tot == 0:
                        continue
                    edge = abs(up - down) / tot
                    if edge > best_edge_combo:
                        best_edge_combo = edge
                    results.append((edge, tot, WIN, PIPS, K, TH, cid))
                print(f"  ↳ finished. Best edge in this combo = {best_edge_combo:.3f} (across its clusters)")

# sort by edge then total
results.sort(key=lambda x: (x[0], x[1]), reverse=True)
print("\n=========== TOP 10 PARAMETER + CLUSTER COMBINATIONS ===========")
print("edge\ttotal\twindow\tpips\tK\tthreshold\tcid")
for edge, tot, win, pips, k, th, cid in results[:10]:
    print(f"{edge:.3f}\t{tot}\t{win}\t{pips}\t{k}\t{th}\t{cid}")

In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    Starting from the two endpoints, the interior point with the largest
    distance to its enclosing pair of already-selected PIPs is added, one per
    round, until n_pips points are selected or no interior point remains.

    dist_measure:
        1 = Euclidean: sum of straight-line distances to both adjacent PIPs
        2 = Perpendicular distance to the chord joining the adjacent PIPs
        3 = Vertical distance to that chord

    Returns the selected indices into data, sorted ascending. Returns [] when
    n_pips < 2 or data is empty, and [0] for a single-element series.
    Raises ValueError for an unknown dist_measure as soon as a candidate
    point has to be scored.
    """
    n = len(data)
    if n_pips < 2 or n == 0:
        return []
    if n == 1:
        # Both endpoints coincide; report the index once rather than [0, 0].
        return [0]

    pips_x = [0, n - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        pips_x.sort()
        for x1, x2 in zip(pips_x, pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            run, rise = x2 - x1, y2 - y1
            # run >= 1, so the chord length is never zero.
            chord_len = math.hypot(rise, run)
            for i in range(x1 + 1, x2):
                yi = data[i]
                if dist_measure == 1:
                    dist = (math.hypot(i - x1, yi - y1)
                            + math.hypot(i - x2, yi - y2))
                elif dist_measure == 2:
                    dist = abs(rise * i - run * yi + x2 * y1 - y2 * x1) / chord_len
                elif dist_measure == 3:
                    dist = abs(yi - (y1 + rise * (i - x1) / run))
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # Strict '>' keeps the earliest index on ties.
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # Every index is already a PIP; fewer than n_pips are available.
            break
        pips_x.append(max_idx)
    return sorted(pips_x)


# --- Grid Search for Best Pattern Clusters ---
# Relies on names defined by other cells: train_data, test_data, n_inst,
# n_train, n_test, DIST_MEASURE, cluster_patterns.
window_sizes    = [10, 18, 36, 54]
n_pips_list     = [5, 7, 9]
n_clusters_list = [8, 10, 12]
thresholds      = [0.25, 0.30, 0.35, 0.40, 0.50]
MAX_ROWS_SHOWN  = 192  # cap on the number of ranked rows printed at the end


def _iter_pip_vectors(price_matrix, n_steps, win, n_feat):
    """
    Yield (series, t, vec) for every window series[t-win:t] with t in
    [win, n_steps), where vec is the min-max-normalised values at the
    n_feat PIPs of that window. The window ends before bar t, so vec
    contains no information from bar t itself.
    """
    for inst in range(n_inst):
        series = price_matrix[inst]
        for t in range(win, n_steps):
            win_past = series[t - win : t]
            idx = find_pips(win_past, n_feat, DIST_MEASURE)
            if len(idx) < 2:
                continue
            vals = win_past[idx]
            mn, mx = vals.min(), vals.max()
            vec = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
            yield series, t, vec


def _grid_features(win, n_feat):
    """
    Build the features that depend only on (window, pips).

    Returns (train_vecs, test_mat, test_up):
      train_vecs - unique training vectors (rounded to 3 decimals), in first-seen order
      test_mat   - one row per test window
      test_up    - bool per test window: did bar t close above bar t-1
    """
    train_patterns = Counter()
    for _, _, vec in _iter_pip_vectors(train_data, n_train, win, n_feat):
        train_patterns[tuple(np.round(vec, 3))] += 1

    test_vecs, test_up = [], []
    for series, t, vec in _iter_pip_vectors(test_data, n_test, win, n_feat):
        test_vecs.append(vec)
        test_up.append(series[t] > series[t - 1])
    return list(train_patterns), np.array(test_vecs), np.array(test_up, dtype=bool)


def _grid_assign(train_vecs, test_mat, k):
    """
    Fit k clusters on train_vecs and return, for every test row, its
    predicted cluster label and its distance to that cluster's centroid.
    """
    kmeans, _, _ = cluster_patterns(train_vecs, k)
    if len(test_mat) == 0:
        return np.empty(0, dtype=int), np.empty(0)
    labels = kmeans.predict(test_mat)
    dists = np.linalg.norm(test_mat - kmeans.cluster_centers_[labels], axis=1)
    return labels, dists


results = []  # (edge, total, win, pips, k, th, cid)
combo_counter = 0
for WIN in window_sizes:
    for PIPS in n_pips_list:
        # Features do not depend on K or TH: build them once per (WIN, PIPS),
        # lazily so the progress line is still printed before the work starts.
        features = None
        for K in n_clusters_list:
            # The clustering does not depend on TH: fit once per (WIN, PIPS, K).
            assignment = None
            for TH in thresholds:
                combo_counter += 1
                print(f"\n[Combo {combo_counter}] WIN={WIN}, PIPS={PIPS}, K={K}, TH={TH} → training…")

                # --- Train on causal (PIPS-1) features ---
                if features is None:
                    features = _grid_features(WIN, PIPS - 1)
                train_vecs, test_mat, test_up = features
                if len(train_vecs) < K:
                    print("  ↳ skipped (not enough unique patterns for given K)")
                    continue
                if assignment is None:
                    assignment = _grid_assign(train_vecs, test_mat, K)
                labels, dists = assignment

                # --- Test on unseen data: keep windows within TH of their centroid ---
                kept = ~(dists > TH)

                # --- Record edges per cluster ---
                best_edge_combo = 0
                for cid in range(K):
                    in_cluster = kept & (labels == cid)
                    tot = int(in_cluster.sum())
                    if tot == 0:
                        continue
                    up = int(test_up[in_cluster].sum())
                    down = tot - up
                    edge = abs(up - down) / tot
                    if edge > best_edge_combo:
                        best_edge_combo = edge
                    results.append((edge, tot, WIN, PIPS, K, TH, cid))
                print(f"  ↳ finished. Best edge in this combo = {best_edge_combo:.3f} (across its clusters)")

# sort by edge, then by total count (frequency), both descending
results.sort(key=lambda x: (-x[0], -x[1]))
shown = results[:MAX_ROWS_SHOWN]
# Report the number of rows actually printed instead of a hard-coded figure.
print(f"\n=========== TOP {len(shown)} PARAMETER + CLUSTER COMBINATIONS ===========")
print("edge\ttotal\twindow\tpips\tK\tthreshold\tcid")
for edge, tot, win, pips, k, th, cid in shown:
    print(f"{edge:.3f}\t{tot}\t{win}\t{pips}\t{k}\t{th}\t{cid}")


In [None]:
# Include random seed in grid search

In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_pips(data: np.ndarray, n_pips: int, dist_measure: int = 2) -> list[int]:
    """
    Extract up to n_pips perceptually important points (PIPs) from a 1D series.

    The two endpoints are always selected first. Each further round adds the
    interior point that lies farthest from the pair of already-selected PIPs
    enclosing it; selection stops early when no interior point is left.

    dist_measure: 1=Euclidean (sum of straight-line distances to both
                    adjacent PIPs),
                  2=Perpendicular distance to chord,
                  3=Vertical distance to chord
    Returns sorted list of indices for PIPs within data; [] when n_pips < 2
    or data is empty, [0] when data has a single element.
    Raises ValueError for an unknown dist_measure once a candidate point
    has to be scored.
    """
    size = len(data)
    if n_pips < 2 or size == 0:
        return []
    if size == 1:
        # First and last index are the same point; do not report it twice.
        return [0]

    pips_x = [0, size - 1]
    for _ in range(2, n_pips):
        max_dist = -1.0
        max_idx = None
        pips_x.sort()
        for x1, x2 in zip(pips_x, pips_x[1:]):
            y1, y2 = data[x1], data[x2]
            dx, dy = x2 - x1, y2 - y1
            # dx >= 1 for distinct sorted indices, so den is never zero.
            den = math.hypot(dy, dx)
            for i in range(x1 + 1, x2):
                y0 = data[i]
                if dist_measure == 1:
                    dist = (math.hypot(i - x1, y0 - y1) +
                            math.hypot(i - x2, y0 - y2))
                elif dist_measure == 2:
                    num = abs(dy * i -
                              dx * y0 +
                              x2*y1 - y2*x1)
                    dist = num / den
                elif dist_measure == 3:
                    interp = y1 + dy * (i - x1) / dx
                    dist = abs(y0 - interp)
                else:
                    raise ValueError(f"Unknown dist_measure {dist_measure}")
                # Strict comparison: the lowest index wins a tie.
                if dist > max_dist:
                    max_dist = dist
                    max_idx = i
        if max_idx is None:
            # All indices are already PIPs.
            break
        pips_x.append(max_idx)
    return sorted(pips_x)


def cluster_patterns(patterns: list, n_clusters: int):
    """
    Cluster pattern vectors with KMeans using a fixed seed (random_state=0).

    Returns a 3-tuple: the fitted KMeans model, the patterns converted to an
    ndarray, and the cluster label assigned to each pattern.
    """
    pattern_matrix = np.array(patterns)
    model = KMeans(n_clusters=n_clusters, random_state=0)
    assignments = model.fit_predict(pattern_matrix)
    return model, pattern_matrix, assignments


if __name__ == '__main__':
    # Fit pattern clusters on the first n_train bars, then measure on the
    # remaining bars how often each cluster is followed by an up or down move.

    # ─── Configuration ───────────────────────────────────────────────────────
    PRICE_FILE       = '../../prices.txt'  # (750 rows × 50 cols, whitespace-delimited)
    WINDOW_SIZE      = 10                  # sliding window length
    N_PIPS           = 7                   # total PIPs (we use N_PIPS-1 for features)
    DIST_MEASURE     = 2                   # distance measure for PIPs
    N_CLUSTERS       = 12                  # number of clusters to form
    CLUSTER_THRESHOLD= 0.25                # max distance to centroid for assignment

    # ─── Load & pre-process ──────────────────────────────────────────────────
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    log_prices = np.log(df.values.T)       # shape: (n_inst, 750)
    n_inst, n_t = log_prices.shape

    # split into train/test
    n_train   = 500
    train_data= log_prices[:, :n_train]
    test_data = log_prices[:, n_train:]
    _, n_test = test_data.shape

    # ─── 1) Train clusters on first 500 timesteps ───────────────────────────
    # Keyed by the rounded feature vector, so only distinct patterns are clustered.
    train_patterns = Counter()
    for inst in range(n_inst):
        series = train_data[inst]
        for t in range(WINDOW_SIZE, n_train):
            # The window ends at bar t-1; bar t is never part of the features.
            window = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            # Min-max scale to [0, 1]; a flat window maps to all zeros.
            norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
            train_patterns[tuple(np.round(norm, 3))] += 1

    train_vecs = list(train_patterns.keys())
    kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # ─── 2) Evaluate on test set (bars 501–750) ─────────────────────────────
    cluster_counts   = Counter()
    # Plain dict: the values are Up/Down tallies (dicts), not counts, so a
    # Counter's arithmetic and most_common() would not be meaningful here.
    move_counts      = {cid: {'Up': 0, 'Down': 0}
                        for cid in range(N_CLUSTERS)}
    cluster_examples = {cid: [] for cid in range(N_CLUSTERS)}

    for inst in range(n_inst):
        series = test_data[inst]
        for t in range(WINDOW_SIZE, n_test):
            window = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue

            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)

            # int(): use plain Python ints as dictionary keys.
            lbl  = int(kmeans.predict([norm])[0])
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                # Too far from the nearest centroid to count as that pattern.
                continue

            # record
            cluster_counts[lbl] += 1
            cluster_examples[lbl].append(norm)

            # move Up/Down (an unchanged price counts as 'Down')
            prev_p = series[t - 1]
            next_p = series[t]
            move = 'Up' if next_p > prev_p else 'Down'
            move_counts[lbl][move] += 1

    # ─── 3) Print summary ────────────────────────────────────────────────────
    print("\nTest Set: Cluster ID -> Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    print("\nTest Set: Cluster ID -> Move Distribution")
    for cid in sorted(move_counts):
        up   = move_counts[cid]['Up']
        down = move_counts[cid]['Down']
        if up + down:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # ─── 4) Plot each cluster’s overlaid patterns + centroid ────────────────
    for cid, patterns in cluster_examples.items():
        if not patterns:
            continue

        plt.figure(figsize=(6, 4))
        # all member patterns
        for pat in patterns:
            plt.plot(range(len(pat)), pat, alpha=0.2)

        # bold centroid
        plt.plot(range(len(centers[cid])),
                 centers[cid],
                 color='black',
                 linewidth=2,
                 label=f"Centroid {cid}")

        plt.title(f"Test Set Cluster {cid} (n={len(patterns)})")
        plt.xlabel("PIP Index")
        plt.ylabel("Normalized Value")
        plt.legend()
        plt.tight_layout()
        plt.show()


if __name__ == '__main__':
    # Re-runs the test-set evaluation, this time storing each matched pattern
    # together with the (normalised) bar that followed it.
    # Uses names this block does not define itself: N_CLUSTERS, WINDOW_SIZE,
    # N_PIPS, DIST_MEASURE, CLUSTER_THRESHOLD, n_inst, n_test, test_data,
    # kmeans and centers must already exist.

    # ─── 1) Evaluate on test set (bars 501–750) ─────────────────────────────
    cluster_counts   = Counter()
    # Plain dict: the values are Up/Down tallies (dicts), not counts, so a
    # Counter's arithmetic and most_common() would not be meaningful here.
    move_counts      = {cid: {'Up': 0, 'Down': 0}
                        for cid in range(N_CLUSTERS)}
    # now each example will be length (N_PIPS-1)+1
    cluster_examples = {cid: [] for cid in range(N_CLUSTERS)}

    for inst in range(n_inst):
        series = test_data[inst]
        for t in range(WINDOW_SIZE, n_test):
            # The window ends at bar t-1; bar t is the "next bar".
            window = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue

            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            # Min-max scale to [0, 1]; a flat window maps to all zeros.
            norm = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)

            # int(): use plain Python ints as dictionary keys.
            lbl  = int(kmeans.predict([norm])[0])
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue

            # record frequency
            cluster_counts[lbl] += 1

            # Scale the next bar with the window's own min/max so it is drawn
            # on the same axis as the pattern (it may fall outside [0, 1]).
            next_price = series[t]
            next_norm  = (next_price - mn) / (mx - mn) if mx != mn else 0.0
            pattern_with_next = np.concatenate([norm, [next_norm]])
            cluster_examples[lbl].append(pattern_with_next)

            # record Up/Down (an unchanged price counts as 'Down')
            move = 'Up' if next_price > series[t-1] else 'Down'
            move_counts[lbl][move] += 1

    # ─── 2) Plot each cluster’s overlaid patterns + centroid ────────────────
    for cid, patterns in cluster_examples.items():
        if not patterns:
            continue

        plt.figure(figsize=(6, 4))
        # plot every window+next
        for pat in patterns:
            plt.plot(range(len(pat)), pat, alpha=0.2)

        # The centroid has no next-bar component, so it spans one point fewer
        # than the member patterns.
        plt.plot(range(len(centers[cid])),
                 centers[cid],
                 color='black',
                 linewidth=2,
                 label=f"Centroid {cid}")

        # mark the “next bar” vertical for clarity
        plt.axvline(len(centers[cid]) - 0.5, color='gray', linestyle='--', alpha=0.5)

        plt.title(f"Test Set Cluster {cid} (n={len(patterns)})")
        plt.xlabel("Index (0…PIPs-2 are PIPs, last point is next bar)")
        plt.ylabel("Normalized Value")
        plt.legend()
        plt.tight_layout()
        plt.show()

In [None]:
if __name__ == '__main__':
    # Fit pattern clusters on the first n_train bars, then on the remaining
    # bars measure each cluster's next-bar return statistics and plot the
    # clusters whose mean next-bar return is positive.

    # ─── Configuration ───────────────────────────────────────────────────────
    PRICE_FILE        = '../../prices.txt'
    WINDOW_SIZE       = 10      # sliding window length
    N_PIPS            = 7       # N_PIPS-1 points are used as features
    DIST_MEASURE      = 2       # distance measure passed to find_pips
    N_CLUSTERS        = 12
    CLUSTER_THRESHOLD = 0.25    # max distance to centroid for a match

    # ─── Load & pre-process ──────────────────────────────────────────────────
    df = pd.read_csv(PRICE_FILE, sep=r"\s+", header=None)
    log_prices = np.log(df.values.T)       # shape: (n_inst, 750)
    n_inst, n_t = log_prices.shape

    n_train    = 500
    train_data = log_prices[:, :n_train]
    test_data  = log_prices[:, n_train:]
    _, n_test  = test_data.shape

    # ─── 1) Train clusters on first 500 timesteps ───────────────────────────
    # Keyed by the rounded feature vector, so only distinct patterns are clustered.
    train_patterns = Counter()
    for inst in range(n_inst):
        series = train_data[inst]
        for t in range(WINDOW_SIZE, n_train):
            # The window ends at bar t-1; bar t is never part of the features.
            window   = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue
            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            # Min-max scale to [0, 1]; a flat window maps to all zeros.
            norm   = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)
            train_patterns[tuple(np.round(norm, 3))] += 1

    train_vecs = list(train_patterns.keys())
    kmeans, _, _ = cluster_patterns(train_vecs, N_CLUSTERS)
    centers = kmeans.cluster_centers_

    # ─── 2) Single pass over test set (bars 501–750) ────────────────────────
    cluster_counts   = Counter()
    # Plain dict: the values are Up/Down tallies (dicts), not counts, so a
    # Counter's arithmetic and most_common() would not be meaningful here.
    move_counts      = {cid: {'Up': 0, 'Down': 0}
                        for cid in range(N_CLUSTERS)}
    cluster_returns  = {cid: [] for cid in range(N_CLUSTERS)}
    cluster_examples = {cid: [] for cid in range(N_CLUSTERS)}

    for inst in range(n_inst):
        series = test_data[inst]
        for t in range(WINDOW_SIZE, n_test):
            window   = series[t - WINDOW_SIZE : t]
            pips_idx = find_pips(window, N_PIPS - 1, DIST_MEASURE)
            if len(pips_idx) < 2:
                continue

            vals = window[pips_idx]
            mn, mx = vals.min(), vals.max()
            norm   = (vals - mn) / (mx - mn) if mx != mn else np.zeros_like(vals)

            # int(): use plain Python ints as dictionary keys.
            lbl  = int(kmeans.predict([norm])[0])
            dist = np.linalg.norm(norm - centers[lbl])
            if dist > CLUSTER_THRESHOLD:
                continue

            # 2a) record counts & raw return
            cluster_counts[lbl] += 1
            # series holds LOG prices, so the simple return P_t/P_{t-1} - 1 is
            # expm1 of the log difference. Dividing the difference by the log
            # price (as a plain-price formula would) gives a meaningless value
            # whose sign flips for prices below 1.
            ret = np.expm1(series[t] - series[t-1])
            cluster_returns[lbl].append(ret)
            move = 'Up' if ret > 0 else 'Down'
            move_counts[lbl][move] += 1

            # 2b) store the normalized pattern _plus_ next bar, scaled with the
            #     window's own min/max (so it may fall outside [0, 1])
            next_norm = (series[t] - mn) / (mx - mn) if mx != mn else 0.0
            pattern_with_next = np.concatenate([norm, [next_norm]])
            cluster_examples[lbl].append(pattern_with_next)

    # ─── 3) Print frequency & move distributions ───────────────────────────
    print("\nTest Set: Cluster ID → Frequency")
    for cid, freq in cluster_counts.most_common():
        print(f"Cluster {cid}: {freq}")

    print("\nTest Set: Cluster ID → Move Distribution")
    for cid in sorted(move_counts):
        up, down = move_counts[cid]['Up'], move_counts[cid]['Down']
        if up + down:
            print(f"Cluster {cid}: Up = {up}, Down = {down}")

    # ─── 4) Compute p_up, E[up], E[down], expectancy ───────────────────────
    cluster_stats = {}
    for cid, rets in cluster_returns.items():
        if not rets:
            continue
        rets    = np.array(rets)
        mask_up = rets > 0
        p_up    = mask_up.mean()
        # Zero returns fall on the "down" side, matching the Up/Down tally.
        e_up    = rets[mask_up].mean()  if mask_up.any()  else 0.0
        e_down  = rets[~mask_up].mean() if (~mask_up).any() else 0.0
        expct   = p_up*e_up + (1-p_up)*e_down

        cluster_stats[cid] = {
            'count'     : len(rets),
            'p_up'      : p_up,
            'e_up'      : e_up,
            'e_down'    : e_down,
            'expectancy': expct
        }

    print("\nCluster  Count   P(up)    E[up]    E[down]   Expectancy")
    for cid in sorted(cluster_stats):
        s = cluster_stats[cid]
        print(f"{cid:>3d}    {s['count']:>4d}   "
              f"{s['p_up']:.2f}   "
              f"{s['e_up']:.4f}   "
              f"{s['e_down']:.4f}   "
              f"{s['expectancy']:.4f}")

    # ─── 5) Select tradeable clusters & plot their patterns ────────────────
    # NOTE(review): clusters are selected with statistics from the same test
    # bars they were measured on; confirm on separate data before relying on it.
    tradeable = [cid for cid,s in cluster_stats.items() if s['expectancy'] > 0]
    print(f"\nTradeable clusters (expectancy>0): {tradeable}")

    for cid in tradeable:
        patterns = cluster_examples[cid]
        if not patterns:
            continue

        plt.figure(figsize=(6, 4))
        # overlay every historical+next pattern
        for pat in patterns:
            plt.plot(range(len(pat)), pat, alpha=0.2)

        # centroid (only the PIP‐portion)
        plt.plot(range(len(centers[cid])),
                 centers[cid],
                 color='black',
                 linewidth=2,
                 label=f"Centroid {cid}")

        # divider between PIP and next bar
        plt.axvline(len(centers[cid]) - 0.5,
                    color='gray',
                    linestyle='--',
                    alpha=0.5)

        plt.title(f"Tradeable Cluster {cid} (n={len(patterns)})")
        plt.xlabel("Index (0…PIPs-2 are PIPs, last is next bar)")
        plt.ylabel("Normalized Value")
        plt.legend()
        plt.tight_layout()
        plt.show()
