In [1]:
#!/usr/bin/env python3
"""
Given data: [(his_len, max_attempts), ...] from Stage-1 topK for ONE instance,
produce Stage-2 refinement search space (1 or 2 clusters).

Rules:
1) Scale transform: z = log(his_len) (natural log). max_attempts unchanged.
2) Multi-modality detection: sort by z, compute adjacent gaps Δ.
   Cut ONLY at the largest gap if Δ_max >= tau (default tau=0.8).
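3) For each cluster: build the his_len refinement interval using
   q10/q90 quantiles + buffer beta (default 0.2; if cluster size <=3, beta=0.3).
   The max_attempts interval is built the same way, but from ALL points
   (not per cluster) and always with the default beta, so every cluster
   shares the same max_attempts range.
   Both intervals are clipped to the coarse bounds.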

Output: a dict with clusters (label, size, points, his_len_range,
max_attempts_range) plus diagnostics (max_log_gap, cut_index, the sorted
his_len values, their logs and adjacent log gaps, and the settings used).
"""

from __future__ import annotations
from dataclasses import dataclass
from math import log
from typing import List, Tuple, Optional, Dict, Any
import numpy as np


@dataclass(frozen=True)
class IntRange:
    """Immutable pair of integer bounds, ``lo`` and ``hi``, describing a parameter range."""
    # Lower bound. `_quantile_interval_int` only builds ranges with lo <= hi;
    # the dataclass itself performs no validation.
    lo: int
    # Upper bound.
    hi: int


@dataclass(frozen=True)
class RefinementCluster:
    """Immutable record for one cluster ("mode") and its refinement ranges."""
    # Cluster name; `refine_space_from_topk` uses "mode_1" / "mode_2".
    label: str
    # Points assigned to this cluster.
    points: List[Tuple[int, int]]  # (his_len, max_attempts)
    # Refinement range for his_len.
    his_len_range: IntRange
    # Refinement range for max_attempts.
    max_attempts_range: IntRange


def _clip_int(x: int, lo: int, hi: int) -> int:
    return max(lo, min(hi, x))


def _quantile_interval_int(
    values: List[int],
    q_lo: float,
    q_hi: float,
    beta: float,
    clip_lo: int,
    clip_hi: int,
) -> IntRange:
    """
    Build a buffered, clipped integer interval from two quantiles of ``values``.

    The ``q_lo`` quantile is scaled by ``(1 - beta)`` and rounded down, the
    ``q_hi`` quantile is scaled by ``(1 + beta)`` and rounded up. Both ends are
    then clipped to ``[clip_lo, clip_hi]`` and returned in ascending order.
    """
    samples = np.array(values, dtype=float)
    lower_q = float(np.quantile(samples, q_lo, method="linear"))
    upper_q = float(np.quantile(samples, q_hi, method="linear"))

    # Widen outward by the relative buffer, rounding away from the interval.
    widened_lo = int(np.floor(lower_q * (1.0 - beta)))
    widened_hi = int(np.ceil(upper_q * (1.0 + beta)))

    ends = (
        _clip_int(widened_lo, clip_lo, clip_hi),
        _clip_int(widened_hi, clip_lo, clip_hi),
    )
    # min/max puts the ends in ascending order even if they came out crossed.
    return IntRange(min(ends), max(ends))


def refine_space_from_topk(
    data: List[Tuple[int, int]],
    *,
    tau: float = 0.8,
    q_lo: float = 0.10,
    q_hi: float = 0.90,
    beta_default: float = 0.20,
    beta_small_cluster: float = 0.30,
    coarse_his_len: Tuple[int, int] = (300, 20000),
    coarse_max_attempts: Tuple[int, int] = (1, 500),
) -> Dict[str, Any]:
    """
    Main entry: returns {"clusters": [...], "diagnostics": {...}}.

    Points are sorted by z = log(his_len). If the largest gap between adjacent
    z values is at least ``tau``, the points are split at that gap into
    "mode_1" (smaller his_len) and "mode_2"; otherwise there is one cluster,
    "mode_1".

    Each cluster gets its own his_len range, built from the cluster's
    ``q_lo``/``q_hi`` quantiles with buffer ``beta_small_cluster`` when the
    cluster has 3 points or fewer and ``beta_default`` otherwise, clipped to
    ``coarse_his_len``.

    The max_attempts range is NOT per cluster: it is built once from all
    points in ``data`` with ``beta_default``, clipped to
    ``coarse_max_attempts``, and shared by every cluster.

    Args:
        data: (his_len, max_attempts) pairs; at least two, every his_len > 0.
        tau: minimum largest log-gap required to split into two clusters.
        q_lo, q_hi: lower / upper quantile levels for the intervals.
        beta_default: relative buffer for clusters with more than 3 points
            and for the shared max_attempts range.
        beta_small_cluster: relative buffer for clusters with <= 3 points.
        coarse_his_len: (lo, hi) clipping bounds for his_len.
        coarse_max_attempts: (lo, hi) clipping bounds for max_attempts.

    Returns:
        Dict with "clusters" (one entry per cluster: label, size, points,
        his_len_range, max_attempts_range) and "diagnostics" (settings used,
        max_log_gap, cut_index or None, sorted his_len values, their logs and
        the adjacent log gaps).

    Raises:
        ValueError: if fewer than 2 points are given or any his_len is <= 0.
    """
    if len(data) < 2:
        raise ValueError("Need at least 2 points to run gap-based multi-modality detection.")

    # 1) Scale transform for his_len
    #    Store (z, his_len, max_attempts)
    transformed = []
    for (h, a) in data:
        if h <= 0:
            raise ValueError(f"his_len must be > 0 for log-transform, got {h}")
        transformed.append((log(h), h, a))

    # 2) Multi-modality detection on z=log(his_len)
    transformed.sort(key=lambda t: t[0])
    z = [t[0] for t in transformed]

    gaps = [right - left for left, right in zip(z, z[1:])]
    max_gap = max(gaps)
    # Cut between cut_index and cut_index+1 (np.argmax picks the first maximum on ties).
    cut_index: Optional[int] = int(np.argmax(gaps))

    if max_gap >= tau:
        # Cut into 2 clusters
        clusters_raw = [
            ("mode_1", transformed[: cut_index + 1]),
            ("mode_2", transformed[cut_index + 1 :]),
        ]
    else:
        cut_index = None
        clusters_raw = [("mode_1", transformed)]

    # 3a) max_attempts range: computed once over ALL points because it does
    #     not depend on the cluster, so it is hoisted out of the loop below.
    global_att_rng = _quantile_interval_int(
        [a for (_, a) in data],
        q_lo,
        q_hi,
        beta_default,
        coarse_max_attempts[0],
        coarse_max_attempts[1],
    )

    # 3b) his_len range: one per cluster, with a wider buffer for small clusters.
    clusters: List[RefinementCluster] = []
    for label, rows in clusters_raw:
        pts = [(h, a) for (_, h, a) in rows]
        beta = beta_small_cluster if len(pts) <= 3 else beta_default

        his_rng = _quantile_interval_int(
            [h for (h, _) in pts],
            q_lo,
            q_hi,
            beta,
            coarse_his_len[0],
            coarse_his_len[1],
        )

        clusters.append(
            RefinementCluster(
                label=label,
                points=pts,
                his_len_range=his_rng,
                max_attempts_range=global_att_rng,
            )
        )

    # Prepare output
    out = {
        "clusters": [
            {
                "label": c.label,
                "size": len(c.points),
                "points": c.points,
                "his_len_range": (c.his_len_range.lo, c.his_len_range.hi),
                "max_attempts_range": (c.max_attempts_range.lo, c.max_attempts_range.hi),
            }
            for c in clusters
        ],
        "diagnostics": {
            "tau": tau,
            "q_lo": q_lo,
            "q_hi": q_hi,
            "beta_default": beta_default,
            "beta_small_cluster": beta_small_cluster,
            "coarse_his_len": coarse_his_len,
            "coarse_max_attempts": coarse_max_attempts,
            "max_log_gap": float(max_gap),
            "cut_index": cut_index,  # None if no cut
            "sorted_his_len": [h for (_, h, _) in transformed],
            "sorted_log_his_len": [float(v) for v in z],
            "adjacent_log_gaps": [float(g) for g in gaps],
        },
    }
    return out


def to_irace_parameters_txt(cluster: Dict[str, Any]) -> str:
    """
    Render one cluster's ranges as two integer-parameter lines for an irace
    parameters file (his_len first, then max_attempts), each newline-terminated.

    ``cluster`` must provide "his_len_range" and "max_attempts_range", each a
    two-element (lo, hi) sequence.
    """
    his_lo, his_hi = cluster["his_len_range"]
    att_lo, att_hi = cluster["max_attempts_range"]
    his_line = f'his_len      "-his_len "      i ({his_lo}, {his_hi})\n'
    att_line = f'max_attempts "-max_attempts " i ({att_lo}, {att_hi})\n'
    return "".join((his_line, att_line))


# -----------------------
# Example usage
# -----------------------
if __name__ == "__main__":
    # Demo input: ten Stage-1 top-K points given as (his_len, max_attempts)
    data = [
        (3258, 19),
        (17868, 170),
        (827, 87),
        (1191, 77),
        (18065, 186),
        (14482, 69),
        (1480, 107),
        (10788, 225),
        (12020, 256),
        (3188, 18),
    ]

    result = refine_space_from_topk(data, tau=0.8)
    diagnostics = result["diagnostics"]

    # Diagnostics section, followed by one blank line.
    report_lines = [
        "Diagnostics:",
        f"  max_log_gap = {diagnostics['max_log_gap']:.3f}",
        f"  cut_index   = {diagnostics['cut_index']}",
        f"  sorted_his_len = {diagnostics['sorted_his_len']}",
        f"  adjacent_log_gaps = {[round(x,3) for x in diagnostics['adjacent_log_gaps']]}",
        "",
    ]
    print("\n".join(report_lines))

    # One section per cluster, ending with its parameters.txt snippet.
    print("Refinement spaces:")
    for c in result["clusters"]:
        print(f"- {c['label']} (n={c['size']}):")
        print(f"  his_len      in {c['his_len_range']}")
        print(f"  max_attempts in {c['max_attempts_range']}")
        print("  parameters.txt snippet:")
        print(to_irace_parameters_txt(c))


Diagnostics:
  max_log_gap = 1.197
  cut_index   = 4
  sorted_his_len = [827, 1191, 1480, 3188, 3258, 10788, 12020, 14482, 17868, 18065]
  adjacent_log_gaps = [0.365, 0.217, 0.767, 0.022, 1.197, 0.108, 0.186, 0.21, 0.011]

Refinement spaces:
- mode_1 (n=5):
  his_len      in (778, 3876)
  max_attempts in (15, 274)
  parameters.txt snippet:
his_len      "-his_len "      i (778, 3876)
max_attempts "-max_attempts " i (15, 274)

- mode_2 (n=5):
  his_len      in (9024, 20000)
  max_attempts in (15, 274)
  parameters.txt snippet:
his_len      "-his_len "      i (9024, 20000)
max_attempts "-max_attempts " i (15, 274)



In [None]:
91    10238          416
110   10209          440
56    10483          466
115   10429          456
112    9543          463
92    10038          459
101   10108          462
75     9949          455
58    10963          449
108   10713          466

In [3]:

# Second example: top-10 (his_len, max_attempts) points with no wide log-gap.
data = [
    (2155, 2),
    (3238, 1),
    (3347, 1),
    (3263, 1),
    (3438, 2),
    (3243, 1),
    (3448, 2),
    (3246, 1),
    (3068, 2),
    (3368, 2),
]

result = refine_space_from_topk(data, tau=0.8)
diagnostics = result["diagnostics"]
rounded_gaps = [round(x, 3) for x in diagnostics["adjacent_log_gaps"]]

# Diagnostics section, followed by one blank line.
print("Diagnostics:")
print(f"  max_log_gap = {diagnostics['max_log_gap']:.3f}")
print(f"  cut_index   = {diagnostics['cut_index']}")
print(f"  sorted_his_len = {diagnostics['sorted_his_len']}")
print(f"  adjacent_log_gaps = {rounded_gaps}")
print()

# One section per cluster, ending with its parameters.txt snippet.
print("Refinement spaces:")
for c in result["clusters"]:
    header = f"- {c['label']} (n={c['size']}):"
    his_line = f"  his_len      in {c['his_len_range']}"
    att_line = f"  max_attempts in {c['max_attempts_range']}"
    print(header)
    print(his_line)
    print(att_line)
    print("  parameters.txt snippet:")
    print(to_irace_parameters_txt(c))

Diagnostics:
  max_log_gap = 0.353
  cut_index   = None
  sorted_his_len = [2155, 3068, 3238, 3243, 3246, 3263, 3347, 3368, 3438, 3448]
  adjacent_log_gaps = [0.353, 0.054, 0.002, 0.001, 0.005, 0.025, 0.006, 0.021, 0.003]

Refinement spaces:
- mode_1 (n=10):
  his_len      in (2381, 4127)
  max_attempts in (1, 3)
  parameters.txt snippet:
his_len      "-his_len "      i (2381, 4127)
max_attempts "-max_attempts " i (1, 3)

