# Quality-Weight Bin Health Check

Validates `quality_weight` bounds, compares **old 3-bin** vs **new 6-bin** distributions,
cross-tabulates bins with confidence tiers, and sanity-checks report aggregation.

**Test case:** `t08_loop_heavy` across O0 / O1 / O2 / O3 (stripped variant).

In [12]:
import json
from pathlib import Path
from collections import Counter

import pandas as pd

# ── Load all joined_functions.jsonl for TEST_CASE ──
# NOTE(review): the notebook header and the previous version of this comment
# name `t08_loop_heavy`, but TEST_CASE below selects `t07_switch_parser` —
# confirm which test case is intended before trusting the outputs.
ARTIFACTS = Path(r"C:\Users\nico_\Documents\UNI\Thesis\Source\reforge\docker\local-files\artifacts\synthetic")
TEST_CASE = "t07_switch_parser"
OPTS = ["O0", "O1", "O2", "O3"]
VARIANT = "stripped"

rows = []
for opt in OPTS:
    jsonl_path = ARTIFACTS / TEST_CASE / opt / VARIANT / "join_oracles_ghidra" / "joined_functions.jsonl"
    if not jsonl_path.exists():
        print(f"MISSING: {jsonl_path}")
        continue
    # JSON text is UTF-8; be explicit so the platform default codec
    # (e.g. cp1252 on Windows) is never used to decode it.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines in the JSONL
            row = json.loads(line)
            # Record the directory-derived opt level next to the row's own
            # "opt" field (the one used for grouping below).
            row["_opt"] = opt  # belt-and-suspenders
            rows.append(row)

df = pd.DataFrame(rows)

# When nothing was loaded the frame has no columns at all, so guard the
# "opt" lookups instead of failing with a KeyError.
has_opt = "opt" in df.columns
n_opt_levels = df["opt"].nunique() if has_opt else 0
print(f"Loaded {len(df)} rows across {n_opt_levels} opt levels")

# Per-opt row counts; left as the final expression so the notebook displays it.
opt_counts = df.groupby("opt").size() if has_opt else pd.Series(dtype="int64")
opt_counts

Loaded 96 rows across 4 opt levels


opt
O0    22
O1    24
O2    25
O3    25
dtype: int64

## 1. Validate raw metric bounds

Check that every `quality_weight` is missing (`None`/NaN) or within `[0.0, 1.0]`, and list any offenders.  Also report rows whose `align_overlap_ratio` exceeds `1.0`.

In [13]:
# Bounds validation: a present quality_weight must lie inside [0, 1].
qw = df["quality_weight"]
outside_unit_interval = (qw < 0.0) | (qw > 1.0)
offenders = df[qw.notna() & outside_unit_interval].copy()
n_offenders = len(offenders)

qw_min = qw.min()
qw_max = qw.max()
print(f"quality_weight range: [{qw_min:.6f}, {qw_max:.6f}]")
print(f"Offenders (outside [0,1]): {n_offenders}")

if n_offenders == 0:
    print("✓ All quality_weight values in [0, 1]")
else:
    # Show the columns most useful for tracing an offender back upstream.
    offender_cols = [
        "test_case", "opt", "dwarf_function_id", "quality_weight",
        "align_overlap_ratio", "align_n_candidates", "align_verdict",
    ]
    print("\n  TOP 20 OFFENDERS — upstream bug still present:")
    display(offenders[offender_cols].head(20))

# Same idea for align_overlap_ratio: only the upper bound is checked here.
overlap = df["align_overlap_ratio"]
overlap_offenders = df[overlap.notna() & (overlap > 1.0)]
n_overlap_offenders = len(overlap_offenders)
print(f"\nalign_overlap_ratio > 1.0: {n_overlap_offenders}")
if n_overlap_offenders > 0:
    worst_overlap = overlap_offenders["align_overlap_ratio"].max()
    print(f"  max = {worst_overlap:.6f}")

quality_weight range: [0.000000, 1.000000]
Offenders (outside [0,1]): 0
✓ All quality_weight values in [0, 1]

align_overlap_ratio > 1.0: 0


## 2. Old vs New bin distributions

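Compare the old 3-bin scheme with the new 6-bin (simple) scheme and with its detailed variant, which splits the `none` bin by reason (`none_not_match`, `none_no_range`).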

In [14]:
import sys
sys.path.insert(0, str(Path(r"C:\Users\nico_\Documents\UNI\Thesis\Source\reforge")))

from data.binning import (
    quality_weight_bin,
    quality_weight_bin_detailed,
    overlap_ratio_bin,
    QUALITY_WEIGHT_BIN_ORDER,
    QUALITY_WEIGHT_BIN_DETAILED_ORDER,
)


# Old 3-bin scheme, kept only so it can be compared with the new bins.
def old_quality_weight_bin(qw: float) -> str:
    """Return the legacy 3-bin label for *qw*.

    Lower bounds are tested from highest to lowest; any value that meets
    neither bound (including NaN, for which every comparison is false)
    falls through to the lowest bin.
    """
    for lower_bound, label in ((0.8, "[0.8,1.0]"), (0.5, "[0.5,0.8)")):
        if qw >= lower_bound:
            return label
    return "[0,0.5)"


def _qw_if_match(r):
    """Return the row's quality_weight for MATCH rows, otherwise None."""
    if r["align_verdict"] == "MATCH":
        return r["quality_weight"]
    return None


def _simple_bin(r):
    """New simple bin label for one row (non-MATCH rows are passed as None)."""
    return quality_weight_bin(_qw_if_match(r))


def _detailed_bin(r):
    """New detailed bin label for one row, using range/verdict context."""
    return quality_weight_bin_detailed(
        _qw_if_match(r),
        has_range=r.get("exclusion_reason") != "NO_RANGE",
        align_verdict=r.get("align_verdict"),
    )


def _counts_by_opt(bin_col, order=None):
    """Row counts per (opt, bin) as an opt × bin table.

    When *order* is given the columns are reindexed to it, with absent
    bins filled with 0.
    """
    table = df.groupby(["opt", bin_col]).size().unstack(fill_value=0)
    if order is not None:
        table = table.reindex(columns=order, fill_value=0)
    return table


# Old scheme: every row is binned straight from its raw quality_weight.
df["old_bin"] = df["quality_weight"].apply(old_quality_weight_bin)

# New schemes: the weight is only passed through for MATCH rows.
df["new_bin"] = df.apply(_simple_bin, axis=1)
df["new_bin_detailed"] = df.apply(_detailed_bin, axis=1)

print("=== OLD 3-bin distribution ===")
print(_counts_by_opt("old_bin"))

print("\n=== NEW 6-bin (simple) distribution ===")
print(_counts_by_opt("new_bin", QUALITY_WEIGHT_BIN_ORDER))

print("\n=== NEW detailed distribution ===")
print(_counts_by_opt("new_bin_detailed", QUALITY_WEIGHT_BIN_DETAILED_ORDER))

=== OLD 3-bin distribution ===
old_bin  [0,0.5)  [0.8,1.0]
opt                        
O0            10         12
O1            12         12
O2            19          6
O3            19          6

=== NEW 6-bin (simple) distribution ===
new_bin  ==1.0  [0.95,1.0)  [0.8,0.95)  [0.5,0.8)  [0,0.5)  none
opt                                                             
O0          12           0           0          0        0    10
O1          12           0           0          0        0    12
O2           6           0           0          0        3    16
O3           6           0           0          0        2    17

=== NEW detailed distribution ===
new_bin_detailed  ==1.0  [0.95,1.0)  [0.8,0.95)  [0.5,0.8)  [0,0.5)  \
opt                                                                   
O0                   12           0           0          0        0   
O1                   12           0           0          0        0   
O2                    6           0           0    

## 3. Bin → Confidence tier cross-tabulation

For each detailed bin, compute the row count (`n`) and the fraction of rows where:
- `align_verdict == MATCH`
- `align_n_candidates == 1`
- `confidence_tier == GOLD`
- `is_high_confidence == True`

In [15]:
# Cross-tabulation: bin → confidence signals
def bin_stats(group):
    """Summarise one bin's rows as a count plus four confidence fractions.

    Returns a Series with ``n`` (row count) followed by the share of rows
    that are MATCH, have exactly one candidate, are GOLD tier, and are
    flagged high-confidence.
    """
    n = len(group)
    denom = max(n, 1)  # keeps an empty group from dividing by zero
    hits = {
        "frac_match": (group["align_verdict"] == "MATCH").sum(),
        "frac_unique": (group["align_n_candidates"] == 1).sum(),
        "frac_gold": (group["confidence_tier"] == "GOLD").sum(),
        "frac_hc": group["is_high_confidence"].sum(),
    }
    stats = {"n": n}
    for name, count in hits.items():
        stats[name] = count / denom
    return pd.Series(stats)

# One stats row per detailed bin, laid out in the canonical bin order;
# bins with no rows appear as NaN after the reindex and are zero-filled below.
per_bin = df.groupby("new_bin_detailed").apply(bin_stats)
cross = per_bin.reindex(QUALITY_WEIGHT_BIN_DETAILED_ORDER)
cross["n"] = cross["n"].fillna(0).astype(int)

# Render every fraction column as a one-decimal percentage string.
as_percent = "{:.1%}".format
frac_cols = ["frac_match", "frac_unique", "frac_gold", "frac_hc"]
cross = cross.assign(
    **{col: cross[col].fillna(0).map(as_percent) for col in frac_cols}
)

print("Bin → confidence tier consistency:")
display(cross)

Bin → confidence tier consistency:


new_bin_detailed,n,frac_match,frac_unique,frac_gold,frac_hc
==1.0,36,100.0%,100.0%,94.4%,94.4%
"[0.95,1.0)",0,0.0%,0.0%,0.0%,0.0%
"[0.8,0.95)",0,0.0%,0.0%,0.0%,0.0%
"[0.5,0.8)",0,0.0%,0.0%,0.0%,0.0%
"[0,0.5)",5,100.0%,0.0%,0.0%,0.0%
none_not_match,4,0.0%,0.0%,0.0%,0.0%
none_no_range,51,0.0%,0.0%,0.0%,0.0%


## 4. Sanity-check: recomputed bins vs join_report.json

Recompute `yield_by_quality_weight_bin` from the JSONL and compare it to the
value stored in `join_report.json`.  Note: the reports were generated with the
**old** 3-bin scheme, so we compare against that.  Also check that the stored
bin counts sum to `yield_counts.n_dwarf_funcs`.

In [5]:
# Compare recomputed old-bins to stored report (old scheme).
# For each opt level: (1) the per-bin counts recomputed from the JSONL rows
# must equal the report's `yield_by_quality_weight_bin`, and (2) the stored
# bin counts must sum to `yield_counts.n_dwarf_funcs`.
for opt in OPTS:
    report_path = ARTIFACTS / TEST_CASE / opt / VARIANT / "join_oracles_ghidra" / "join_report.json"
    if not report_path.exists():
        print(f"MISSING report: {report_path}")
        continue

    # JSON text is UTF-8; do not rely on the platform default codec.
    with open(report_path, encoding="utf-8") as f:
        report = json.load(f)

    stored_bins = report.get("yield_by_quality_weight_bin", {})
    n_dwarf = report["yield_counts"]["n_dwarf_funcs"]

    # Recompute from JSONL using old scheme.  Counts are converted to plain
    # ints so the printed dict reads {'[0,0.5)': 13} rather than
    # {'[0,0.5)': np.int64(13)}; equality with the report is unaffected.
    opt_df = df[df["opt"] == opt]
    recomputed = {
        label: int(count)
        for label, count in opt_df["old_bin"].value_counts().items()
    }

    match = recomputed == stored_bins
    symbol = "✓" if match else "✗"

    bin_sum = sum(stored_bins.values())
    partition_ok = bin_sum == n_dwarf
    p_symbol = "✓" if partition_ok else "✗"

    print(
        f"{opt}: report={stored_bins}  recomputed={recomputed}  "
        f"match={symbol}  sum={bin_sum}  n_dwarf={n_dwarf}  partition={p_symbol}"
    )

O0: report={'[0,0.5)': 13, '[0.8,1.0]': 16}  recomputed={'[0.8,1.0]': np.int64(16), '[0,0.5)': np.int64(13)}  match=✓  sum=29  n_dwarf=29  partition=✓
O1: report={'[0,0.5)': 15, '[0.8,1.0]': 16}  recomputed={'[0.8,1.0]': np.int64(16), '[0,0.5)': np.int64(15)}  match=✓  sum=31  n_dwarf=31  partition=✓
O2: report={'[0,0.5)': 19, '[0.8,1.0]': 14}  recomputed={'[0,0.5)': np.int64(19), '[0.8,1.0]': np.int64(14)}  match=✓  sum=33  n_dwarf=33  partition=✓
O3: report={'[0,0.5)': 19, '[0.8,1.0]': 14}  recomputed={'[0,0.5)': np.int64(19), '[0.8,1.0]': np.int64(14)}  match=✓  sum=33  n_dwarf=33  partition=✓


## 5. Overlap ratio distribution (new)

Show `align_overlap_ratio` binned with the same thresholds (non-MATCH rows are
passed as `None`), then cross-tabulate those bins against `align_n_candidates`
for MATCH rows — to catch cases where "qw looks fine" masks
"overlap is high but candidates=3".

In [6]:
from data.binning import OVERLAP_RATIO_BIN_ORDER

def _overlap_if_match(r):
    """Return the row's align_overlap_ratio for MATCH rows, otherwise None."""
    if r["align_verdict"] == "MATCH":
        return r["align_overlap_ratio"]
    return None


# Bin the overlap ratio row by row; non-MATCH rows are passed as None.
df["overlap_bin"] = df.apply(
    lambda r: overlap_ratio_bin(_overlap_if_match(r)),
    axis=1,
)

print("=== align_overlap_ratio bin distribution ===")
overlap_by_opt = df.groupby(["opt", "overlap_bin"]).size().unstack(fill_value=0)
print(overlap_by_opt.reindex(columns=OVERLAP_RATIO_BIN_ORDER, fill_value=0))

# Cross: overlap_bin × n_candidates for MATCH rows
is_match = df["align_verdict"] == "MATCH"
match_df = df[is_match].copy()
if len(match_df) > 0:
    print("\n=== overlap_bin × n_candidates (MATCH rows only) ===")
    overlap_vs_candidates = pd.crosstab(
        match_df["overlap_bin"], match_df["align_n_candidates"]
    )
    print(overlap_vs_candidates)

=== align_overlap_ratio bin distribution ===
overlap_bin  ==1.0  [0.95,1.0)  [0.8,0.95)  [0.5,0.8)  [0,0.5)  none
opt                                                                 
O0              16           0           0          0        0    13
O1              16           0           0          0        0    15
O2              14           0           0          1        0    18
O3              14           0           1          1        0    17

=== overlap_bin × n_candidates (MATCH rows only) ===
align_n_candidates  1.0  2.0
overlap_bin                 
==1.0                60    0
[0.5,0.8)             0    2
[0.8,0.95)            0    1
