In [1]:
import re
import json
import os
import time

import Load_Data
import LLM_setting
import Chunking
import itertools
import pandas as pd
import Data_Processing as dp
from Data_Processing import merge_topics_frequency_based as m

from dotenv import load_dotenv, find_dotenv
from dataclasses import dataclass, asdict
from collections import Counter, defaultdict, deque
from typing import List, Optional, Dict, Any , Sequence, Union
from pathlib import Path
from datetime import datetime, timezone
from difflib import SequenceMatcher
from langchain_groq import ChatGroq
from langchain_community.document_loaders import BSHTMLLoader

In [2]:
# Load variables from the .env file located by find_dotenv(); the return value
# is assigned to `_` so the cell displays nothing.
_ = load_dotenv(find_dotenv())

In [3]:
# LLM settings: model identifier and sampling temperature handed to the
# project-local LLM_setting module.
model = "openai/gpt-oss-120b"
temperature = 0
llm_setting = LLM_setting.LLMSetting(MODEL=model, TEMPERATURE=temperature)
# `llm` is the client passed to the ablation driver cell further down.
llm = LLM_setting.setting(llm_setting)

In [4]:
# Read the raw filing through the project-local Load_Data module.
# NOTE(review): `docs` is not referenced by any later cell shown here; the
# chunking cell is given `file_path` rather than `docs` — confirm it is needed.
file_path = Path("data/raw_filings/320193_000032019325000079_aapl-20250927.htm")
cfg = Load_Data.LoadData(file_path=file_path)
docs = Load_Data.load_data(cfg)

In [5]:
# Split the filing into overlapping chunks via the project-local Chunking module.
# NOTE(review): the unit of chunk_size / chunk_overlap (characters vs tokens) is
# decided inside Chunking and is not visible here — confirm.
chunk_size= 2500
chunk_overlap = 250
data_processing=Chunking.DataProcessing(file_path,chunk_size,chunk_overlap)
# `doc_chunk` is the chunk list passed as `doc_chunks` to the ablation driver.
doc_chunk =Chunking.chunks(data_processing)

In [6]:
# Exact number of topics each chunk response must contain to be accepted.
TOPICS_PER_CHUNK = 8
# Exact number of keywords each topic must carry; also caps keywords per merged topic.
KEYWORDS_PER_TOPIC = 8
# Number of most frequently supported labels kept after merging across chunks.
FINAL_K_TOPICS = 10

In [7]:
# Few-shot exemplar: a short input passage and the JSON answer expected for it.
# Both globals are read by make_default_factorised_variant() when exemplars are
# enabled, so they must be defined before that function is called.
input_text = (
    "We experienced strong net sales growth driven by iPhone and Services. "
    "Supply constraints eased. Foreign exchange impacted revenue. "
    "We repurchased shares and increased the dividend."
)
# Eight topics with eight keywords each, i.e. the same shape the rubric requests
# when TOPICS_PER_CHUNK and KEYWORDS_PER_TOPIC are both 8.
output_json = {
    "topics": [
        {"label": "Product Revenue Drivers",
         "keywords": ["iPhone", "Services", "net sales", "growth", "revenue", "demand", "mix", "products"]},
        {"label": "Supply Chain Conditions",
         "keywords": ["supply", "constraints", "inventory", "availability", "production", "lead times", "logistics",
                      "capacity"]},
        {"label": "Foreign Exchange Effects",
         "keywords": ["foreign exchange", "currency", "FX", "headwind", "translation", "rates", "impact", "revenue"]},
        {"label": "Capital Return Program",
         "keywords": ["share repurchase", "buyback", "dividend", "capital return", "shareholders", "authorization",
                      "cash", "stock"]},
        {"label": "Operating Performance",
         "keywords": ["margin", "profitability", "operating income", "expenses", "costs", "efficiency", "performance",
                      "results"]},
        {"label": "Market Conditions",
         "keywords": ["macroeconomic", "consumer", "competition", "pricing", "market", "uncertainty", "trends",
                      "demand"]},
        {"label": "Guidance and Outlook",
         "keywords": ["outlook", "expectations", "guidance", "future", "forecast", "assumptions", "trend", "risks"]},
        {"label": "Shareholder Value",
         "keywords": ["value", "returns", "EPS", "share count", "capital allocation", "liquidity", "cash flow",
                      "investment"]},
    ]
}

In [8]:
# Directory that receives one JSON file per ablation run plus the manifest and
# the done-variants checkpoint; created up front so later cells can list it.
OUTPUT_DIR = Path("outputs/ablation_runs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
# Greedy match from the first "{" to the last "}" (DOTALL lets it span lines);
# used as a fallback to pull a JSON object out of a reply with surrounding text.
_JSON_OBJ_RE = re.compile(r"\{.*\}", flags=re.DOTALL)

In [10]:
@dataclass(frozen=True)
class Exemplar:
    """One few-shot example rendered into the prompt's "Exemplars:" section."""

    # Example passage shown under "Input:".
    input_text: str
    # Expected answer; serialised with json.dumps when the prompt is built.
    output_json: Dict[str, Any]

In [11]:
@dataclass(frozen=True)
class Delimiter:
    """Opening and closing markers wrapped around the chunk text in a prompt."""

    # Marker emitted on the line before the chunk text.
    open: str = "<TEXT>"
    # Marker emitted on the line after the chunk text.
    close: str = "</TEXT>"

In [12]:
@dataclass(frozen=True)
class PromptComponents:
    """The independently switchable parts from which a prompt is assembled."""

    # Persona sentence; a falsy value leaves the role out of the prompt.
    role: Optional[str]
    # Task instruction; always included.
    instruction: str
    # None means "fill in default_rubric() at extraction time"; an empty string
    # means the rubric is deliberately omitted (ablated).
    rubric: Optional[str]
    # Few-shot examples; an empty sequence omits the exemplar section.
    exemplars: Sequence[Exemplar]
    # Markers placed around the chunk text.
    delimiter: Delimiter

In [13]:

@dataclass(frozen=True)
class PromptVariant:
    """A named prompt configuration."""

    # Identifier used in output file names and in the done-variants checkpoint.
    variant_id: str
    # The prompt parts this variant enables.
    components: PromptComponents
def default_instruction() -> str:
    """Return the default task instruction sentence."""
    instruction = "Identify and label the main topics in this corpus."
    return instruction

def default_role_financial_analyst() -> str:
    """Return the default persona sentence used as the prompt's role."""
    persona = "You are a financial analyst."
    return persona

def default_rubric(topics_per_chunk: int, keywords_per_topic: int) -> str:
    """Build the rubric text: quality rules followed by output constraints.

    Args:
        topics_per_chunk: Number of topics the model is told to return.
        keywords_per_topic: Number of keywords the model is told to give per topic.

    Returns:
        A newline-terminated string with a "Rubric:" section, a blank line, and
        an "Output constraints:" section ending with the JSON schema line.
    """
    quality_rules = [
        "Rubric:",
        "- Topics should be coherent, specific, and meaningful for financial reporting.",
        "- Topic labels should be short noun phrases (avoid generic labels like 'Other').",
        "- Topics should be mutually distinct (minimise keyword overlap).",
        "- Prefer substantive business/financial themes over boilerplate artifacts.",
        "- Avoid pure dates, identifiers, XBRL-like tags, or URL-like tokens as keywords.",
    ]
    output_rules = [
        "Output constraints:",
        "Use ONLY the text inside the delimiter.",
        f"Return exactly {topics_per_chunk} topics.",
        f"Each topic must have exactly {keywords_per_topic} keywords.",
        "Output MUST be valid JSON only (no Markdown, no extra text).",
        'Schema: {"topics":[{"label":str,"keywords":[str,...]}]}',
    ]
    # The two sections are separated by one blank line; the text ends with "\n".
    return "\n".join(quality_rules) + "\n\n" + "\n".join(output_rules) + "\n"

def default_exemplars(input_text:Any,output_json:Dict) -> List[Exemplar]:
    """Wrap one input/output pair in a single-element list of Exemplar."""
    exemplar = Exemplar(input_text, output_json)
    return [exemplar]
def build_factorised_prompt(
    variant: PromptVariant,
    chunk_text: str,
    topics_per_chunk: int,
    keywords_per_topic: int,
) -> str:
    """Assemble the prompt text for one chunk from the variant's components.

    Sections are emitted in the order role, instruction, rubric, exemplars,
    delimited chunk text, and joined with blank lines. Role, rubric and
    exemplars are skipped when falsy.

    Args:
        variant: Prompt variant whose ``components`` are rendered.
        chunk_text: Text placed between the delimiter markers.
        topics_per_chunk: Accepted but not read by this function.
        keywords_per_topic: Accepted but not read by this function.

    Returns:
        The prompt string.
    """
    comps = variant.components
    sections: List[str] = []

    if comps.role:
        sections.append(comps.role.strip())

    sections.append(comps.instruction.strip())

    if comps.rubric:
        sections.append(comps.rubric.strip())

    if comps.exemplars:
        block: List[str] = ["Exemplars:"]
        for number, exemplar in enumerate(comps.exemplars, start=1):
            serialised = json.dumps(exemplar.output_json, ensure_ascii=False)
            block.extend(
                [
                    f"[Example {number}]",
                    "Input:",
                    exemplar.input_text.strip(),
                    "Output (JSON):",
                    serialised,
                    "",  # spacer between examples; the trailing one is stripped below
                ]
            )
        sections.append("\n".join(block).strip())

    marks = comps.delimiter
    sections.append(f"{marks.open}\n{chunk_text}\n{marks.close}")

    # Drop sections that are empty or whitespace-only before joining.
    return "\n\n".join(section for section in sections if section and section.strip())

In [14]:
def _as_text(obj: Any) -> str:
    if hasattr(obj, "page_content"):
        return obj.page_content
    if hasattr(obj, "chunk"):
        ch = getattr(obj, "chunk", None)
        if hasattr(ch, "page_content"):
            return ch.page_content
        return str(ch)
    return str(obj)

In [15]:
def _parse_json_loose(s: str) -> Dict[str, Any]:
    s = (s or "").strip()
    if not s:
        return {}
    try:
        return json.loads(s)
    except Exception:
        pass
    m = _JSON_OBJ_RE.search(s)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return {}
    return {}

In [16]:
def extract_topics_for_chunk_factorised(
    llm,
    variant: PromptVariant,
    chunk: Any,
    topics_per_chunk: int,
    keywords_per_topic: int,
) -> List[Dict[str, Any]]:
    """Ask the LLM for the topics of one chunk and validate the reply strictly.

    Args:
        llm: Client passed through to ``invoke_with_backoff``.
        variant: Prompt variant to render. A rubric of ``None`` is replaced by
            ``default_rubric(...)``; an empty-string rubric is left as is, so
            the prompt is built without a rubric.
        chunk: Chunk-like object; its text is obtained with ``_as_text``.
        topics_per_chunk: Required number of valid topics in the reply.
        keywords_per_topic: Required number of non-empty keywords per topic.

    Returns:
        A list of ``{"label": str, "keywords": [str, ...]}`` dicts, or ``[]``
        when the reply cannot be parsed into a dict, ``"topics"`` is not a
        list, or the number of valid topics differs from ``topics_per_chunk``.
        Topics whose keyword count differs from ``keywords_per_topic`` are
        dropped before that final count check.

    Side effects:
        Reads the module-level ``TOKEN_TRACKER`` and ``MIN_SLEEP_SECONDS``:
        throttles and sleeps before the call and records token usage after it.
    """
    text = _as_text(chunk)

    c = variant.components
    if c.rubric is None:
        # Variants are frozen, so build a copy carrying the default rubric.
        variant = PromptVariant(
            variant_id=variant.variant_id,
            components=PromptComponents(
                role=c.role,
                instruction=c.instruction,
                rubric=default_rubric(topics_per_chunk, keywords_per_topic),
                exemplars=c.exemplars,
                delimiter=c.delimiter,
            ),
        )

    prompt = build_factorised_prompt(
        variant=variant,
        chunk_text=text,
        topics_per_chunk=topics_per_chunk,
        keywords_per_topic=keywords_per_topic,
    )

    TOKEN_TRACKER.throttle_if_needed()
    if MIN_SLEEP_SECONDS:
        time.sleep(MIN_SLEEP_SECONDS)

    resp = invoke_with_backoff(llm, prompt)
    content = getattr(resp, "content", str(resp))

    _ = TOKEN_TRACKER.record(prompt=prompt, completion=content, resp=resp)

    data = _parse_json_loose(content)
    # A reply that parses to a top-level array or scalar has no "topics" key;
    # count it as a failed chunk instead of raising AttributeError on .get.
    if not isinstance(data, dict):
        return []

    topics = data.get("topics", [])
    if not isinstance(topics, list):
        return []

    cleaned: List[Dict[str, Any]] = []
    for t in topics:
        if not isinstance(t, dict):
            continue
        label = str(t.get("label", "")).strip()
        kws = t.get("keywords", [])
        if not label or not isinstance(kws, list):
            continue
        keywords = [str(k).strip() for k in kws if str(k).strip()]

        if len(keywords) != keywords_per_topic:
            continue
        cleaned.append({"label": label, "keywords": keywords})

    # All-or-nothing: a chunk only counts when every requested topic is valid.
    if len(cleaned) != topics_per_chunk:
        return []

    return cleaned


In [17]:
def _norm_label(label: str) -> str:
    label = label.strip().lower()
    label = re.sub(r"\s+", " ", label)
    label = re.sub(r"[^\w\s\-&/]", "", label)
    return label.strip()

In [18]:
def merge_topics_frequency_based(
    per_chunk_topics: List[List[Dict[str, Any]]],
    final_k_topics: int,
    keywords_per_topic: int,
) -> List[Dict[str, Any]]:
    """Merge per-chunk topics by normalised label and keep the most frequent.

    Args:
        per_chunk_topics: One list of topic dicts per chunk.
        final_k_topics: Number of labels to keep; at least one is kept.
        keywords_per_topic: Maximum number of keywords per merged topic.

    Returns:
        Dicts with ``label`` (first spelling seen for the normalised label),
        ``keywords`` (most frequent first) and ``support_chunks`` (number of
        topic entries sharing the label), ordered by descending support and
        then by normalised label.
    """
    support_by_label = Counter()
    display_label: Dict[str, str] = {}
    keyword_tally: Dict[str, Counter] = defaultdict(Counter)

    every_topic = (topic for chunk_topics in per_chunk_topics for topic in chunk_topics)
    for topic in every_topic:
        raw_label = str(topic.get("label", "")).strip()
        key = _norm_label(raw_label) if raw_label else ""
        if not key:
            continue

        support_by_label[key] += 1
        if key not in display_label:
            display_label[key] = raw_label

        stripped = (str(kw).strip() for kw in (topic.get("keywords", []) or []))
        keyword_tally[key].update(kw for kw in stripped if kw)

    ranked = sorted(support_by_label.items(), key=lambda item: (-item[1], item[0]))
    limit = max(1, final_k_topics)

    return [
        {
            "label": display_label.get(key, key),
            "keywords": [kw for kw, _count in keyword_tally[key].most_common(keywords_per_topic)],
            "support_chunks": int(count),
        }
        for key, count in ranked[:limit]
    ]

In [19]:
def run_prompt_space_factorisation(
    *,
    llm,
    doc_chunks: List[Any],
    variant: PromptVariant,
    topics_per_chunk: int,
    keywords_per_topic: int,
    final_k_topics: int,
) -> Dict[str, Any]:
    """Run one prompt variant over every chunk and merge the extracted topics.

    Returns:
        A dict describing the run: stage name, variant id, chunk count, the
        three size parameters, and the merged topics under ``"topics"``.
    """
    per_chunk_topics: List[List[Dict[str, Any]]] = [
        extract_topics_for_chunk_factorised(
            llm=llm,
            variant=variant,
            chunk=chunk,
            topics_per_chunk=topics_per_chunk,
            keywords_per_topic=keywords_per_topic,
        )
        for chunk in doc_chunks
    ]

    summary: Dict[str, Any] = {
        "stage": "prompt_space_factorisation",
        "variant_id": variant.variant_id,
        "num_chunks": len(doc_chunks),
        "topics_per_chunk": topics_per_chunk,
        "keywords_per_topic": keywords_per_topic,
        "final_k_topics": final_k_topics,
    }
    summary["topics"] = merge_topics_frequency_based(
        per_chunk_topics=per_chunk_topics,
        final_k_topics=final_k_topics,
        keywords_per_topic=keywords_per_topic,
    )
    return summary

In [20]:
def make_default_factorised_variant(
    *,
    include_exemplars: bool = True,
    delimiter_open: str = "<TEXT>",
    delimiter_close: str = "</TEXT>",
) -> PromptVariant:
    """Build the baseline variant with id "factorised_default".

    The rubric is left as ``None``. When ``include_exemplars`` is true the
    exemplar is built from the notebook-level globals ``input_text`` and
    ``output_json``, which must already be defined.
    """
    if include_exemplars:
        shots = default_exemplars(input_text,output_json)
    else:
        shots = []

    return PromptVariant(
        variant_id="factorised_default",
        components=PromptComponents(
            role=default_role_financial_analyst(),
            instruction=default_instruction(),
            rubric=None,
            exemplars=shots,
            delimiter=Delimiter(open=delimiter_open, close=delimiter_close),
        ),
    )

In [21]:
def make_ablation_variants(
    base_variant: PromptVariant,
    *,
    include_role: bool = True,
    include_rubric: bool = True,
    include_exemplars: bool = True,
    include_delimiter: bool = True,
) -> PromptVariant:
    """Derive one ablation variant by switching components of the base off.

    A disabled role becomes ``None``, a disabled rubric becomes ``""``,
    disabled exemplars become ``[]`` and a disabled delimiter becomes a
    ``Delimiter`` with empty markers. Enabled components are copied from the
    base unchanged. The instruction is always kept.

    Returns:
        A variant whose id is "abl_full" when nothing is disabled, otherwise
        "abl_minus_" followed by the disabled component names joined by "_".
    """
    base = base_variant.components

    switches = (
        ("role", include_role),
        ("rubric", include_rubric),
        ("exemplars", include_exemplars),
        ("delimiter", include_delimiter),
    )
    disabled = [name for name, enabled in switches if not enabled]
    if disabled:
        variant_id = "abl_minus_" + "_".join(disabled)
    else:
        variant_id = "abl_full"

    components = PromptComponents(
        role=base.role if include_role else None,
        instruction=base.instruction,
        rubric=base.rubric if include_rubric else "",
        exemplars=base.exemplars if include_exemplars else [],
        delimiter=base.delimiter if include_delimiter else Delimiter(open="", close=""),
    )
    return PromptVariant(variant_id=variant_id, components=components)


def generate_ablation_variants_from_base(base_variant: PromptVariant) -> List[PromptVariant]:
    """Return all 16 on/off combinations of role, rubric, exemplars, delimiter.

    Combinations are enumerated as a 4-bit mask from 0 to 15 (bit 0 = role,
    bit 1 = rubric, bit 2 = exemplars, bit 3 = delimiter), so the first
    variant has everything off and the last has everything on.
    """
    return [
        make_ablation_variants(
            base_variant,
            include_role=(mask & 1) != 0,
            include_rubric=(mask & 2) != 0,
            include_exemplars=(mask & 4) != 0,
            include_delimiter=(mask & 8) != 0,
        )
        for mask in range(16)
    ]


In [22]:
def run_ablation_extraction_only(
    *,
    llm,
    doc_chunks: List[Any],
    base_variant: PromptVariant,
    topics_per_chunk: int,
    keywords_per_topic: int,
    final_k_topics: int,
    output_dir: Path,
) -> pd.DataFrame:
    """Run every ablation variant over all chunks and write one JSON per variant.

    Args:
        llm: Client passed to the per-chunk extraction.
        doc_chunks: Chunks to process with each variant.
        base_variant: Variant from which the 16 ablations are derived.
        topics_per_chunk: Required topics per chunk reply.
        keywords_per_topic: Required keywords per topic.
        final_k_topics: Number of merged topics to keep per variant.
        output_dir: Directory for the run files and ``manifest.csv``; created
            if missing.

    Returns:
        A manifest DataFrame (variant id, parse failures, output file) sorted
        by parse failures and then variant id. It is also written to
        ``manifest.csv``, replacing any existing file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    variants = generate_ablation_variants_from_base(base_variant)
    rows = []

    for v in variants:
        per_chunk_topics: List[List[Dict[str, Any]]] = []
        parse_failures = 0

        for chunk in doc_chunks:
            topics = extract_topics_for_chunk_factorised(
                llm=llm,
                variant=v,
                chunk=chunk,
                topics_per_chunk=topics_per_chunk,
                keywords_per_topic=keywords_per_topic,
            )
            # An empty result means the reply was unusable for this chunk.
            if not topics:
                parse_failures += 1
            per_chunk_topics.append(topics)

        merged = merge_topics_frequency_based(
            per_chunk_topics=per_chunk_topics,
            final_k_topics=final_k_topics,
            keywords_per_topic=keywords_per_topic,
        )

        run = {
            "stage": "ablation_extraction_only",
            "variant_id": v.variant_id,
            "num_chunks": len(doc_chunks),
            "topics_per_chunk": topics_per_chunk,
            "keywords_per_topic": keywords_per_topic,
            "final_k_topics": final_k_topics,
            "parse_failures": int(parse_failures),
            "topics": merged,
            "per_chunk_topics": per_chunk_topics,
        }

        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since
        # Python 3.12. The formatted string is unchanged.
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        out_path = output_dir / f"{ts}_{v.variant_id}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(run, f, ensure_ascii=False, indent=2)

        rows.append({
            "variant_id": v.variant_id,
            "parse_failures": int(parse_failures),
            "out_file": str(out_path),
        })

    manifest = pd.DataFrame(rows).sort_values(["parse_failures", "variant_id"], ascending=[True, True])
    manifest.to_csv(output_dir / "manifest.csv", index=False)
    return manifest

# Baseline variant (rubric left as None, exemplar included) and its 16 ablations.
base_variant = make_default_factorised_variant(include_exemplars=True)
variants = generate_ablation_variants_from_base(base_variant)
# Preview the first three variants as the cell output.
variants[:3]


[PromptVariant(variant_id='abl_minus_role_rubric_exemplars_delimiter', components=PromptComponents(role=None, instruction='Identify and label the main topics in this corpus.', rubric='', exemplars=[], delimiter=Delimiter(open='', close=''))),
 PromptVariant(variant_id='abl_minus_rubric_exemplars_delimiter', components=PromptComponents(role='You are a financial analyst.', instruction='Identify and label the main topics in this corpus.', rubric='', exemplars=[], delimiter=Delimiter(open='', close=''))),
 PromptVariant(variant_id='abl_minus_role_exemplars_delimiter', components=PromptComponents(role=None, instruction='Identify and label the main topics in this corpus.', rubric=None, exemplars=[], delimiter=Delimiter(open='', close='')))]

In [23]:
def utc_stamp():
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.strftime("%Y%m%d_%H%M%S")

def done_file(output_dir: Path) -> Path:
    """Return the path of the done-variants checkpoint inside ``output_dir``."""
    return output_dir.joinpath("done_variants.json")

def load_done(output_dir: Path) -> set:
    """Read the set of completed variant ids from ``done_variants.json``.

    Returns an empty set when the checkpoint file does not exist.
    """
    checkpoint = output_dir / "done_variants.json"
    if not checkpoint.exists():
        return set()
    recorded = json.loads(checkpoint.read_text(encoding="utf-8"))
    return set(recorded)

def save_done(output_dir: Path, done_set: set):
    """Write the completed variant ids, sorted, to ``done_variants.json``."""
    payload = json.dumps(sorted(done_set), indent=2)
    (output_dir / "done_variants.json").write_text(payload, encoding="utf-8")


# Tokens-per-minute budget read from the environment; 0 (the default) disables
# throttling in TokenTracker.throttle_if_needed.
TPM_LIMIT = int(os.getenv("TPM_LIMIT", "0"))
# Fraction of TPM_LIMIT above which the tracker starts waiting.
TPM_SAFETY = float(os.getenv("TPM_SAFETY", "0.90"))
# Fixed pause in seconds before every LLM call; a value of 0 skips the pause.
MIN_SLEEP_SECONDS = float(os.getenv("MIN_SLEEP_SECONDS", "2"))

def _estimate_tokens(text: str) -> int:

    if not text:
        return 0
    return max(1, int(len(text) / 4))

def _extract_usage(resp) -> Dict[str, Any]:

    usage = getattr(resp, "usage", None)
    if isinstance(usage, dict) and usage:
        return usage

    for attr in ("response_metadata", "additional_kwargs", "metadata"):
        md = getattr(resp, attr, None)
        if isinstance(md, dict) and md:

            for k in ("token_usage", "usage"):
                if k in md and isinstance(md[k], dict):
                    return md[k]

            if any(x in md for x in ("prompt_tokens", "completion_tokens", "total_tokens")):
                return {
                    "prompt_tokens": md.get("prompt_tokens"),
                    "completion_tokens": md.get("completion_tokens"),
                    "total_tokens": md.get("total_tokens"),
                }

    return {}

class TokenTracker:
    """Running token totals plus a sliding window for tokens-per-minute.

    Each recorded call adds to cumulative prompt/completion/total counters and
    appends a ``(timestamp, total_tokens)`` event. Events older than
    ``window_seconds`` are discarded whenever the window is read or extended.
    """

    def __init__(self, tpm_limit: int = 0, window_seconds: int = 60):
        """Create a tracker; a ``tpm_limit`` of 0 or None disables throttling."""
        self.tpm_limit = int(tpm_limit or 0)
        self.window_seconds = int(window_seconds)
        self._events = deque()  # (timestamp, total_tokens)
        self._zero_counters()

    def _zero_counters(self):
        """Set every cumulative counter to zero."""
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.total_tokens = 0
        self.estimated_calls = 0
        self.total_calls = 0

    def reset(self):
        """Forget all window events and zero the cumulative counters."""
        self._events.clear()
        self._zero_counters()

    def _prune(self, now: float):
        """Drop events from the front that are older than the window."""
        oldest_allowed = now - self.window_seconds
        while self._events and self._events[0][0] < oldest_allowed:
            self._events.popleft()

    def tpm(self) -> int:
        """Return the tokens recorded within the last ``window_seconds``."""
        self._prune(time.time())
        return int(sum(tokens for _stamp, tokens in self._events))

    def record(self, *, prompt: str, completion: str, resp=None) -> Dict[str, Any]:
        """Record one call and return the counts attributed to it.

        Usage reported on ``resp`` is preferred. When it carries no
        ``total_tokens``, missing prompt/completion counts are estimated from
        the text lengths and the call is counted as estimated.

        Returns:
            Dict with ``prompt_tokens``, ``completion_tokens``,
            ``total_tokens`` and the boolean ``estimated``.
        """
        self.total_calls += 1

        usage = {} if resp is None else _extract_usage(resp)
        prompt_n = usage.get("prompt_tokens")
        completion_n = usage.get("completion_tokens")
        total_n = usage.get("total_tokens")

        estimated = total_n is None
        if estimated:
            prompt_n = int(prompt_n) if prompt_n is not None else _estimate_tokens(prompt)
            completion_n = int(completion_n) if completion_n is not None else _estimate_tokens(completion)
            total_n = int(prompt_n) + int(completion_n)
            self.estimated_calls += 1
        else:
            total_n = int(total_n)
            if prompt_n is not None:
                prompt_n = int(prompt_n)
            if completion_n is not None:
                completion_n = int(completion_n)

        # Reported usage may omit the prompt/completion split; add what exists.
        if prompt_n is not None:
            self.prompt_tokens += int(prompt_n)
        if completion_n is not None:
            self.completion_tokens += int(completion_n)
        self.total_tokens += int(total_n)

        stamp = time.time()
        self._events.append((stamp, int(total_n)))
        self._prune(stamp)

        return {
            "prompt_tokens": prompt_n,
            "completion_tokens": completion_n,
            "total_tokens": total_n,
            "estimated": estimated,
        }

    def throttle_if_needed(self):
        """Sleep in one-second steps while the window exceeds the safe budget.

        The budget is ``tpm_limit`` scaled by the module-level ``TPM_SAFETY``.
        Does nothing when ``tpm_limit`` is 0.
        """
        if self.tpm_limit:
            ceiling = int(self.tpm_limit * TPM_SAFETY)
            while self.tpm() > ceiling:
                time.sleep(1.0)

    def summary(self) -> str:
        """Return a one-line text summary of the counters and current window."""
        fields = [
            f"calls={self.total_calls}",
            f"total_tokens={self.total_tokens} "
            f"(prompt={self.prompt_tokens}, completion={self.completion_tokens})",
            f"tpm_last_60s={self.tpm()}",
            f"estimated_calls={self.estimated_calls}",
        ]
        return " | ".join(fields)

# Notebook-wide tracker shared by the extraction function and the ablation
# driver; its budget comes from the TPM_LIMIT environment setting.
TOKEN_TRACKER = TokenTracker(tpm_limit=TPM_LIMIT)

def invoke_with_backoff(llm, prompt: str, max_retries: int = 6):
    """Call ``llm.invoke(prompt)``, retrying rate-limit errors with backoff.

    An exception counts as a rate-limit error when its lower-cased message
    contains "rate limit", "429" or "rate_limit". The wait starts at 2 seconds
    and doubles after each failure, capped at 60 seconds.

    Args:
        llm: Object exposing ``invoke(prompt)``.
        prompt: Prompt text to send.
        max_retries: Maximum number of attempts.

    Returns:
        Whatever ``llm.invoke`` returns on the first successful attempt.

    Raises:
        RuntimeError: Every attempt hit a rate limit. The last provider error
            is attached as ``__cause__``.
        Exception: Any error that is not a rate-limit error is re-raised
            immediately.
    """
    delay = 2.0
    last_error = None
    for attempt in range(max_retries):
        try:
            return llm.invoke(prompt)
        except Exception as e:
            msg = str(e).lower()
            rate_limited = ("rate limit" in msg) or ("429" in msg) or ("rate_limit" in msg)
            if not rate_limited:
                raise
            last_error = e
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay the error.
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay = min(delay * 2.0, 60.0)
    raise RuntimeError("Too many rate-limit retries (429).") from last_error


In [24]:
def run_ablation_next_n(
    *,
    llm,
    doc_chunks: List[Any],
    base_variant: PromptVariant,
    topics_per_chunk: int,
    keywords_per_topic: int,
    final_k_topics: int,
    output_dir: Path,
    n: int = 2,
) -> pd.DataFrame:
    """Run the next ``n`` not-yet-completed ablation variants and save them.

    Completed variant ids are kept in ``done_variants.json`` inside
    ``output_dir``, so repeated calls work through the 16 variants in batches.
    A variant is marked done only after its JSON file has been written.

    Args:
        llm: Client passed to the per-chunk extraction.
        doc_chunks: Chunks to process with each variant.
        base_variant: Variant from which the ablations are derived.
        topics_per_chunk: Required topics per chunk reply.
        keywords_per_topic: Required keywords per topic.
        final_k_topics: Number of merged topics to keep per variant.
        output_dir: Directory for run files, checkpoint and ``manifest.csv``.
        n: Maximum number of variants to run in this call.

    Returns:
        One row per variant run in this call, sorted by parse failures and
        variant id, or an empty DataFrame when every variant is already done.
        The same rows are appended to ``manifest.csv``.

    Side effects:
        Resets the module-level ``TOKEN_TRACKER`` at the start of the call,
        which also clears its tokens-per-minute window.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    TOKEN_TRACKER.reset()

    variants = generate_ablation_variants_from_base(base_variant)
    done = load_done(output_dir)

    todo = [v for v in variants if v.variant_id not in done][:n]
    if not todo:
        print("All ablation variants already completed.")
        return pd.DataFrame()

    rows = []

    for v in todo:
        # Snapshot the cumulative counters so usage can be reported per variant.
        start_tokens = TOKEN_TRACKER.total_tokens
        start_prompt = TOKEN_TRACKER.prompt_tokens
        start_completion = TOKEN_TRACKER.completion_tokens
        start_estimated = TOKEN_TRACKER.estimated_calls
        per_chunk_topics: List[List[Dict[str, Any]]] = []
        parse_failures = 0

        for chunk in doc_chunks:
            topics = extract_topics_for_chunk_factorised(
                llm=llm,
                variant=v,
                chunk=chunk,
                topics_per_chunk=topics_per_chunk,
                keywords_per_topic=keywords_per_topic,
            )
            # An empty result means the reply was unusable for this chunk.
            if not topics:
                parse_failures += 1
            per_chunk_topics.append(topics)

        merged = merge_topics_frequency_based(
            per_chunk_topics=per_chunk_topics,
            final_k_topics=final_k_topics,
            keywords_per_topic=keywords_per_topic,
        )

        # Per-variant usage: every count is a delta against the snapshot above.
        # The run dict used to list "token_usage" twice, so only this dict was
        # ever written; its "estimated_calls" was the batch-wide total.
        token_usage = {
            "prompt_tokens": TOKEN_TRACKER.prompt_tokens - start_prompt,
            "completion_tokens": TOKEN_TRACKER.completion_tokens - start_completion,
            "total_tokens": TOKEN_TRACKER.total_tokens - start_tokens,
            "tpm_last_60s_end": TOKEN_TRACKER.tpm(),
            "estimated_calls": TOKEN_TRACKER.estimated_calls - start_estimated,
        }

        # Which prompt parts were active for this variant.
        components = {
            "role": bool(v.components.role),
            "rubric": bool(v.components.rubric),
            "exemplars": int(len(v.components.exemplars)),
            "delimiter": bool(getattr(v.components.delimiter, "open", "") or getattr(v.components.delimiter, "close", "")),
        }

        run = {
            "stage": "ablation_extraction_only",
            "variant_id": v.variant_id,
            "num_chunks": len(doc_chunks),
            "topics_per_chunk": topics_per_chunk,
            "keywords_per_topic": keywords_per_topic,
            "final_k_topics": final_k_topics,
            "token_usage": token_usage,
            "parse_failures": int(parse_failures),
            "num_final_topics": int(len(merged)),
            "components": components,
            "topics": merged,
            "per_chunk_topics": per_chunk_topics,
        }

        ts = utc_stamp()
        out_path = output_dir / f"{ts}_{v.variant_id}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(run, f, ensure_ascii=False, indent=2)

        # Checkpoint after the file exists so an interrupted run is retried.
        done.add(v.variant_id)
        save_done(output_dir, done)

        print(f"Token usage ({v.variant_id}): {TOKEN_TRACKER.summary()}")

        rows.append({
            "variant_id": v.variant_id,
            "parse_failures": int(parse_failures),
            "num_final_topics": int(len(merged)),
            "components": dict(components),
            "out_file": str(out_path),
            "status": "saved",
            # Per-variant total, matching "token_usage" in the run file.
            "total_tokens": int(token_usage["total_tokens"]),
            "tpm_last_60s": int(TOKEN_TRACKER.tpm()),
        })

    df = pd.DataFrame(rows)

    # Append this batch to any manifest left by earlier calls.
    manifest_path = output_dir / "manifest.csv"
    if manifest_path.exists():
        old = pd.read_csv(manifest_path)
        all_rows = pd.concat([old, df], ignore_index=True)
    else:
        all_rows = df

    all_rows.to_csv(manifest_path, index=False)

    return df.sort_values(["parse_failures", "variant_id"], ascending=[True, True])

In [33]:
OUTPUT_DIR = Path("outputs/ablation_runs")

# Run the next unfinished ablation variant (n=1 per execution) and show the
# manifest rows produced by this call. Re-run the cell to advance through the
# remaining variants.
manifest_today = run_ablation_next_n(
    llm=llm,
    doc_chunks=doc_chunk,
    base_variant=base_variant,
    topics_per_chunk=TOPICS_PER_CHUNK,
    keywords_per_topic=KEYWORDS_PER_TOPIC,
    final_k_topics=FINAL_K_TOPICS,
    output_dir=OUTPUT_DIR,
    n=1,
)
manifest_today

RuntimeError: Too many rate-limit retries (429).

In [32]:
# Same directory the ablation driver writes to; restated so this inspection
# cell does not depend on the driver cell having been run.
OUTPUT_DIR = Path("outputs/ablation_runs")

def is_run_file(p: Path) -> bool:
    """Tell whether ``p`` is a saved run file.

    A run file has a ``.json`` suffix (case-insensitive), is neither the
    ``done_variants.json`` checkpoint nor a ``*_PARTIAL.json`` file, and
    contains a JSON object with both ``"variant_id"`` and ``"topics"`` keys.
    Files that cannot be read or parsed are not run files.
    """
    name_ok = (
        p.suffix.lower() == ".json"
        and p.name not in {"done_variants.json"}
        and not p.name.endswith("_PARTIAL.json")
    )
    if not name_ok:
        return False

    try:
        payload = json.loads(p.read_text(encoding="utf-8"))
    except Exception:
        return False

    required = ("variant_id", "topics")
    return isinstance(payload, dict) and all(key in payload for key in required)

# Collect saved run files and order them newest first by modification time.
run_files = [p for p in OUTPUT_DIR.iterdir() if is_run_file(p)]
run_files = sorted(run_files, key=lambda p: p.stat().st_mtime, reverse=True)

if not run_files:
    print("No run files found in", OUTPUT_DIR)
else:
    # Summarise the most recently modified run.
    p = run_files[0]
    run = json.loads(p.read_text(encoding="utf-8"))
    print("File:", p.name)
    print("Variant:", run.get("variant_id"))
    print("Parse failures:", run.get("parse_failures"))
    print("Final topics:", len(run.get("topics", [])))
    # `or [None]` keeps the index safe when the topics list is empty.
    print("Example topic:", (run.get("topics") or [None])[0])
    print("Token usage:", run.get("token_usage"))

File: 20260110_123744_abl_minus_role.json
Variant: abl_minus_role
Parse failures: 0
Final topics: 10
Example topic: {'label': 'Competitive Landscape', 'keywords': ['market share', 'competition', 'market rivals', 'rival offerings', 'competitive dynamics', 'competitive intensity', 'competitors', 'integrated solutions'], 'support_chunks': 5}
Token usage: {'prompt_tokens': 85363, 'completion_tokens': 105715, 'total_tokens': 191078, 'tpm_last_60s_end': 14898, 'estimated_calls': 0}


dict_keys(['stage', 'variant_id', 'num_chunks', 'topics_per_chunk', 'keywords_per_topic', 'final_k_topics', 'token_usage', 'parse_failures', 'num_final_topics', 'components', 'topics', 'per_chunk_topics'])