In [1]:
import re
import json

import Load_Data
import LLM_setting
import Chunking

from dotenv import load_dotenv, find_dotenv
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict, Any , Sequence
from pathlib import Path
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [2]:
_ = load_dotenv(find_dotenv())

In [4]:
#LLM Setting
model = "openai/gpt-oss-120b"
temperature = 0
llm_setting= LLM_setting.LLMSetting(MODEL=model, TEMPERATURE=temperature)
llm = LLM_setting.setting(llm_setting)

In [5]:
# Read Data
file_path = Path("data/raw_filings/1326801_000132680125000017_meta-20241231.htm")
cfg = Load_Data.LoadData(file_path=file_path)
docs = Load_Data.load_data(cfg)

In [6]:
# Chunking
chunk_size= 2500
chunk_overlap = 250
data_processing=Chunking.DataProcessing(file_path,chunk_size,chunk_overlap)
doc_chunk =Chunking.chunks(data_processing)

In [7]:
TOPICS_PER_CHUNK = 8
KEYWORDS_PER_TOPIC = 8
FINAL_K_TOPICS = 10

In [8]:
input_text=(
                "We experienced strong net sales growth driven by iPhone and Services. "
                "Supply constraints eased. Foreign exchange impacted revenue. "
                "We repurchased shares and increased the dividend."
            )
output_json={
                "topics": [
                    {"label": "Product Revenue Drivers", "keywords": ["iPhone", "Services", "net sales", "growth", "revenue", "demand", "mix", "products"]},
                    {"label": "Supply Chain Conditions", "keywords": ["supply", "constraints", "inventory", "availability", "production", "lead times", "logistics", "capacity"]},
                    {"label": "Foreign Exchange Effects", "keywords": ["foreign exchange", "currency", "FX", "headwind", "translation", "rates", "impact", "revenue"]},
                    {"label": "Capital Return Program", "keywords": ["share repurchase", "buyback", "dividend", "capital return", "shareholders", "authorization", "cash", "stock"]},
                    {"label": "Operating Performance", "keywords": ["margin", "profitability", "operating income", "expenses", "costs", "efficiency", "performance", "results"]},
                    {"label": "Market Conditions", "keywords": ["macroeconomic", "consumer", "competition", "pricing", "market", "uncertainty", "trends", "demand"]},
                    {"label": "Guidance and Outlook", "keywords": ["outlook", "expectations", "guidance", "future", "forecast", "assumptions", "trend", "risks"]},
                    {"label": "Shareholder Value", "keywords": ["value", "returns", "EPS", "share count", "capital allocation", "liquidity", "cash flow", "investment"]},
                ]
            }

In [9]:
_JSON_OBJ_RE = re.compile(r"\{.*\}", flags=re.DOTALL)

In [10]:
@dataclass(frozen=True)
class Exemplar:
    input_text: str
    output_json: Dict[str, Any]

In [11]:
@dataclass(frozen=True)
class Delimiter:
    open: str = "<TEXT>"
    close: str = "</TEXT>"

In [12]:
@dataclass(frozen=True)
class PromptComponents:
    role: Optional[str]
    instruction: str
    rubric: Optional[str]
    exemplars: Sequence[Exemplar]
    delimiter: Delimiter

In [13]:
@dataclass(frozen=True)
class PromptVariant:
    variant_id: str
    components: PromptComponents
def default_instruction() -> str:
    return "Identify and label the main topics in this corpus."

def default_role_financial_analyst() -> str:
    return "You are a financial analyst."

def default_rubric(topics_per_chunk: int, keywords_per_topic: int) -> str:

    schema_line = 'Schema: {"topics":[{"label":str,"keywords":[str,...]}]}'
    return (
        "Rubric:\n"
        "- Topics should be coherent, specific, and meaningful for financial reporting.\n"
        "- Topic labels should be short noun phrases (avoid generic labels like 'Other').\n"
        "- Topics should be mutually distinct (minimise keyword overlap).\n"
        "- Prefer substantive business/financial themes over boilerplate artifacts.\n"
        "- Avoid pure dates, identifiers, XBRL-like tags, or URL-like tokens as keywords.\n\n"
        "Output constraints:\n"
        "Use ONLY the text inside the delimiter.\n"
        f"Return exactly {topics_per_chunk} topics.\n"
        f"Each topic must have exactly {keywords_per_topic} keywords.\n"
        "Output MUST be valid JSON only (no Markdown, no extra text).\n"
        f"{schema_line}\n"
    )

def default_exemplars(input_text:Any,output_json:Dict) -> List[Exemplar]:
    return [
        Exemplar(
            input_text,
            output_json,
        )
    ]
def build_factorised_prompt(
    variant: PromptVariant,
    chunk_text: str,
    topics_per_chunk: int,
    keywords_per_topic: int,
) -> str:
    c = variant.components

    parts: List[str] = []

    # 1) role
    if c.role:
        parts.append(c.role.strip())

    # 2) instruction
    parts.append(c.instruction.strip())

    # 3) rubric
    if c.rubric:
        parts.append(c.rubric.strip())

    # 4) exemplars
    if c.exemplars:
        ex_lines: List[str] = ["Exemplars:"]
        for i, ex in enumerate(c.exemplars, start=1):
            ex_out = json.dumps(ex.output_json, ensure_ascii=False)
            ex_lines.append(f"[Example {i}]")
            ex_lines.append("Input:")
            ex_lines.append(ex.input_text.strip())
            ex_lines.append("Output (JSON):")
            ex_lines.append(ex_out)
            ex_lines.append("")  # spacer
        parts.append("\n".join(ex_lines).strip())

    # 5) delimiter
    d = c.delimiter
    parts.append(f"{d.open}\n{chunk_text}\n{d.close}")

    return "\n\n".join([p for p in parts if p and p.strip()])

In [14]:
def _as_text(obj: Any) -> str:
    if hasattr(obj, "page_content"):
        return obj.page_content
    if hasattr(obj, "chunk"):
        ch = getattr(obj, "chunk", None)
        if hasattr(ch, "page_content"):
            return ch.page_content
        return str(ch)
    return str(obj)

In [15]:
def _parse_json_loose(s: str) -> Dict[str, Any]:
    s = (s or "").strip()
    if not s:
        return {}
    try:
        return json.loads(s)
    except Exception:
        pass
    m = _JSON_OBJ_RE.search(s)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return {}
    return {}

In [16]:
def extract_topics_for_chunk_factorised(
    llm,
    variant: PromptVariant,
    chunk: Any,
    topics_per_chunk: int,
    keywords_per_topic: int,
) -> List[Dict[str, Any]]:
    text = _as_text(chunk)

    c = variant.components
    rubric = c.rubric
    if rubric is None:
        rubric = default_rubric(topics_per_chunk, keywords_per_topic)
        variant = PromptVariant(
            variant_id=variant.variant_id,
            components=PromptComponents(
                role=c.role,
                instruction=c.instruction,
                rubric=rubric,
                exemplars=c.exemplars,
                delimiter=c.delimiter,
            ),
        )

    prompt = build_factorised_prompt(
        variant=variant,
        chunk_text=text,
        topics_per_chunk=topics_per_chunk,
        keywords_per_topic=keywords_per_topic,
    )

    resp = llm.invoke(prompt)
    content = getattr(resp, "content", str(resp))
    data = _parse_json_loose(content)

    topics = data.get("topics", [])
    if not isinstance(topics, list):
        return []

    cleaned: List[Dict[str, Any]] = []
    for t in topics:
        if not isinstance(t, dict):
            continue
        label = str(t.get("label", "")).strip()
        kws = t.get("keywords", [])
        if not label or not isinstance(kws, list):
            continue
        keywords = [str(k).strip() for k in kws if str(k).strip()]

        if len(keywords) != keywords_per_topic:
            continue
        cleaned.append({"label": label, "keywords": keywords})

    if len(cleaned) != topics_per_chunk:
        return []

    return cleaned


In [17]:
def _norm_label(label: str) -> str:
    label = label.strip().lower()
    label = re.sub(r"\s+", " ", label)
    label = re.sub(r"[^\w\s\-&/]", "", label)
    return label.strip()

In [18]:
def merge_topics_frequency_based(
    per_chunk_topics: List[List[Dict[str, Any]]],
    final_k_topics: int,
    keywords_per_topic: int,
) -> List[Dict[str, Any]]:
    from collections import Counter, defaultdict

    label_support = Counter()
    label_canonical: Dict[str, str] = {}
    kw_counts: Dict[str, Counter] = defaultdict(Counter)

    for chunk_topics in per_chunk_topics:
        for t in chunk_topics:
            label = str(t.get("label", "")).strip()
            if not label:
                continue
            norm = _norm_label(label)
            if not norm:
                continue

            label_support[norm] += 1
            label_canonical.setdefault(norm, label)

            for kw in t.get("keywords", []) or []:
                kw_s = str(kw).strip()
                if kw_s:
                    kw_counts[norm][kw_s] += 1

    ranked_labels = sorted(label_support.items(), key=lambda x: (-x[1], x[0]))
    selected = ranked_labels[: max(1, final_k_topics)]

    merged: List[Dict[str, Any]] = []
    for norm, support in selected:
        top_keywords = [k for k, _ in kw_counts[norm].most_common(keywords_per_topic)]
        merged.append(
            {
                "label": label_canonical.get(norm, norm),
                "keywords": top_keywords,
                "support_chunks": int(support),
            }
        )
    return merged

In [19]:
def run_prompt_space_factorisation(
    *,
    llm,
    doc_chunks: List[Any],
    variant: PromptVariant,
    topics_per_chunk: int,
    keywords_per_topic: int,
    final_k_topics: int,
) -> Dict[str, Any]:
    per_chunk_topics: List[List[Dict[str, Any]]] = []

    for chunk in doc_chunks:
        topics = extract_topics_for_chunk_factorised(
            llm=llm,
            variant=variant,
            chunk=chunk,
            topics_per_chunk=topics_per_chunk,
            keywords_per_topic=keywords_per_topic,
        )
        per_chunk_topics.append(topics)

    merged_topics = merge_topics_frequency_based(
        per_chunk_topics=per_chunk_topics,
        final_k_topics=final_k_topics,
        keywords_per_topic=keywords_per_topic,
    )

    return {
        "stage": "prompt_space_factorisation",
        "variant_id": variant.variant_id,
        "num_chunks": len(doc_chunks),
        "topics_per_chunk": topics_per_chunk,
        "keywords_per_topic": keywords_per_topic,
        "final_k_topics": final_k_topics,
        "topics": merged_topics,
    }

In [20]:
def make_default_factorised_variant(
    *,
    include_exemplars: bool = True,
    delimiter_open: str = "<TEXT>",
    delimiter_close: str = "</TEXT>",
) -> PromptVariant:
    comps = PromptComponents(
        role=default_role_financial_analyst(),
        instruction=default_instruction(),
        rubric=None,  # rendered at call-time with topics_per_chunk/keywords_per_topic
        exemplars=(default_exemplars(input_text,output_json) if include_exemplars else []),
        delimiter=Delimiter(open=delimiter_open, close=delimiter_close),
    )
    return PromptVariant(variant_id="factorised_default", components=comps)


In [20]:
variant = make_default_factorised_variant(include_exemplars=True)
result = run_prompt_space_factorisation(
     llm=llm,
     doc_chunks=doc_chunk,
     variant=variant,
     topics_per_chunk=TOPICS_PER_CHUNK,
     keywords_per_topic=KEYWORDS_PER_TOPIC,
     final_k_topics=FINAL_K_TOPICS,
 )
print(json.dumps(result, indent=2, ensure_ascii=False))

BadRequestError: Error code: 400 - {'error': {'message': 'Please reduce the length of the messages or completion.', 'type': 'invalid_request_error', 'param': 'messages'}}

In [33]:
# save JSON
out_path = Path("outputs") / f"META_prompt_space_factorisation_{FINAL_K_TOPICS}.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"Saved: {out_path}")

Saved: outputs/META_prompt_space_factorisation_10.json


## Reproducibility - News

In [21]:
import feedparser
import pandas as pd
from tqdm import tqdm

from __future__ import annotations

import json
import re
import sys
import platform
import hashlib
import importlib
from datetime import datetime, timezone
from importlib import metadata
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

In [22]:
FEEDS = {
    "apple_newsroom": "https://www.apple.com/ca/newsroom/rss-feed.rss",
    "macrumors": "https://feeds.macrumors.com/MacRumors-All",
    "reddit_apple": "https://www.reddit.com/r/apple/.rss",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (topic-modeling; +contact-email@example.com)"
}

In [23]:
chunk_size = 1000
chunk_overlap = 100

In [24]:
rows = []
for source, feed_url in FEEDS.items():
    d = feedparser.parse(feed_url)
    for e in d.entries:
        rows.append({
            "source": source,
            "title": getattr(e, "title", None),
            "link": getattr(e, "link", None),
            "published": getattr(e, "published", None),
            "summary": getattr(e, "summary", None),
        })

df = pd.DataFrame(rows).drop_duplicates(subset=["link"]).reset_index(drop=True)

In [25]:
news_df = df.drop_duplicates(subset=["link"]).reset_index(drop=True)
news_df["article_id"] = range(len(news_df))

In [26]:
def build_article_document(row) -> str:
    title = (row.get("title") or "").strip()
    summary = (row.get("summary") or "").strip()
    return f"{title}\n\n{summary}".strip()

In [27]:
news_df["document_text"] = news_df.apply(build_article_document, axis=1)
news_df[["article_id", "source", "title"]].head()

Unnamed: 0,article_id,source,title
0,0,apple_newsroom,The new Apple Sainte-Catherine opens today in ...
1,1,apple_newsroom,Popular PC franchise Civilization comes to App...
2,2,apple_newsroom,"Introducing Apple Creator Studio, an inspiring..."
3,3,apple_newsroom,2025 marked a record-breaking year for Apple s...
4,4,apple_newsroom,Stay active in the new year with Apple Watch


In [28]:
# Output folder
RUN_ID = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
OUT_DIR = Path("outputs") / "news" / f"run_{RUN_ID}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [29]:
def _pkg_version(name: str) -> str:
    try:
        return metadata.version(name)
    except Exception:
        return "unknown"

def write_run_metadata(*, out_dir: Path, run_cfg: Dict[str, Any]) -> Path:
    meta = {
        "run_id": RUN_ID,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "python": sys.version,
        "platform": platform.platform(),
        "packages": {
            "pandas": _pkg_version("pandas"),
            "numpy": _pkg_version("numpy"),
            "langchain": _pkg_version("langchain"),
            "langchain-core": _pkg_version("langchain-core"),
            "langchain-groq": _pkg_version("langchain-groq"),
            "groq": _pkg_version("groq"),
        },
        "config": run_cfg,
    }
    p = out_dir / "run_metadata.json"
    p.write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")
    return p

In [30]:
def sha256_text(s: str) -> str:
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def _as_text(x: Any) -> str:
    if x is None:
        return ""
    if hasattr(x, "page_content"):
        return str(getattr(x, "page_content") or "")
    return str(x)

In [31]:
# JSON parsing
_JSON_OBJ_RE = re.compile(r"\{.*?\}", flags=re.DOTALL)

def _strip_code_fences(s: str) -> str:
    s = s.strip()
    s = re.sub(r"^\s*```(?:json)?\s*", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s*```\s*$", "", s)
    return s.strip()
def _reject_constants(x: str) -> Any:
    raise ValueError(f"Invalid JSON constant: {x}")


In [32]:
def parse_json_loose_to_dict(raw: Any) -> Dict[str, Any]:

    if raw is None:
        return {}

    s = str(raw).strip()
    if not s:
        return {}

    if s.lower() in {"nan", "none", "null"}:
        return {}

    s = _strip_code_fences(s)

    try:
        obj = json.loads(s, parse_constant=_reject_constants)
        return obj if isinstance(obj, dict) else {}
    except Exception:
        pass

    m = _JSON_OBJ_RE.search(s)
    if m:
        try:
            obj = json.loads(m.group(0), parse_constant=_reject_constants)
            return obj if isinstance(obj, dict) else {}
        except Exception:
            return {}

    return {}

In [33]:
def validate_topics_payload(data: Dict[str, Any], *, keywords_per_topic: int) -> List[Dict[str, Any]]:

    topics = data.get("topics", [])
    if not isinstance(topics, list):
        return []

    cleaned: List[Dict[str, Any]] = []
    for t in topics:
        if not isinstance(t, dict):
            continue
        label = str(t.get("label", "")).strip()
        kws = t.get("keywords", [])
        if not label or not isinstance(kws, list):
            continue

        keywords = [str(k).strip() for k in kws if str(k).strip()]
        if len(keywords) != keywords_per_topic:
            continue

        cleaned.append({"label": label, "keywords": keywords})

    return cleaned

In [34]:
def log_jsonl(path: Path, record: Dict[str, Any]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

PER_CHUNK_LOG = OUT_DIR / "per_chunk_outputs.jsonl"
PARSE_FAIL_LOG = OUT_DIR / "parse_failures.jsonl"
ARTIFACTS_DIR = OUT_DIR / "artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

In [35]:
def save_article_chunks(article_id: int, chunks: List[Any]) -> Path:
    chunk_texts = [_as_text(c) for c in chunks]
    rec = {
        "article_id": int(article_id),
        "num_chunks": len(chunk_texts),
        "chunks_sha256": [sha256_text(t) for t in chunk_texts],
        "chunks": chunk_texts,
    }
    p = ARTIFACTS_DIR / f"article_{int(article_id)}_chunks.json"
    p.write_text(json.dumps(rec, ensure_ascii=False, indent=2), encoding="utf-8")
    return p

In [36]:
def extract_topics_for_chunk_factorised_logged(
    *,
    llm,
    variant,
    chunk: Any,
    topics_per_chunk: int,
    keywords_per_topic: int,
    article_id: Optional[int] = None,
    chunk_idx: Optional[int] = None,
) -> List[Dict[str, Any]]:

    text = _as_text(chunk)

    c = variant.components
    rubric = c.rubric
    if rubric is None:
        rubric = default_rubric(topics_per_chunk, keywords_per_topic)
        variant = PromptVariant(
            variant_id=variant.variant_id,
            components=PromptComponents(
                role=c.role,
                instruction=c.instruction,
                rubric=rubric,
                exemplars=c.exemplars,
                delimiter=c.delimiter,
            ),
        )

    prompt = build_factorised_prompt(
        variant=variant,
        chunk_text=text,
        topics_per_chunk=topics_per_chunk,
        keywords_per_topic=keywords_per_topic,
    )

    resp = llm.invoke(prompt)
    raw = getattr(resp, "content", str(resp))

    data = parse_json_loose_to_dict(raw)
    topics = validate_topics_payload(data, keywords_per_topic=keywords_per_topic)
    parsed_ok = bool(topics)

    log_jsonl(
        PER_CHUNK_LOG,
        {
            "article_id": int(article_id) if article_id is not None else None,
            "variant_id": getattr(variant, "variant_id", None),
            "chunk_idx": int(chunk_idx) if chunk_idx is not None else None,
            "chunk_sha256": sha256_text(text),
            "raw_preview": str(raw)[:2000],
            "parsed_ok": parsed_ok,
            "num_topics": len(topics),
        },
    )

    if not parsed_ok:
        log_jsonl(
            PARSE_FAIL_LOG,
            {
                "article_id": int(article_id) if article_id is not None else None,
                "variant_id": getattr(variant, "variant_id", None),
                "chunk_idx": int(chunk_idx) if chunk_idx is not None else None,
                "chunk_sha256": sha256_text(text),
                "raw": str(raw)[:8000],  # bounded, but keeps enough for debugging
            },
        )

    return topics


In [37]:
def run_prompt_space_factorisation_logged(
    *,
    llm,
    doc_chunks: List[Any],
    variant,
    topics_per_chunk: int,
    keywords_per_topic: int,
    final_k_topics: int,
    article_id: Optional[int] = None,
) -> Dict[str, Any]:

    per_chunk_topics: List[List[Dict[str, Any]]] = []
    for i, chunk in enumerate(doc_chunks):
        topics = extract_topics_for_chunk_factorised_logged(
            llm=llm,
            variant=variant,
            chunk=chunk,
            topics_per_chunk=topics_per_chunk,
            keywords_per_topic=keywords_per_topic,
            article_id=article_id,
            chunk_idx=i,
        )
        per_chunk_topics.append(topics)

    merged_topics = merge_topics_frequency_based(
        per_chunk_topics=per_chunk_topics,
        final_k_topics=final_k_topics,
        keywords_per_topic=keywords_per_topic,
    )

    return {
        "stage": "prompt_space_factorisation",
        "variant_id": variant.variant_id,
        "num_chunks": len(doc_chunks),
        "topics_per_chunk": topics_per_chunk,
        "keywords_per_topic": keywords_per_topic,
        "final_k_topics": final_k_topics,
        "topics": merged_topics,
    }


In [38]:
variant = make_default_factorised_variant(include_exemplars=True)

In [39]:
run_cfg = {
    "variant_id": getattr(variant, "variant_id", None),
    "topics_per_chunk": TOPICS_PER_CHUNK,
    "keywords_per_topic": KEYWORDS_PER_TOPIC,
    "final_k_topics": FINAL_K_TOPICS,
     "chunk_size": chunk_size,
     "chunk_overlap": chunk_overlap,
     "model": model,
}
write_run_metadata(out_dir=OUT_DIR, run_cfg=run_cfg)

print(f"Reproducibility logging enabled. Outputs will be written to: {OUT_DIR}")

Reproducibility logging enabled. Outputs will be written to: outputs/news/run_20260122T123459Z


In [40]:
def chunk_one_article(row) -> list[Document]:
    doc = Document(
        page_content=row["document_text"],
        metadata={
            "article_id": int(row["article_id"]),
            "source": row.get("source"),
            "title": row.get("title"),
            "link": row.get("link"),
            "published": row.get("published"),
        },
    )
    return text_splitter.split_documents([doc])

In [42]:
print("model:", llm._default_params.get("model"))
print("max_tokens:", llm._default_params.get("max_tokens"))

model: meta-llama/llama-4-maverick-17b-128e-instruct
max_tokens: None


In [43]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

In [44]:
results = []

for _, row in tqdm(news_df.iterrows(), total=len(news_df)):
    article_id = int(row["article_id"])

    doc_chunks = chunk_one_article(row)
    if not doc_chunks:
        continue

    save_article_chunks(article_id, doc_chunks)

    out = run_prompt_space_factorisation_logged(
        llm=llm,
        doc_chunks=doc_chunks,
        variant=variant,
        topics_per_chunk=TOPICS_PER_CHUNK,
        keywords_per_topic=KEYWORDS_PER_TOPIC,
        final_k_topics=FINAL_K_TOPICS,
        article_id=article_id,
    )

    results.append({
        "article_id": article_id,
        "source": row.get("source"),
        "title": row.get("title"),
        "link": row.get("link"),
        "published": row.get("published"),
        "topics_json": out,
    })

len(results), results[0]["topics_json"].keys() if results else (0, [])

100%|██████████| 65/65 [22:24<00:00, 20.68s/it]


(65,
 dict_keys(['stage', 'variant_id', 'num_chunks', 'topics_per_chunk', 'keywords_per_topic', 'final_k_topics', 'topics']))

In [45]:
for t in results[0]["topics_json"]["topics"]:
    print(t)

{'label': 'Brand Presence', 'keywords': ['Apple', 'brand', 'identity', 'image', 'awareness', 'recognition', 'loyalty', 'reputation'], 'support_chunks': 1}
{'label': 'Business Development', 'keywords': ['development', 'growth', 'expansion', 'opportunities', 'investment', 'initiative', 'strategy', 'prospects'], 'support_chunks': 1}
{'label': 'Customer Engagement', 'keywords': ['customers', 'engagement', 'interaction', 'service', 'support', 'community', 'outreach', 'events'], 'support_chunks': 1}
{'label': 'Geographic Expansion', 'keywords': ['Montreal', 'Canada', 'region', 'market', 'expansion', 'presence', 'location', 'site'], 'support_chunks': 1}
{'label': 'Marketing Strategy', 'keywords': ['marketing', 'strategy', 'promotion', 'advertising', 'publicity', 'launch', 'campaign', 'buzz'], 'support_chunks': 1}
{'label': 'Retail Experience', 'keywords': ['reimagined', 'space', 'customers', 'experience', 'shopping', 'environment', 'design', 'layout'], 'support_chunks': 1}
{'label': 'Store Op

In [47]:
final_dir = OUT_DIR / "final"
final_dir.mkdir(parents=True, exist_ok=True)

(final_dir / f"article_{article_id}_final.json").write_text(
    json.dumps(out, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

2284