In [1]:
import os
import re
import sys
from pathlib import Path
from collections import defaultdict

In [2]:

import docx
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Paths
PATH_ACTION_POINTS = Path("Action Points GSS 2025.docx")
PATH_T20 = Path("T20 communique.docx")

# SBERT model 
SBERT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# How many top T20 matches to show for each action point
TOP_K = 3

# Output HTML file
OUT_HTML = Path("gss_t20_similarity.html")

In [4]:
def read_docx_paragraphs(path: Path):
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path.resolve()}")
    doc = docx.Document(str(path))
    paras = [p.text.strip() for p in doc.paragraphs]
    return [p for p in paras if p.strip()]

def looks_like_cluster_header(line: str) -> bool:
    if not line: return False
    if line.startswith("["): return False
    if line.lstrip().startswith("- "): return False
    if re.match(r"^\d+(\.|:)?\s", line): return False
    return True

def parse_gss_clusters(paragraphs):
    clusters = defaultdict(list)
    current_cluster = None
    for line in paragraphs:
        txt = line.strip()
        if not txt: 
            continue
        if looks_like_cluster_header(txt) and not txt.startswith("[]"):
            current_cluster = txt
            _ = clusters[current_cluster]  # ensure key
            continue
        if txt.startswith("[]") or ("[" in txt and "]" in txt):
            ap = re.sub(r"^\s*\[\s*\]\s*", "", txt).strip()
            if not current_cluster:
                current_cluster = "Ungrouped"
            if ap:
                clusters[current_cluster].append(ap)
    # drop empty clusters
    return {k: v for k, v in clusters.items() if v}

def normalize_whitespace(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def split_t20_into_sentences(paragraphs):
    sentences = []
    current_section = None
    for raw in paragraphs:
        line = raw.strip()
        if not line:
            continue
        # treat title-ish lines as sections (no period at end)
        if looks_like_cluster_header(line) and not line.endswith("."):
            current_section = line
            continue
        for s in sent_tokenize(line):
            s = normalize_whitespace(s)
            if len(s) >= 3:
                sentences.append({"sentence": s, "section": current_section})
    # dedupe
    seen, out = set(), []
    for row in sentences:
        key = (row["sentence"], row["section"])
        if key not in seen:
            seen.add(key)
            out.append(row)
    return out

def embed_texts(model, texts):
    return model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

def build_html(results_by_cluster):
    css = """
    <style>
      body { font-family: system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif; margin: 24px; color: #111; }
      h1 { font-size: 28px; margin-bottom: 8px; }
      h2 { font-size: 22px; margin-top: 28px; }
      .note { color: #555; margin-bottom: 24px; }
      table { border-collapse: collapse; width: 100%; margin: 12px 0 28px 0; }
      th, td { border: 1px solid #e5e7eb; padding: 10px 12px; vertical-align: top; }
      th { background: #f8fafc; text-align: left; font-weight: 600; }
      tr:nth-child(even) { background: #fafafa; }
      .ap { font-weight: 600; }
      .section { color: #444; font-size: 12px; margin-top: 4px; }
      .score { font-variant-numeric: tabular-nums; }
      .cluster-badge { display:inline-block; background:#eef2ff; color:#3730a3; padding:4px 8px; border-radius:12px; font-size:12px; margin-left:6px; }
      .footer { margin-top: 40px; color: #666; font-size: 12px; }
    </style>
    """
    html = [f"<!DOCTYPE html><html><head><meta charset='utf-8'><title>GSS–T20 Similarity</title>{css}</head><body>"]
    html.append("<h1>GSS 2025 Action Points ↔ T20 Recommendations</h1>")
    html.append("<div class='note'>Scores are cosine similarities (Sentence-BERT). Range: 0–1.</div>")
    for cluster, df in results_by_cluster.items():
        html.append(f"<h2>{cluster} <span class='cluster-badge'>Top matches</span></h2>")
        html.append("<table>")
        html.append("<tr><th style='width:32%'>Action Point</th><th>Match (T20 Sentence)</th><th style='width:8%'>Rank</th><th style='width:10%'>Score</th></tr>")
        last_ap_idx = None
        for _, r in df.sort_values(["ap_index", "rank"]).iterrows():
            ap_cell = ""
            if r["ap_index"] != last_ap_idx:
                ap_cell = f"<div class='ap'>{r['action_point']}</div>"
                last_ap_idx = r["ap_index"]
            section_badge = f"<div class='section'>Section: {r['t20_section']}</div>" if pd.notna(r['t20_section']) and r['t20_section'] else ""
            html.append(
                "<tr>"
                f"<td>{ap_cell}</td>"
                f"<td>{r['t20_sentence']}{section_badge}</td>"
                f"<td class='score'>{int(r['rank'])}</td>"
                f"<td class='score'>{r['score']:.3f}</td>"
                "</tr>"
            )
        html.append("</table>")
    html.append("<div class='footer'>Model: SentenceTransformer ‘all-MiniLM-L6-v2’. Adjust TOP_K and model in the config cell.</div>")
    html.append("</body></html>")
    return "\n".join(html)

In [5]:
# Open the two Word docs 
gss_paragraphs = read_docx_paragraphs(PATH_ACTION_POINTS)
t20_paragraphs = read_docx_paragraphs(PATH_T20)

print(f"GSS paragraphs: {len(gss_paragraphs)}")
print("GSS sample:", gss_paragraphs[:8])  # preview a few lines
print("-" * 60)
print(f"T20 paragraphs: {len(t20_paragraphs)}")
print("T20 sample:", t20_paragraphs[:8])

GSS paragraphs: 32
GSS sample: ['Human Flourishing', '[] Champion a global narrative shift from economic growth to human flourishing, grounded in equity, sustainability, and dignity.', '[] Reform financial systems to support inclusive, long-term development.', '[] Empower communities to co-create solutions, especially in the Global South, through grassroots innovation and development corridors', '[] Leverage the influence of G20: Use the G20 as a platform for piloting and scaling flourishing-centered models.', 'Climate Action and Sustainability', '[]  Reform global finance systems\xa0to reduce the cost of capital in the Global South, mobilize private investment, and support inclusive, long-term climate and development goals.', '[] Scale community-led and grassroots solutions\xa0by integrating them into national strategies, enabling direct access to funding, and supporting blended finance and SPV models.']
------------------------------------------------------------
T20 paragraphs: 23
T

In [6]:
# Parse clusters and action points 
gss_clusters = parse_gss_clusters(gss_paragraphs)
print(f"Detected clusters: {len(gss_clusters)}")
for k, v in gss_clusters.items():
    print(f" • {k}: {len(v)} action points")
#  first cluster details
first_cluster = next(iter(gss_clusters)) if gss_clusters else None
if first_cluster:
    print("\nFirst cluster preview:", first_cluster)
    for ap in gss_clusters[first_cluster][:3]:
        print(" -", ap)

Detected clusters: 6
 • Human Flourishing: 4 action points
 • Climate Action and Sustainability: 5 action points
 • Digital and AI Transformation: 5 action points
 • Geoeconomics and Trade: 4 action points
 • Global Finance: 4 action points
 • Global Governance and Multilateralism: 4 action points

First cluster preview: Human Flourishing
 - Champion a global narrative shift from economic growth to human flourishing, grounded in equity, sustainability, and dignity.
 - Reform financial systems to support inclusive, long-term development.
 - Empower communities to co-create solutions, especially in the Global South, through grassroots innovation and development corridors


In [7]:
#  Split T20 into sentences 
t20_sent_rows = split_t20_into_sentences(t20_paragraphs)
print(f"T20 sentences: {len(t20_sent_rows)}")
print("T20 sentence sample:")
for r in t20_sent_rows[:5]:
    print(" -", r)

T20 sentences: 62
T20 sentence sample:
 - {'sentence': '1.1.', 'section': 'Trade and Investment'}
 - {'sentence': 'Empower the WTO to preserve and reform the multilateral trading system: The G20 should empower the WTO by strengthening the WTO Secretariat and making additional financial resources available.', 'section': 'Trade and Investment'}
 - {'sentence': 'The decision-making process should also be made more flexible, this will help advance and incorporate pro-development plurilateral agreements, such as the Investment Facilitation for Development (IFD) Agreement, into the WTO Framework.', 'section': 'Trade and Investment'}
 - {'sentence': 'The dispute settlement system needs to be reformed to strengthen deliberative processes and preventive mechanisms, in the interim building on the Multi-Party Interim Appeal Arbitration Arrangement (MPIA).', 'section': 'Trade and Investment'}
 - {'sentence': 'The Generalised System of Preferences (GSP) should be reviewed to provide predictable, lo

In [8]:
# Sentence-BERT model 
model = SentenceTransformer(SBERT_MODEL)
print("Loaded SBERT:", SBERT_MODEL)

# Embed T20 sentences 
t20_sentences = [r["sentence"] for r in t20_sent_rows]
t20_sections = [r["section"] for r in t20_sent_rows]
t20_emb = embed_texts(model, t20_sentences)
print("T20 embedding shape:", t20_emb.shape)

Loaded SBERT: sentence-transformers/all-MiniLM-L6-v2
T20 embedding shape: (62, 384)


  return forward_call(*args, **kwargs)


In [9]:
#  Test one cluster before running all 
test_cluster = first_cluster  
if test_cluster:
    ap_texts = gss_clusters[test_cluster]
    ap_emb = embed_texts(model, ap_texts)
    sims = cosine_similarity(ap_emb, t20_emb)  # [n_ap, n_t20]
    print(f"Testing cluster: {test_cluster} | APs: {len(ap_texts)}")
    # Top matches for the first action point only
    ap_idx = 0
    top_idx = np.argsort(-sims[ap_idx])[:TOP_K]
    print("\nAction Point:", ap_texts[ap_idx])
    for rank, ti in enumerate(top_idx, start=1):
        print(f"  {rank}. ({sims[ap_idx][ti]:.3f})", t20_sentences[ti], "| Section:", t20_sections[ti])

Testing cluster: Human Flourishing | APs: 4

Action Point: Champion a global narrative shift from economic growth to human flourishing, grounded in equity, sustainability, and dignity.
  1. (0.487) Champion comprehensive ‘whole-of society, whole-of-economy’ just transition taxonomies, and exercise political leadership in UNFCCC negotiations to secure concrete outcomes and the means of implementation for just transitions at global and local levels: The G20 should lead on local, national, regional and international just transition policies and taxonomies in line with a ‘whole-of-government, whole-of-society’ approach that pursue, among others, poverty alleviation, social equity and resilience, gender equality and economic empowerment. | Section: Accelerating Climate Action and the Just Energy Transition
  2. (0.453) Operationalise the Global Alliance Against Hunger and Poverty and Build Equitable Food Systems: The G20 must monitor and evaluate the operationalisation of the Global Allianc

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [10]:
# Full run: all clusters, build DataFrame 
all_rows = []
for cluster, ap_list in gss_clusters.items():
    ap_emb = embed_texts(model, ap_list)
    sims = cosine_similarity(ap_emb, t20_emb)
    for ap_idx, ap in enumerate(ap_list):
        top_idx = np.argsort(-sims[ap_idx])[:TOP_K]
        for rank, ti in enumerate(top_idx, start=1):
            all_rows.append({
                "cluster": cluster,
                "ap_index": ap_idx,
                "action_point": ap,
                "rank": rank,
                "t20_sentence": t20_sentences[ti],
                "t20_section": t20_sections[ti],
                "score": float(sims[ap_idx][ti]),
            })

results_df = pd.DataFrame(all_rows)
print("Results size:", results_df.shape)
results_df.head()

  return forward_call(*args, **kwargs)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)


Results size: (78, 7)


  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  ret = a @ b
  ret = a @ b
  ret = a @ b


Unnamed: 0,cluster,ap_index,action_point,rank,t20_sentence,t20_section,score
0,Human Flourishing,0,Champion a global narrative shift from economi...,1,"Champion comprehensive ‘whole-of society, whol...",Accelerating Climate Action and the Just Energ...,0.487384
1,Human Flourishing,0,Champion a global narrative shift from economi...,2,Operationalise the Global Alliance Against Hun...,Solidarity for the Achievement of the Sustaina...,0.453438
2,Human Flourishing,0,Champion a global narrative shift from economi...,3,Take actions to bring down the cost of capital...,Financing for Sustainable Development,0.425814
3,Human Flourishing,1,"Reform financial systems to support inclusive,...",1,Finance SDG Gaps through a Reform of the Globa...,Solidarity for the Achievement of the Sustaina...,0.48355
4,Human Flourishing,1,"Reform financial systems to support inclusive,...",2,"In addition, strengthened South–South and Nort...",Accelerating Climate Action and the Just Energ...,0.435472


In [11]:
#select onlz column 3 and 5 from results_df
results_df_2 = results_df[["action_point", "t20_sentence"]]

#save csv
#results_df_2.to_csv("gss_t20_similarity_results.csv", index=False)

In [12]:
#  HTML report
results_by_cluster = {}
for cluster, df in results_df.groupby("cluster"):
    results_by_cluster[cluster] = df.copy()

html = build_html(results_by_cluster)
OUT_HTML.write_text(html, encoding="utf-8")
print("Wrote HTML:", OUT_HTML.resolve())

Wrote HTML: /Users/fernandaortega/Library/CloudStorage/OneDrive-GlobalSolutionsInitiativeFoundationgGmbH/General/06_Program & Research/Recoupling Dashboard/Final-impact report/gss_t20_similarity.html


### Ontological topics

In [16]:
# %% [markdown]
# GSS 2025 (Action Points) ↔ T20 (Sentence Recommendations)
# (HTML export without Rank/Score; top filter by session)

# %%
# --- 0) Imports ---
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import docx
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# %%
# --- 1) Config: paths, model, knobs ---
# Edit these to your actual file locations
PATH_ACTION_POINTS = Path("Action Points GSS 2025.docx")
PATH_T20           = Path("T20 communique.docx")
OUT_HTML           = Path("gss_t20_similarity_report.html")

SBERT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K       = 3  # top matches per AP

# Topic labeling knobs
TOPIC_THRESHOLD = 0.40   # similarity cutoff to accept a label (lower = more permissive)
TOPIC_TOP_K    = 5       # max semantic labels to keep per text (lexical hits are added regardless)

# %%
# --- 2) Topic Ontology (edit/extend freely) ---
TOPIC_ONTOLOGY: Dict[str, List[str]] = {
    # Human flourishing / social
    "Well-being": [
        "well-being", "wellbeing", "quality of life", "human flourishing",
        "life satisfaction", "multidimensional well-being", "subjective well-being",
        "whole-of-society", "poverty"
    ],
    "Equity": [
        "equity", "equality", "social equity", "fairness",
        "distributional justice", "inclusive growth", "inequalities",
        "asymmetries", "Empower"
    ],
    "Vulnerable populations": [
        "vulnerable groups", "vulnerable populations", "marginalised communities",
        "at-risk groups", "refugees", "displaced persons"
    ],
    "Social protection": [
        "social protection", "safety nets", "social protection floors",
        "cash transfers", "universal access to social services", "protect worker"
    ],
    "Healthcare": ["healthcare", "health services", "health systems"],
    "Education": ["education", "education system", "continuous education"],
    "Gender equality": ["gender equality", "gender-responsive", "women empowerment", "gender and age responsive"],
    "Youth participation": ["youth participation", "youth inclusion", "young people engagement"],
    "Well-Being Economy": [
        "well-being economy", "economy of well-being", "wellbeing economy",
        "shift from economic growth", "whole-of-economy", "long-term development",
        "inclusive governance frameworks", "economic transformation"
    ],

    # Governance / participation / data
    "Inclusive governance": [
        "inclusive governance", "participatory governance", "whole-of-society",
        "whole-of-government", "community-led", "people-centred"
    ],
    "Policy relevance": [
        "policy-relevant", "evidence-informed policy", "policy design", "policy evaluation"
    ],
    "Data governance": [
        "data governance", "integrated data governance", "data interoperability",
        "data standards", "algorithmic transparency", "data rights"
    ],
    "Institutional reform": ["institutional arrangements", "institutional reform", "restructuring systems"],
    "Data rights and privacy": [
        "data rights", "data control", "personal data", "privacy",
        "privacy protection", "data protection", "GDPR"
    ],
    "Algorithmic transparency and accountability": [
        "algorithmic decisions", "algorithmic transparency", "algorithmic accountability",
        "explainable AI", "responsible AI", "AI accountability"
    ],
    "Open-source and civic tech": [
        "open-source", "civic tech", "nonprofit platforms", "small developers", "public interest technology"
    ],
    "Digital public infrastructure (DPI)": [
        "digital public infrastructure", "DPI", "people-first digital infrastructure",
        "whole-of-society digital infrastructure", "sovereign digital systems"
    ],
    "Digital inclusion": [
        "digital inclusion", "digital inequalities", "digital divide", "offline access", "ai literacy"
    ],
    "AI and human flourishing": [
        "AI literacy", "reskilling", "worker rights",
        "responsible AI", "human-centred AI", "AI for human capabilities"
    ],
    "Ethical tech and digital governance": [
        "ethical tech", "tech safeguards", "ethical technology use",
        "participatory digital governance", "civil society in digital governance"
    ],
    "Digital sovereignty and interoperability": [
        "digital sovereignty", "sovereign digital systems", "global interoperability", "cross-border digital governance"
    ],
    "Digital economy and competition": [
        "digital economy", "antitrust", "competition policy",
        "collective bargaining", "platform power", "tech monopolies"
    ],
    "Data as a public good": [
        "data as a public asset", "data commons", "community data", "public data governance"
    ],
    "Evidence-based digital policy": [
        "digital statistics", "digital policy", "data for policy", "evidence-based digital policy", "digital metrics"
    ],
    "Global digital cooperation": [
        "international cooperation on digital", "global digital forums",
        "digital technology for sustainable development", "global digital governance"
    ],
    "AI literacy and skills": [
        "AI literacy", "reskilling", "digital skills", "inclusive reskilling", "AI education", "capacity-building for AI"
    ],
    "Responsible and human-centred AI": [
        "responsible AI", "human-centred AI", "AI for human flourishing",
        "AI for human capabilities", "ethical AI"
    ],
    "AI governance and accountability": [
        "AI governance", "AI regulation", "algorithmic accountability", "algorithmic transparency", "AI oversight"
    ],
    "Digital inclusion and equity": [
        "digital inclusion", "digital inequalities", "equitable digital access",
        "AI for inclusion", "support local innovation"
    ],
    "Skills development and capacity building": [
        "skills development", "capacity-building", "education systems", "reskilling programs",
        "knowledge sharing", "human capital", "strengthen capacity", "inclusive reskilling", "co-create solutions"
    ],
    "Technology transfer and cooperation": [
        "technology transfer", "co-development of technology"
    ],
    "Social protection and digitalisation": [
        "social protection systems", "universal access to services",
        "healthcare and education access", "digitalisation and welfare"
    ],

    # Climate / environment / transitions
    "Climate finance": [
        "climate finance", "UNFCCC", "just transitions",
        "climate-aligned investment", "low-carbon investment"
    ],
    "Just transition": [
        "just transition", "transition taxonomies", "just transition policies",
        "whole-of-society approach", "whole-of-government approach", "community-led", "just transitions"
    ],
    "Climate risk": [
        "climate risk", "environmental risks", "biodiversity collapse",
        "climate shocks", "climate vulnerability"
    ],
    "Biodiversity": ["biodiversity"],
    "Bioeconomy": [
        "bioeconomy", "climate-biodiversity nexus",
        "nature-based solutions", "circular materials", "circular economy",
        "nature positive", "nature credits", "ecosystem restoration", "nature finance"
    ],
    "Resilience": [
        "resilience", "social resilience", "adaptive capacity",
        "community resilience", "disaster preparedness"
    ],
    "Critical minerals": [
        "critical minerals", "critical mineral value chains",
        "fair benefit-sharing", "equitable green industrialisation"
    ],
    "Green industrialisation": [
        "green industrialisation", "low-carbon development",
        "industrial policy for green transition", "value chain integration"
    ],
    "Infrastructure transition": [
        "energy grids", "digital infrastructure", "social infrastructure", "transport networks"
    ],
    "Nature finance": [
        "nature finance", "nature credits", "biodiversity finance",
        "environmental finance", "sustainable finance for ecosystems"
    ],
    "Food systems": [
        "food systems", "equitable food systems", "hunger and poverty",
        "agri-food systems", "sustainable food production"
    ],
    "Regional cooperation": [
        "regional integration", "intra-regional trade", "regional cooperation",
        "regional collaboration", "regional platforms", "regional partnerships",
        "north-south collaboration", "south-south collaboration"
    ],
    "Collaboration and partnerships": ["collaboration", "partnerships", "joint efforts"],

    "International cooperation": [
    "international cooperation", "global cooperation", "multilateral cooperation",
    "bilateral cooperation", "international partnerships", "global governance",
    "cross-border collaboration", "transnational cooperation", 
    "international platforms", "international alliances"
    ],


    

    # Finance / debt / investment
    "Finance reform": [
        "financial system reform", "reform global finance", "financial institutions reform",
        "sustainable finance", "reform the multilateral trading system"
    ],
    "Debt relief": ["debt relief", "debt swaps", "debt resolution", "debt sustainability"],
    "Reduce cost of capital": [
        "reduce cost of capital", "lower cost of capital", "cost of capital reduction", "cost of capital"
    ],
    "Progressive taxation": ["progressive fiscal policies", "wealth tax", "tax cooperation", "UN Tax Convention"],
    "SDG financing": ["finance SDG gaps", "SDG financing"],

    # Trade / industrial policy / minerals
    "Trade facilitation": [
        "trade rules", "trade cooperation", "interoperability of trade systems", "standards harmonisation"
    ],
    "Industrial policy and value addition": [
        "industrial policy", "value addition", "domestic industrial development", "industrial upgrading"
    ],
    "Trade and sustainability": ["trade and sustainability", "environmental standards in trade"],

    # Infrastructure / innovation / tech
    "Infrastructure": ["infrastructure", "energy grids", "transport networks", "social infrastructure", "digital infrastructure"],
    "Innovation & AI": ["ai tools", "data systems", "traceability systems", "technology co-development"],
    "Circular economy": ["circular materials", "circular economy"],

    "Private sector participation": [
        "private capital", "mobilize private capital", "private sector investment",
        "public-private partnerships", "PPP", "PPPs", "blended finance",
        "crowding in private finance", "private sector participation"
    ],
    "Finance and investment mechanisms": [
        "finance mechanisms", "financial instruments", "blended finance",
        "impact bonds", "sustainable finance instruments", "outcome-focused finance",
        "green bonds", "debt-for-nature swaps", "debt-for-SDG swaps"
    ],

    "Sustainability Standards & Measurement": [
        "biodiversity measurement", "minerals governance framework",
        "monitoring and reporting", "standards"
    ],

    # Participation / accountability
    "Accountability & participation": ["accountability", "participation", "benefit-sharing", "community-led", "feedback loops"],

    # Empowerment / leadership
    "Economic empowerment": [
        "economic empowerment", "empower communities", "transfer and co-development", "strengthen capacity in the Global South"
    ],
    "G20 Global Leadership": ["influence of G20", "G20 should", "G20 must"]
}

# %%
# --- 3) Utilities to read docs and parse ---
def read_docx_paragraphs(path: Path):
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path.resolve()}")
    doc = docx.Document(str(path))
    paras = [p.text.strip() for p in doc.paragraphs]
    return [p for p in paras if p.strip()]

def looks_like_cluster_header(line: str) -> bool:
    if not line:
        return False
    if line.startswith("["):
        return False
    if line.lstrip().startswith("- "):
        return False
    if re.match(r"^\d+(\.|:)?\s", line):
        return False
    return True

def parse_gss_clusters(paragraphs):
    clusters = defaultdict(list)
    current_cluster = None
    for line in paragraphs:
        txt = line.strip()
        if not txt:
            continue
        if looks_like_cluster_header(txt) and not txt.startswith("[]"):
            current_cluster = txt
            _ = clusters[current_cluster]
            continue
        if txt.startswith("[]") or ("[" in txt and "]" in txt):
            ap = re.sub(r"^\s*\[\s*\]\s*", "", txt).strip()
            if not current_cluster:
                current_cluster = "Ungrouped"
            if ap:
                clusters[current_cluster].append(ap)
    return {k: v for k, v in clusters.items() if v}

def normalize_whitespace(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def split_t20_into_sentences(paragraphs):
    sentences = []
    current_section = None
    for raw in paragraphs:
        line = raw.strip()
        if not line:
            continue
        # treat title-ish lines as sections (no period at end)
        if looks_like_cluster_header(line) and not line.endswith("."):
            current_section = line
            continue
        for s in sent_tokenize(line):
            s = normalize_whitespace(s)
            if len(s) >= 3:
                sentences.append({"sentence": s, "section": current_section})
    # dedupe
    seen, out = set(), []
    for row in sentences:
        key = (row["sentence"], row["section"])
        if key not in seen:
            seen.add(key)
            out.append(row)
    return out

# %%
# --- 4) Embedding helpers ---
def embed_texts(model, texts):
    return model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

def build_topic_label_embeddings(model, ontology: Dict[str, List[str]]) -> Dict[str, np.ndarray]:
    """
    Build a single embedding per label by averaging its seed phrase embeddings,
    then re-normalize the mean vector to unit length (so dot==cosine).
    """
    label_embs = {}
    for label, seeds in ontology.items():
        seed_embs = embed_texts(model, seeds)  # already normalized
        mean_vec  = seed_embs.mean(axis=0)
        norm = np.linalg.norm(mean_vec)
        if norm > 0:
            mean_vec = mean_vec / norm
        label_embs[label] = mean_vec
    return label_embs

# %%
# --- 5) Lexical fallback (regex) ---
def build_lexicon_index(ontology: Dict[str, List[str]]):
    """
    Compile regex for each seed phrase for fast, case-insensitive matching.
    Uses word boundaries; also handles simple hyphen/space variants.
    """
    idx = {}
    for label, seeds in ontology.items():
        pats = []
        for s in seeds:
            s_esc = re.escape(s).replace(r"\-", r"[-\s]")  # "well-being" ~ "well being"
            pats.append(rf"(?i)\b{s_esc}\b")
        idx[label] = [re.compile(p) for p in pats]
    return idx

def lexical_hits(text: str, lex_index) -> List[str]:
    hits = []
    for label, regs in lex_index.items():
        if any(r.search(text) for r in regs):
            hits.append(label)
    return hits

# %%
# --- 6) Topic assignment (semantic + lexical union) ---
def assign_topics(
    texts: List[str],
    model,
    label_embs: Dict[str, np.ndarray],
    threshold: float = TOPIC_THRESHOLD,
    top_k: int = TOPIC_TOP_K,
    lex_index=None
) -> Tuple[List[List[str]], np.ndarray]:
    labels = list(label_embs.keys())
    label_mat = np.vstack([label_embs[l] for l in labels])   # (L, dim), normalized
    text_embs = embed_texts(model, texts)                    # (N, dim), normalized
    sims = text_embs @ label_mat.T                           # cosine similarities

    topics_per_text = []
    for i, txt in enumerate(texts):
        # 1) semantic picks by cosine
        idx_sorted = np.argsort(-sims[i])
        sem = []
        for j in idx_sorted:
            if sims[i, j] >= threshold:
                sem.append(labels[j])
            if len(sem) >= top_k:
                break

        # 2) lexical picks (guaranteed if literal seed appears)
        lex = lexical_hits(txt, lex_index) if lex_index is not None else []

        # 3) union with semantic-first order, then lexical remainders
        seen = set(sem)
        merged = sem + [l for l in lex if l not in seen]

        topics_per_text.append(merged)
    return topics_per_text, sims

# %%
# --- 7) HTML builder (no rank/score; with top filter by cluster) ---
def build_html(results_by_cluster):
    clusters = list(results_by_cluster.keys())

    css = """
    <style>
      :root { --border:#e5e7eb; --muted:#555; --muted2:#444; --bg:#f8fafc; --badge:#eef2ff; --badgeText:#3730a3; }
      body { font-family: system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif; margin: 24px; color: #111; }
      h1 { font-size: 28px; margin-bottom: 8px; }
      h2 { font-size: 22px; margin: 28px 0 8px; }
      table { border-collapse: collapse; width: 100%; margin: 12px 0 28px 0; }
      th, td { border: 1px solid var(--border); padding: 10px 12px; vertical-align: top; }
      th { background: var(--bg); text-align: left; font-weight: 600; }
      tr:nth-child(even) { background: #fafafa; }
      .ap { font-weight: 600; }
      .subtle { color: var(--muted); font-size: 12px; margin-top: 4px; }
      .section { color: var(--muted2); font-size: 12px; margin-top: 4px; }
      .cluster-badge { display:inline-block; background:var(--badge); color:var(--badgeText); padding:4px 8px; border-radius:12px; font-size:12px; margin-left:6px; }
      .chip { display:inline-block; padding:2px 8px; margin:2px 4px 0 0; border:1px solid var(--border); border-radius:999px; font-size:12px; background:var(--bg); }
      .toolbar { display:flex; gap:12px; align-items:center; padding:12px; border:1px solid var(--border); border-radius:12px; background:#fff; margin: 12px 0 20px; position:sticky; top:0; z-index:5; }
      .toolbar label { font-size:14px; color:#111; }
      .toolbar select { padding:8px 10px; border:1px solid var(--border); border-radius:10px; }
      .toolbar button { padding:8px 12px; border:1px solid var(--border); border-radius:10px; background:#111; color:#fff; cursor:pointer; }
      .toolbar button:hover { opacity:0.9; }
      .footer { margin-top: 40px; color: #666; font-size: 12px; }
      section.cluster { margin-bottom: 28px; }
    </style>
    """

    js = """
    <script>
      function applyClusterFilter() {
        var sel = document.getElementById('clusterSelect').value;
        var secs = document.querySelectorAll('section.cluster');
        secs.forEach(function(sec){
          if (sel === 'ALL' || sec.dataset.cluster === sel) {
            sec.style.display = '';
          } else {
            sec.style.display = 'none';
          }
        });
        window.scrollTo({top:0, behavior:'smooth'});
      }
      document.addEventListener('DOMContentLoaded', function(){
        var sel = document.getElementById('clusterSelect');
        if (sel) {
          sel.addEventListener('keydown', function(e){
            if (e.key === 'Enter') applyClusterFilter();
          });
        }
      });
    </script>
    """

    html = [f"<!DOCTYPE html><html><head><meta charset='utf-8'><title>GSS–T20 Similarity</title>{css}{js}</head><body>"]
    html.append("<h1>GSS 2025 Action Points ↔ T20 Recommendations</h1>")

    # Toolbar with cluster filter
    html.append("<div class='toolbar'>")
    html.append("<label for='clusterSelect'>Filter by GSS cluster:</label>")
    html.append("<select id='clusterSelect' aria-label='Filter by GSS session'>")
    html.append("<option value='ALL'>Show all</option>")
    for c in clusters:
        c_opt = str(c).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        html.append(f"<option value='{c_opt}'>{c_opt}</option>")
    html.append("</select>")
    html.append("<button onclick='applyClusterFilter()'>Apply</button>")
    html.append("</div>")

    # Cluster sections (no Rank/Score columns)
    for cluster, df in results_by_cluster.items():
        cluster_safe = str(cluster).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        html.append(f"<section class='cluster' data-cluster='{cluster_safe}'>")
        html.append(f"<h2>{cluster_safe} <span class='cluster-badge'>Top matches</span></h2>")
        html.append("<table>")
        html.append("<tr><th style='width:34%'>GSS Action Point</th><th>Match (T20 Recommendation)</th><th style='width:24%'>Overlap topics</th></tr>")

        last_ap_idx = None
        for _, r in df.sort_values(["ap_index", "rank"]).iterrows():
            ap_cell = ""
            if r["ap_index"] != last_ap_idx:
                ap_cell = f"<div class='ap'>{r['action_point']}</div>"
                if 'ap_topics' in r and isinstance(r['ap_topics'], str) and r['ap_topics'].strip():
                    ap_cell += f"<div class='subtle'>AP topics: {r['ap_topics']}</div>"
                last_ap_idx = r["ap_index"]

            # T20 cell with section + topics
            t20_cell = f"{r['t20_sentence']}"
            section_badge = (
                f"<div class='section'>Section: {r['t20_section']}</div>"
                if pd.notna(r['t20_section']) and r['t20_section'] else ""
            )
            t20_cell += section_badge
            if 't20_topics' in r and isinstance(r['t20_topics'], str) and r['t20_topics'].strip():
                t20_cell += f"<div class='subtle'>T20 topics: {r['t20_topics']}</div>"

            # overlap chips
            chips = ""
            if 'overlap_topics' in r and isinstance(r['overlap_topics'], str) and r['overlap_topics'].strip():
                chips = "".join([f"<span class='chip'>{t.strip()}</span>" for t in r['overlap_topics'].split(",") if t.strip()])

            html.append(
                "<tr>"
                f"<td>{ap_cell}</td>"
                f"<td>{t20_cell}</td>"
                f"<td>{chips}</td>"
                "</tr>"
            )
        html.append("</table>")
        html.append("</section>")

    html.append("<div class='footer'>Model: SentenceTransformer ‘all-MiniLM-L6-v2’.</div>")
    html.append("</body></html>")
    return "\n".join(html)

# %%
# --- 8) Read inputs ---
gss_paragraphs = read_docx_paragraphs(PATH_ACTION_POINTS)
t20_paragraphs = read_docx_paragraphs(PATH_T20)

# %%
# --- 9) Parse clusters and action points (checkpoint) ---
gss_clusters = parse_gss_clusters(gss_paragraphs)
print(f"Detected clusters: {len(gss_clusters)}")
for k, v in gss_clusters.items():
    print(f" • {k}: {len(v)} action points")

first_cluster = next(iter(gss_clusters)) if gss_clusters else None
if first_cluster:
    print("\nFirst cluster preview:", first_cluster)
    for ap in gss_clusters[first_cluster][:3]:
        print(" -", ap)

# %%
# --- 10) Split T20 into sentences (checkpoint) ---
t20_sent_rows = split_t20_into_sentences(t20_paragraphs)
print(f"T20 sentences: {len(t20_sent_rows)}")
print("T20 sentence sample:")
for r in t20_sent_rows[:5]:
    print(" -", r)

# %%
# --- 11) Load Sentence-BERT model + build ontology embeddings + lexical index ---
model = SentenceTransformer(SBERT_MODEL)
print("Loaded SBERT:", SBERT_MODEL)

label_embeddings = build_topic_label_embeddings(model, TOPIC_ONTOLOGY)
lex_index = build_lexicon_index(TOPIC_ONTOLOGY)
print(f"Built label embeddings for {len(label_embeddings)} topics; lexical index ready.")

# %%
# --- 12) Embed T20 sentences + label topics (checkpoint) ---
t20_sentences = [r["sentence"] for r in t20_sent_rows]
t20_sections  = [r["section"]  for r in t20_sent_rows]
t20_emb = embed_texts(model, t20_sentences)
print("T20 embedding shape:", t20_emb.shape)

t20_topics, _t20_topic_sims = assign_topics(
    t20_sentences, model, label_embeddings,
    threshold=TOPIC_THRESHOLD, top_k=TOPIC_TOP_K,
    lex_index=lex_index
)
print("T20 topics sample:", list(zip(t20_sentences[:3], t20_topics[:3])))

# %%
# --- 13) Full run: all clusters, build DataFrame (checkpoint) ---
all_rows = []
for cluster, ap_list in gss_clusters.items():
    # topics for APs in this cluster
    ap_topics, _ap_topic_sims = assign_topics(
        ap_list, model, label_embeddings,
        threshold=TOPIC_THRESHOLD, top_k=TOPIC_TOP_K,
        lex_index=lex_index
    )

    # embeddings + sims
    ap_emb = embed_texts(model, ap_list)
    sims = cosine_similarity(ap_emb, t20_emb)

    for ap_idx, ap in enumerate(ap_list):
        top_idx = np.argsort(-sims[ap_idx])[:TOP_K]
        for rank, ti in enumerate(top_idx, start=1):
            overlap = sorted(set(ap_topics[ap_idx]) & set(t20_topics[ti]))
            all_rows.append({
                "cluster": cluster,
                "ap_index": ap_idx,
                "action_point": ap,
                "ap_topics": ", ".join(ap_topics[ap_idx]),
                "rank": rank,  # kept for sorting only; not displayed in HTML
                "t20_sentence": t20_sentences[ti],
                "t20_section": t20_sections[ti],
                "t20_topics": ", ".join(t20_topics[ti]),
                "overlap_topics": ", ".join(overlap),
                "score": float(sims[ap_idx][ti]),  # kept for potential diagnostics
            })

results_df = pd.DataFrame(all_rows)
print("Results size:", results_df.shape)
print(results_df.head())

# %%
# --- 14) Build & write HTML report ---
results_by_cluster = {c: df.copy() for c, df in results_df.groupby("cluster")}
html = build_html(results_by_cluster)
OUT_HTML.write_text(html, encoding="utf-8")
print("Wrote HTML:", OUT_HTML.resolve())


Detected clusters: 6
 • Human Flourishing: 4 action points
 • Climate Action and Sustainability: 5 action points
 • Digital and AI Transformation: 5 action points
 • Geoeconomics and Trade: 4 action points
 • Global Finance: 4 action points
 • Global Governance and Multilateralism: 4 action points

First cluster preview: Human Flourishing
 - Champion a global narrative shift from economic growth to human flourishing, grounded in equity, sustainability, and dignity.
 - Reform financial systems to support inclusive, long-term development.
 - Empower communities to co-create solutions, especially in the Global South, through grassroots innovation and development corridors
T20 sentences: 62
T20 sentence sample:
 - {'sentence': '1.1.', 'section': 'Trade and Investment'}
 - {'sentence': 'Empower the WTO to preserve and reform the multilateral trading system: The G20 should empower the WTO by strengthening the WTO Secretariat and making additional financial resources available.', 'section': '

  return forward_call(*args, **kwargs)


Built label embeddings for 63 topics; lexical index ready.
T20 embedding shape: (62, 384)
T20 topics sample: [('1.1.', []), ('Empower the WTO to preserve and reform the multilateral trading system: The G20 should empower the WTO by strengthening the WTO Secretariat and making additional financial resources available.', ['G20 Global Leadership', 'International cooperation', 'Finance reform', 'Economic empowerment', 'Trade facilitation', 'Equity']), ('The decision-making process should also be made more flexible, this will help advance and incorporate pro-development plurilateral agreements, such as the Investment Facilitation for Development (IFD) Agreement, into the WTO Framework.', ['International cooperation', 'Policy relevance', 'Green industrialisation'])]
Results size: (78, 10)
             cluster  ap_index  \
0  Human Flourishing         0   
1  Human Flourishing         0   
2  Human Flourishing         0   
3  Human Flourishing         1   
4  Human Flourishing         1   

 

  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  sims = text_embs @ label_mat.T                           # cosine similarities
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return forward_call(*args, **kwargs)
  sims = text_embs @ label_m