
Unified News/SNS Issue Clustering & Ranking Pipeline 
--------------------------------------------------------------------------

핵심 기능
- 광고/홍보 드랍(drop_ads) + 이모지/URL 비율 기반 드랍(drop_by_ratio)
- 카테고리별(개인정보/중대재해/아동복지/금융) 시맨틱 필터(PRIVACY_SEED_BY_CATEGORY 시드와의 cosine 유사도)
- UMAP(가변) + HDBSCAN 파라미터 스윕(최적 조합 선택) + KMeans 폴백
- 거대 군집 재귀 분할(refine_recursively) + 이질 군집 자동 분할(split_if_hetero)
- 키워드: 확장 불용어 + n-gram(1~2) 기반 c-TF-IDF + NPMI/KL + SBERT MMR 재랭킹
- 메트릭: DBCV(가능 시), 실루엣, 노이즈, 유사도 분리마진 등
- 정책 관련성(policy seed cosine) 기반 리랭킹 옵션 (salience와 가중 결합)
- 결과: Excel/JSON/CSV

필수 패키지
```pip install pandas numpy scikit-learn sentence-transformers umap-learn hdbscan openpyxl tqdm```


In [None]:

from __future__ import annotations
import argparse
import json
import math
import re
import sys
import random
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import MinMaxScaler

import umap
import hdbscan
from hdbscan.validity import validity_index
from sentence_transformers import SentenceTransformer

# ----------------------------
# Fixed categories for this project
# ----------------------------
CATEGORY_CHOICES = {"개인정보", "중대재해", "아동복지", "금융"}

def validate_category(cat: str) -> str:
    """Return ``cat`` unchanged when it is an accepted category label.

    Accepted labels are the members of ``CATEGORY_CHOICES`` plus the two
    wildcard markers ``"AUTO"`` and ``"ALL"``.

    Raises:
        ValueError: if ``cat`` is none of the accepted labels.
    """
    is_concrete = cat in CATEGORY_CHOICES
    is_wildcard = cat in {"AUTO", "ALL"}
    if is_concrete or is_wildcard:
        return cat
    raise ValueError(f"[category] must be one of {sorted(CATEGORY_CHOICES)} or 'AUTO'/'ALL' (got: {cat})")

# ----------------------------
# Domain-tuned seeds for News & Social Media (고도화 시드)
# ----------------------------

# Per-category seed terms, keyed by the category labels in CATEGORY_CHOICES.
# Looked up through get_privacy_seeds_by_category(); the pipeline encodes the
# selected list into a single seed vector for its optional semantic filter.
PRIVACY_SEED_BY_CATEGORY = {
    "개인정보": [
        "개인정보","수집","처리","제3자","동의","파기","보관","정보통신망",
        "쿠키","식별자","위치정보","얼굴인식","신용정보","유출",
    ],
    "금융": [
        "금융","투자","예금","대출","보험","신용","카드","주식",
        "자산","펀드","리스크","불완전판매","이자","수수료","금리",
        "가상자산","암호화폐","전자금융","송금","결제","사기","보이스피싱",
    ],
    "아동복지": [
        "아동","청소년","아동학대","보호","양육","돌봄","시설","입양",
        "아동권리","방임","아동폭력","성착취","아동음란물","학교폭력",
        "복지","가정폭력","신고","보호자","피해아동",
    ],
    "중대재해": [
        "산업재해","중대재해","안전관리","근로자","노동자","사망사고",
        "현장","건설","제조","안전점검","위험요인","책임","경영책임자",
        "안전보건","사고","부상","예방","노동안전","산재","감독",
    ],
}

# Per-category policy/statute seed terms, keyed by the category labels in
# CATEGORY_CHOICES. Looked up through get_policy_seeds_by_category(); the
# pipeline encodes the selected list into a seed vector and scores cluster
# centroids against it when policy-relevance reranking is enabled.
POLICY_SEED_BY_CATEGORY = {
    "개인정보": [
        "개인정보보호법","정보통신망법","통신비밀보호","위치정보법","신용정보법",
        "개인정보 처리방침","동의","제3자 제공","파기","보관",
        "침해사고","유출","익명화","가명처리","쿠키","식별자","얼굴인식","프로파일링",
    ],
    "금융": [
        "자본시장법","특정금융정보법","전자금융거래법","전자증권법","금융소비자보호법",
        "보험업법","은행법","여신전문금융업법","금융지주회사법","외국환거래법",
        "가상자산이용자보호법","불공정거래","내부통제","금융감독","신용정보법",
        "전자지급수단","핀테크","P2P금융","불완전판매","사기방지",
    ],
    "아동복지": [
        "아동복지법","아동학대범죄처벌법","아동청소년성보호법","소년법","청소년보호법",
        "가정폭력방지법","입양특례법","아동수당법","보육법","아동권리보장원",
        "학교폭력예방법","아동보호전문기관","신고의무","보호명령","피해아동보호",
    ],
    "중대재해": [
        "중대재해처벌법","산업안전보건법","근로기준법","산재보상보험법",
        "건설기술진흥법","화학물질관리법","시설안전법","재난안전관리법",
        "산업안전보건기준","안전보건경영체계","경영책임자처벌","노동안전보건",
        "위험성평가","안전점검","사고보고","안전관리계획","안전규제",
    ],
}

def get_privacy_seeds_by_category(category: str) -> List[str]:
    """Look up the seed-term list of one concrete category.

    The label is first checked with ``validate_category``; the wildcard
    labels ``"AUTO"`` and ``"ALL"`` pass that check but are rejected here
    because no single seed list corresponds to them.

    Raises:
        ValueError: for an unknown label or for ``"AUTO"``/``"ALL"``.
    """
    checked = validate_category(category)
    if checked == "AUTO" or checked == "ALL":
        raise ValueError("get_privacy_seeds_by_category requires a concrete category, not AUTO/ALL")
    return PRIVACY_SEED_BY_CATEGORY[checked]

def get_policy_seeds_by_category(category: str) -> List[str]:
    """Look up the policy seed-term list of one concrete category.

    The label is first checked with ``validate_category``; the wildcard
    labels ``"AUTO"`` and ``"ALL"`` pass that check but are rejected here
    because no single seed list corresponds to them.

    Raises:
        ValueError: for an unknown label or for ``"AUTO"``/``"ALL"``.
    """
    checked = validate_category(category)
    if checked == "AUTO" or checked == "ALL":
        raise ValueError("get_policy_seeds_by_category requires a concrete category, not AUTO/ALL")
    return POLICY_SEED_BY_CATEGORY[checked]

# ----------------------------
# Utils
# ----------------------------
def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def log(msg: str):
    """Print ``msg`` to standard output behind an ``[INFO]`` tag."""
    print("[INFO] {}".format(msg))

# ----------------------------
# Text building & tokenization
# ----------------------------
# Korean particles/endings that tokenize_ko() strips from the end of tokens
# containing Hangul.
_KO_JOSA = {
    "은","는","이","가","을","를","과","와","도","만","까지","부터","에서",
    "으로","로","처럼","보다","에게","께","께서","한테","께서는","하며","하며도",
    "의","이나","나","든지","라도","마저","조차","마냥","께로","만큼","같이",
}

# Stopwords always removed by tokenize_ko(): common Korean function words plus
# URL fragments.
_DEFAULT_STOPWORDS = {
    "하다","되다","있다","없다","같다","때문","통해","대한","그리고","하지만","또한",
    "그","이","저","것","수","등","더","만","및","http","https","www","com","co","kr",
}

# Additional stopwords (greetings, boilerplate endings, repost/review words)
# that extract_keywords_pipeline() passes to tokenize_ko() as ``extra_stop``.
_EXTRA_STOPWORDS = {
    "드립니다","드릴","상황이신가요","이야기하겠습니다","변호사입니다","합니다","되고있습니다",
    "입니다","있습니다","하세요","해주세요","관련하여","관해","대해","위해","때문에","이에",
    "그리고","오늘은","안녕하세요","소개합니다","말씀드리","살펴보","정리해","고생많으십니다",
    "공지","공지사항","리그램","repost","리포스트","리뷰","후기","감사드립니다","감사합니다",
}

# A token is a run of at least two ASCII letters, Hangul syllables or digits.
_TOKEN_PATTERN = re.compile(r"[A-Za-z가-힣0-9]{2,}")

def normalize_text(s: str) -> str:
    """Clean a raw text for tokenisation.

    Non-string input yields ``""``. Otherwise the text is stripped and then,
    in this order, URLs, e-mail-like tokens, and runs of characters that are
    neither word characters, Hangul nor spaces are each replaced by a single
    space; finally every whitespace run is collapsed to one space.
    """
    if not isinstance(s, str):
        return ""
    cleaned = s.strip()
    # Order matters: URLs and e-mails must go before punctuation is removed.
    for pattern in (r"https?://\S+", r"\S+@\S+", r"[^\w가-힣 ]+", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned

def tokenize_ko(s: str, extra_stop: set[str] | None = None) -> List[str]:
    """Split a text into lower-cased tokens with light Korean particle stripping.

    The text is cleaned with ``normalize_text``, lower-cased, and split with
    ``_TOKEN_PATTERN``. Stopwords (``_DEFAULT_STOPWORDS`` plus ``extra_stop``)
    are dropped. For tokens containing Hangul, at most one trailing particle
    from ``_KO_JOSA`` is removed, trying the longest particle first. A token is
    kept only if, after stripping, it has at least two characters and is not
    a stopword.

    Args:
        s: raw text (non-strings are treated as empty by ``normalize_text``).
        extra_stop: optional additional stopwords.

    Returns:
        The surviving tokens in order of appearance.
    """
    stop = set(_DEFAULT_STOPWORDS)
    if extra_stop:
        stop |= set(extra_stop)

    # Derive the suffix lengths from the particle set itself so that every
    # entry (including the three-character ones) can actually match, and try
    # the longest first so "께서는" wins over "는".
    suffix_lengths = sorted({len(josa) for josa in _KO_JOSA}, reverse=True)

    out = []
    for tok in _TOKEN_PATTERN.findall(normalize_text(s).lower()):
        if tok in stop:
            continue
        if re.search(r"[가-힣]", tok):
            for n in suffix_lengths:
                # len(tok) > n keeps at least one character of the stem.
                if len(tok) > n and tok[-n:] in _KO_JOSA:
                    tok = tok[:-n]
                    break
        if len(tok) >= 2 and tok not in stop:
            out.append(tok)
    return out

def build_text(row: pd.Series, title_cols: List[str], body_cols: List[str]) -> str:
    """Concatenate the non-blank string fields of ``row`` into one text.

    Columns are visited in the order ``title_cols`` then ``body_cols``.
    A column contributes only if it is present in ``row`` and holds a string
    with non-whitespace content. The pieces are joined with ``" \\n"``.
    """
    pieces = [
        row[col]
        for col in (*title_cols, *body_cols)
        if col in row and isinstance(row[col], str) and row[col].strip()
    ]
    return " \n".join(pieces)

# ----------------------------
# NEW: Excel category -> 4개 카테고리 매핑
# ----------------------------
_FINANCE_TOKENS = ["자본시장법","특정금융정보법","전자금융거래법","전자증권법","금융소비자보호법"]

def map_excel_category_to_cat4(val: Any) -> str | None:
    """Map the input sheet's ``category`` string to one of the four fixed categories.

    Rules are checked in this order and the first match wins:

    - "개인정보": the value contains '개인정보보호법' or '정보통신망법'
    - "금융": the value contains any entry of ``_FINANCE_TOKENS``
    - "아동복지": the value contains '아동복지법'
    - "중대재해": the value contains '중대재해처벌법'

    Returns ``None`` for non-string or blank input and when no rule matches.
    """
    if not isinstance(val, str):
        return None
    text = val.strip()
    if not text:
        return None

    # (substrings to look for, resulting category), in priority order.
    rules = (
        (("개인정보보호법", "정보통신망법"), "개인정보"),
        (_FINANCE_TOKENS, "금융"),
        (("아동복지법",), "아동복지"),
        (("중대재해처벌법",), "중대재해"),
    )
    for needles, mapped in rules:
        if any(needle in text for needle in needles):
            return mapped
    return None

# ----------------------------
# Ad / spam drop (rules) + ratio filters
# ----------------------------
# Regex alternatives marking a text as advertising/spam. drop_ads() joins them
# with "|" and searches case-insensitively; a hit on any one pattern is enough
# to drop the row.
AD_PATTERNS = [
    r"(문의|구매|입금|택배|주문|협찬|스폰|체험단|광고|AD\b|event)",
    r"(카톡|kakao|라인|LINE|문자)\s*[:：]?\s*[@a-zA-Z0-9_]+",
    r"(☏|☎|전화|010[- ]?\d{4}[- ]?\d{4})",
    r"(할인|특가|프로모션|구독|구매링크|바로가기)",
    r"(쿠폰|이벤트\s*중|선착순|증정|리뷰|당첨)",
    r"(후기|리그램|repost|리포스트|협찬|앵콜공구|기획전|예약|영업시간|라스트오더|연중무휴|방문감사|고객님|테이블에\s*부착.*큐알코드)",
]

def drop_ads(df: pd.DataFrame, text_col="__text__") -> pd.DataFrame:
    """Return a copy of ``df`` without rows whose text matches any ad pattern.

    The patterns in ``AD_PATTERNS`` are combined into one case-insensitive
    regex. Missing texts are treated as empty strings. The number of removed
    rows is reported through ``log``.
    """
    ad_regex = re.compile("|".join(AD_PATTERNS), flags=re.IGNORECASE)
    texts = df[text_col].fillna("").astype(str)
    is_ad = texts.str.contains(ad_regex)
    kept = df[~is_ad].copy()
    log(f"Ad/Spam dropped: {len(df) - len(kept)} rows removed")
    return kept

def drop_by_ratio(df: pd.DataFrame, col="__text__", max_emoji_ratio=0.15, max_url_ratio=0.05) -> pd.DataFrame:
    """Return a copy of ``df`` keeping rows with few symbol characters and URLs.

    For each text two per-character ratios are computed: the count of
    characters outside ``[\\w\\s.,!?@#:/\\-]`` and the count of ``http(s)://``
    occurrences, each divided by the text length (empty texts use length 1).
    A row is kept only if both ratios are within their limits. The number of
    removed rows is reported through ``log``.
    """
    text = df[col].fillna("").astype(str)
    n_chars = text.str.len().replace(0, 1)  # avoid division by zero
    symbol_ratio = text.str.count(r"[^\w\s.,!?@#:/\-]") / n_chars
    link_ratio = text.str.count(r"https?://") / n_chars
    within_limits = (symbol_ratio <= max_emoji_ratio) & (link_ratio <= max_url_ratio)
    kept = df[within_limits].copy()
    log(f"Ratio-based dropped: {len(df) - len(kept)} rows removed")
    return kept

# ----------------------------
# Embedding
# ----------------------------
def embed_with_model(model: SentenceTransformer, texts: List[str],
                     batch_size: int = 64, normalize: bool = True) -> np.ndarray:
    """Encode ``texts`` with ``model.encode`` and return the result as float32.

    A progress bar is always shown; ``normalize`` is forwarded as
    ``normalize_embeddings``.
    """
    encode_kwargs = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "normalize_embeddings": normalize,
    }
    vectors = model.encode(texts, **encode_kwargs)
    return vectors.astype(np.float32)

# ----------------------------
# Dim reduction & clustering helpers
# ----------------------------
@dataclass
class ClusterResult:
    """Outcome of one clustering run."""
    # Cluster id per sample; -1 marks an unassigned sample.
    labels: np.ndarray
    # Per-sample membership confidence.
    probs: np.ndarray
    # The fitted estimator, or None when no clustering could be produced.
    clusterer: Any

def kmeans_fallback(X: np.ndarray, k_range: Tuple[int, int] = (5, 20),
                    batch_kmeans: bool = True, random_state: int = 42) -> ClusterResult:
    """Pick a k-means clustering by sweeping k and maximising the silhouette.

    Every k in ``k_range`` (inclusive) is fitted and scored with the cosine
    silhouette; the best-scoring model is used to label ``X``. The sweep is
    clamped to ``2 <= k <= n_samples - 1`` because scikit-learn rejects
    ``n_clusters > n_samples`` and the silhouette needs between 2 and
    ``n_samples - 1`` distinct labels.

    Args:
        X: sample matrix, one row per sample.
        k_range: inclusive ``(min_k, max_k)`` sweep bounds.
        batch_kmeans: use ``MiniBatchKMeans`` when true, ``KMeans`` otherwise.
        random_state: seed forwarded to the estimator.

    Returns:
        A ``ClusterResult``. When no k yields a scorable clustering, all labels
        are -1, all probabilities are 0 and ``clusterer`` is ``None``;
        otherwise probabilities are the constant 0.8.
    """
    n_samples = X.shape[0]
    k_lo = max(2, int(k_range[0]))
    k_hi = min(int(k_range[1]), n_samples - 1)

    if not batch_kmeans:
        from sklearn.cluster import KMeans

    best_score, best_model = -1.0, None
    for k in range(k_lo, k_hi + 1):
        if batch_kmeans:
            km = MiniBatchKMeans(n_clusters=k, random_state=random_state, batch_size=2048)
        else:
            km = KMeans(n_clusters=k, random_state=random_state)
        labels = km.fit_predict(X)
        n_found = len(set(labels))
        if n_found <= 1 or n_found >= len(labels):
            continue
        try:
            score = silhouette_score(X, labels, metric='cosine')
        except Exception:
            # Best effort: an unscorable candidate simply never wins.
            score = -1.0
        if score > best_score:
            best_score, best_model = score, km

    if best_model is None:
        return ClusterResult(np.full(n_samples, -1), np.zeros(n_samples), None)
    labels = best_model.predict(X)
    probs = np.ones(n_samples) * 0.8
    return ClusterResult(labels, probs, best_model)

def try_hdbscan_grid(X: np.ndarray, grid: List[Tuple[int,int]],
                     metric: str = "euclidean", random_state: int = 42) -> dict | None:
    """Fit HDBSCAN for each ``(min_cluster_size, min_samples)`` pair and keep the best.

    A candidate must yield at least two non-noise clusters. Candidates are
    ranked by ``0.6 * dbcv + 0.4 * (1 - noise_ratio)`` where ``dbcv`` comes from
    ``validity_index`` on the non-noise points (or -1.0 if it cannot be
    computed).

    Args:
        X: sample matrix, one row per sample.
        grid: parameter pairs to try.
        metric: distance metric passed to HDBSCAN.
        random_state: accepted for signature compatibility; not used here.

    Returns:
        ``None`` if no candidate qualifies, else a dict with keys ``model``,
        ``labels``, ``probs``, ``dbcv``, ``noise``, ``mcs``, ``ms``, ``score``.
    """
    n_samples = X.shape[0]
    best = None
    for mcs, ms in grid:
        # Two clusters of at least ``mcs`` points each need 2*mcs samples, and
        # min_samples cannot reach the sample count; such settings would be
        # rejected below anyway (or fail inside HDBSCAN), so skip the fit.
        if n_samples < 2 * mcs or n_samples <= ms:
            continue
        cl = hdbscan.HDBSCAN(
            min_cluster_size=mcs, min_samples=ms,
            metric=metric, prediction_data=True, core_dist_n_jobs=1
        )
        labels = cl.fit_predict(X)
        if len(set(labels) - {-1}) < 2:
            continue
        clustered = labels != -1
        try:
            # validity_index needs float64 input; float32 (e.g. UMAP output)
            # makes it fail, which previously forced dbcv to -1.0 every time.
            dbcv = float(validity_index(np.asarray(X[clustered], dtype=np.float64), labels[clustered]))
        except Exception:
            dbcv = -1.0
        if not math.isfinite(dbcv):
            # A NaN score would make every later comparison false.
            dbcv = -1.0
        noise = float(np.mean(labels == -1))
        score = 0.6 * dbcv + 0.4 * (1.0 - noise)
        if best is None or score > best["score"]:
            best = dict(model=cl, labels=labels,
                        probs=getattr(cl, "probabilities_", np.ones(len(labels))*0.7),
                        dbcv=dbcv, noise=noise, mcs=mcs, ms=ms, score=score)
    return best

def split_cluster_once(embs: np.ndarray, idx: np.ndarray, seed=42) -> np.ndarray | None:
    """Try to split the samples ``embs[idx]`` into at least two sub-clusters.

    Groups with fewer than 30 members are not split. Otherwise the members are
    reduced to 15 dimensions with cosine UMAP and clustered with a small
    HDBSCAN grid.

    Returns:
        Sub-cluster labels aligned with ``idx`` (-1 = noise), or ``None`` when
        no split into two or more sub-clusters was found.
    """
    if len(idx) < 30:
        return None
    projected = umap.UMAP(n_components=15, metric="cosine", random_state=seed).fit_transform(embs[idx])
    best = try_hdbscan_grid(projected, grid=[(6,3), (8,4), (10,5)], metric="euclidean", random_state=seed)
    if best is None:
        return None
    sub_labels = best["labels"]
    n_sub = len(set(sub_labels) - {-1})
    return sub_labels if n_sub >= 2 else None

def refine_recursively(embs: np.ndarray, labels: np.ndarray, max_rounds=2, seed=42) -> np.ndarray:
    """Repeatedly split clusters with ``split_cluster_once``.

    In each round every non-noise cluster present at the start of the round is
    offered to ``split_cluster_once``. When a split is found, its sub-clusters
    receive fresh ids above the current maximum and its noise points become
    -1. Rounds stop early when a round splits nothing.

    Returns:
        A new label array; the input ``labels`` is not modified.
    """
    refined = labels.copy()
    for _round in range(max_rounds):
        any_split = False
        for cid in sorted(set(refined) - {-1}):
            members = np.where(refined == cid)[0]
            sub = split_cluster_once(embs, members, seed=seed)
            if sub is None:
                continue
            # Fresh ids start right above the largest id currently in use.
            offset = int(refined.max()) + 1
            sub = np.asarray(sub)
            refined[members] = np.where(sub == -1, -1, offset + sub)
            any_split = True
        if not any_split:
            break
    return refined

def split_if_hetero(embs: np.ndarray, labels: np.ndarray, split_fn, thresh: float = 0.30, min_size: int = 20, seed: int = 42) -> np.ndarray:
    """Split clusters whose members are, on average, too dissimilar.

    For every non-noise cluster with at least ``min_size`` members the
    heterogeneity ``1 - mean pairwise dot product`` is computed. If it exceeds
    ``thresh``, ``split_fn(embs, idx, seed=seed)`` is asked for sub-labels;
    when it returns at least two sub-clusters they receive fresh ids above the
    current maximum and its noise points become -1.

    Args:
        embs: embedding matrix, one row per sample (the dot product is the
            similarity measure, so rows are presumably unit-normalised).
        labels: cluster id per sample, -1 for noise.
        split_fn: callable ``(embs, idx, seed=...)`` returning sub-labels
            aligned with ``idx`` or ``None``.
        thresh: heterogeneity above which a split is attempted.
        min_size: clusters smaller than this are left alone.
        seed: forwarded to ``split_fn``.

    Returns:
        A new label array; the input ``labels`` is not modified.
    """
    cur = labels.copy()
    for c in sorted(set(cur) - {-1}):
        idx = np.where(cur == c)[0]
        n = len(idx)
        if n < min_size or n < 2:
            continue
        # Mean of e_i . e_j over i < j without the n x n Gram matrix:
        # sum_{i != j} e_i . e_j = ||sum_i e_i||^2 - sum_i ||e_i||^2.
        E = np.asarray(embs[idx], dtype=np.float64)
        total = E.sum(axis=0)
        off_diag_sum = float(total @ total) - float(np.einsum("ij,ij->", E, E))
        hetero = 1.0 - off_diag_sum / (n * (n - 1))
        if hetero <= thresh:
            continue
        sub = split_fn(embs, idx, seed=seed)
        if sub is None or len(set(sub) - {-1}) < 2:
            continue
        base = int(cur.max()) + 1
        for i, li in zip(idx, sub):
            cur[i] = -1 if li == -1 else base + int(li)
    return cur

# ----------------------------
# Hashtag pre-bucketing (optional)
# ----------------------------
def initial_hashtag_bucket(df: pd.DataFrame, col="__text__", topk=200, k: int | None = None, seed: int = 42) -> np.ndarray:
    """Assign each row a coarse bucket id from the hashtags in its text.

    Hashtags (``#`` followed by letters, Hangul, digits or ``_``) are
    lower-cased; the ``topk`` most frequent ones form a binary row-by-tag
    matrix which is clustered with ``MiniBatchKMeans``.

    Args:
        df: input frame.
        col: name of the text column.
        topk: number of most frequent hashtags used as features.
        k: number of buckets; defaults to ``min(30, max(4, len(df) // 200))``.
            It is capped at the number of rows, since k-means cannot form
            more clusters than there are samples.
        seed: random state for the clustering.

    Returns:
        One integer bucket id per row. All zeros when no hashtag is found or
        when only a single bucket is possible.
    """
    from collections import Counter

    n_rows = len(df)
    tags = df[col].fillna("").astype(str).str.findall(r"#([A-Za-z가-힣0-9_]+)").apply(lambda x: [t.lower() for t in x])
    tag_counts = Counter(t for ts in tags for t in ts)
    vocab = [t for t, _ in tag_counts.most_common(topk)]
    if not vocab:
        return np.zeros(n_rows, dtype=int)

    vocab_index = {t: i for i, t in enumerate(vocab)}
    mat = np.zeros((n_rows, len(vocab)), dtype=np.uint8)
    for i, ts in enumerate(tags):
        for t in ts:
            j = vocab_index.get(t)
            if j is not None:
                mat[i, j] = 1
    if mat.sum() == 0:
        return np.zeros(n_rows, dtype=int)

    if k is None:
        k = min(30, max(4, n_rows // 200))
    k = min(int(k), n_rows)
    if k <= 1:
        # A single bucket needs no clustering.
        return np.zeros(n_rows, dtype=int)
    km = MiniBatchKMeans(n_clusters=k, random_state=seed, batch_size=2048)
    return km.fit_predict(mat)

# ----------------------------
# Keyword extraction
# ----------------------------
@dataclass
class KeywordInfo:
    """One candidate keyword of a cluster together with its component scores."""
    # The keyword (a unigram or an n-gram).
    term: str
    # Class-based TF-IDF weight of the term in the cluster.
    ctfidf: float
    # Normalised PMI between the term and the cluster.
    npmi: float
    # The term's KL-divergence contribution for the cluster.
    kl: float
    # Weighted combination of the three component scores.
    score: float

def compute_c_tf_idf_per_cluster_ng(
    docs_tokens: List[List[str]],
    labels: np.ndarray,
    min_df: int = 2,
    ngram_range: Tuple[int,int] = (1,2)
) -> Tuple[Dict[int, Dict[str, float]], Dict[str, int]]:
    """Compute class-based TF-IDF weights of n-grams for every cluster.

    Each document's tokens are joined with spaces and counted with
    ``CountVectorizer``. Counts are summed per cluster (noise label -1 is
    ignored), normalised by the cluster's total count and multiplied by
    ``log((n_clusters + 1) / (clusters_containing_term + 1)) + 1``.

    Args:
        docs_tokens: token list per document.
        labels: cluster id per document, -1 for noise.
        min_df: minimum document frequency passed to ``CountVectorizer``.
        ngram_range: n-gram range passed to ``CountVectorizer``.

    Returns:
        ``(per_cluster, term_index)`` where ``per_cluster[c]`` maps each term
        with positive weight to that weight and ``term_index`` maps each
        vocabulary term to its column. If the vectorizer cannot build a
        vocabulary, every cluster gets an empty mapping and ``term_index`` is
        empty.
    """
    docs_str = [" ".join(toks) for toks in docs_tokens]
    labels_arr = np.asarray(labels)
    unique_labels = sorted([l for l in set(labels_arr) if l != -1])

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    try:
        X = vectorizer.fit_transform(docs_str)
    except ValueError:
        # Raised when no term survives (empty documents or min_df pruning);
        # there is simply nothing to weight in that case.
        return {l: {} for l in unique_labels}, {}
    terms = np.array(vectorizer.get_feature_names_out())
    term_index = {t: i for i, t in enumerate(terms)}

    if not unique_labels:
        return {}, term_index

    # Sum the sparse rows of each cluster in one step instead of densifying
    # every document row individually.
    class_term = np.zeros((len(unique_labels), len(terms)), dtype=np.float64)
    for row, l in enumerate(unique_labels):
        members = np.flatnonzero(labels_arr == l)
        class_term[row] = np.asarray(X[members].sum(axis=0)).ravel()

    class_len = class_term.sum(axis=1, keepdims=True) + 1e-9
    tf_norm = class_term / class_len
    df_terms = (class_term > 0).sum(axis=0)
    idf = np.log((len(unique_labels) + 1) / (df_terms + 1)) + 1
    ctfidf_mat = tf_norm * idf

    per_cluster = {}
    for row, l in enumerate(unique_labels):
        weights = ctfidf_mat[row]
        per_cluster[l] = {terms[j]: float(weights[j]) for j in np.where(weights > 0)[0]}

    return per_cluster, term_index

def compute_npmi_and_kl(
    docs_tokens: List[List[str]],
    labels: np.ndarray,
    term_index: Dict[str,int]
) -> Tuple[Dict[int, Dict[str, float]], Dict[int, Dict[str, float]]]:
    """Score every (cluster, token) pair with NPMI and a KL contribution.

    All statistics are document frequencies (a token counts once per
    document). Noise documents (label -1) contribute to the global
    frequencies but form no cluster.

    - NPMI uses ``P(w, c) = df_c(w) / n_docs``, ``P(w) = df(w) / n_docs`` and
      ``P(c) = n_docs_c / n_docs``.
    - The KL term is ``P(w|c) * log(P(w|c) / P(w))`` with ``P(w|c)`` the
      token's share of the cluster's summed document frequencies and ``P(w)``
      its share of the global summed document frequencies.

    Args:
        docs_tokens: token list per document.
        labels: cluster id per document, -1 for noise.
        term_index: accepted for signature compatibility; not used here.

    Returns:
        ``(npmi_per, kl_per)``, each mapping cluster id to ``{token: value}``.
    """
    n_docs = len(docs_tokens)
    doc_norm = max(n_docs, 1)

    df_global: Dict[str, int] = {}
    for toks in docs_tokens:
        for w in set(toks):
            df_global[w] = df_global.get(w, 0) + 1
    # Computed once; it was previously re-summed for every token.
    global_total = max(sum(df_global.values()), 1)

    clusters = sorted([l for l in set(labels) if l != -1])
    df_cluster: Dict[int, Dict[str, int]] = {l: {} for l in clusters}
    n_docs_cluster: Dict[int, int] = {l: 0 for l in clusters}

    for i, l in enumerate(labels):
        if l == -1:
            continue
        n_docs_cluster[l] += 1
        counts = df_cluster[l]
        for w in set(docs_tokens[i]):
            counts[w] = counts.get(w, 0) + 1

    npmi_per: Dict[int, Dict[str, float]] = {l: {} for l in clusters}
    kl_per: Dict[int, Dict[str, float]] = {l: {} for l in clusters}

    for c in clusters:
        Pc = n_docs_cluster[c] / doc_norm
        total_c = sum(df_cluster[c].values()) + 1e-9
        for w, dfc in df_cluster[c].items():
            dfw = df_global.get(w, 0)

            Pw = dfw / doc_norm
            Pwc = dfc / doc_norm
            if Pwc > 0 and Pw > 0 and Pc > 0:
                pmi = math.log(Pwc / (Pw * Pc))
                denom = -math.log(Pwc)
                npmi_per[c][w] = float(pmi / denom) if denom > 0 else 0.0

            Pw_c = dfc / total_c
            Pw_share = dfw / global_total
            if Pw_c > 0 and Pw_share > 0:
                kl_per[c][w] = float(Pw_c * math.log(Pw_c / Pw_share))

    return npmi_per, kl_per

def select_keywords_primary(
    ctfidf: Dict[int, Dict[str, float]],
    npmi: Dict[int, Dict[str, float]],
    kl: Dict[int, Dict[str, float]],
    top_k: int = 30,
    w_ctfidf: float = 0.6,
    w_npmi: float = 0.25,
    w_kl: float = 0.15,
) -> Dict[int, List[KeywordInfo]]:
    """Rank candidate keywords per cluster by a weighted sum of three scores.

    For every cluster in ``ctfidf`` the candidates are the union of the terms
    scored by any of the three inputs; a missing score counts as 0.0. The
    ``top_k`` best candidates are returned, highest combined score first, with
    ties broken alphabetically by term so the result is reproducible across
    runs.

    Args:
        ctfidf: per-cluster c-TF-IDF weights; its keys define the clusters.
        npmi: per-cluster NPMI values.
        kl: per-cluster KL contributions.
        top_k: number of keywords kept per cluster.
        w_ctfidf: weight of the c-TF-IDF score.
        w_npmi: weight of the NPMI score.
        w_kl: weight of the KL score.

    Returns:
        Mapping from cluster id to its ranked ``KeywordInfo`` list.
    """
    out: Dict[int, List[KeywordInfo]] = {}
    for c, wt in ctfidf.items():
        npmi_c = npmi.get(c, {})
        kl_c = kl.get(c, {})
        infos = []
        for t in set(wt) | set(npmi_c) | set(kl_c):
            ct, nv, kv = wt.get(t, 0.0), npmi_c.get(t, 0.0), kl_c.get(t, 0.0)
            s = (w_ctfidf * ct) + (w_npmi * nv) + (w_kl * kv)
            infos.append(KeywordInfo(t, ct, nv, kv, s))
        # Set iteration order depends on string hashing, so break ties on the
        # term itself rather than on insertion order.
        infos.sort(key=lambda info: (-info.score, info.term))
        out[c] = infos[:top_k]
    return out

def rerank_keywords_sbert(
    keywords_per_cluster: Dict[int, List[KeywordInfo]],
    sbert_model: SentenceTransformer,
    top_m: int = 40,
    final_k: int = 15,
    lambda_div: float = 0.5
) -> Dict[int, List[KeywordInfo]]:
    """Re-rank each cluster's keywords with embedding-based MMR.

    The first ``top_m`` candidates of a cluster are encoded with
    ``sbert_model.encode(..., normalize_embeddings=True)``. Relevance is the
    dot product with the normalised mean of those vectors. Keywords are then
    picked greedily by
    ``(1 - lambda_div) * relevance - lambda_div * max_similarity_to_selected``
    until ``min(final_k, pool size)`` keywords are selected.

    Args:
        keywords_per_cluster: ranked candidates per cluster.
        sbert_model: encoder exposing ``encode``.
        top_m: size of the candidate pool taken from the front of each list.
        final_k: maximum number of keywords returned per cluster.
        lambda_div: weight of the redundancy penalty.

    Returns:
        Mapping from cluster id to the selected keywords in selection order.
    """
    out: Dict[int, List[KeywordInfo]] = {}
    for c, cand in keywords_per_cluster.items():
        if not cand:
            out[c] = []
            continue
        pool = cand[:min(top_m, len(cand))]
        terms = [k.term for k in pool]
        vecs = sbert_model.encode(terms, normalize_embeddings=True)
        mean_vec = vecs.mean(axis=0)
        centroid = mean_vec / (np.linalg.norm(mean_vec) + 1e-9)
        sims_q = vecs @ centroid

        # Fix the target once: bounding by len(remaining) inside the loop
        # shrank the limit as items were picked and cut small pools in half.
        target = min(final_k, len(pool))
        selected_idx: List[int] = []
        remaining = list(range(len(terms)))
        while remaining and len(selected_idx) < target:
            if not selected_idx:
                i = int(np.argmax(sims_q[remaining]))
            else:
                sim_to_sel = np.max(vecs[selected_idx] @ vecs[remaining].T, axis=0)
                mmr = (1 - lambda_div) * sims_q[remaining] - lambda_div * sim_to_sel
                i = int(np.argmax(mmr))
            selected_idx.append(remaining.pop(i))
        out[c] = [pool[i] for i in selected_idx]
    return out

def extract_keywords_pipeline(
    texts: List[str],
    labels: np.ndarray,
    sbert_model: SentenceTransformer,
    min_df: int = 2,
    ngram_range: Tuple[int,int] = (1,2),
    primary_top_k: int = 30,
    final_k: int = 15,
    lambda_div: float = 0.5,
) -> Dict[int, List[KeywordInfo]]:
    """Run the full keyword extraction for clustered texts.

    Steps: tokenise every text (with ``_EXTRA_STOPWORDS``), compute c-TF-IDF,
    compute NPMI/KL, pick the ``primary_top_k`` best candidates per cluster,
    then re-rank them with embedding MMR down to ``final_k`` keywords. The MMR
    pool size is ``min(primary_top_k, 40)``.

    Returns:
        Mapping from cluster id to its final keyword list.
    """
    tokens_list = []
    for text in tqdm(texts, desc="Tokenizing"):
        tokens_list.append(tokenize_ko(text, extra_stop=_EXTRA_STOPWORDS))

    ctfidf_scores, vocab_index = compute_c_tf_idf_per_cluster_ng(
        tokens_list, labels, min_df=min_df, ngram_range=ngram_range
    )
    npmi_scores, kl_scores = compute_npmi_and_kl(tokens_list, labels, vocab_index)
    candidates = select_keywords_primary(ctfidf_scores, npmi_scores, kl_scores, top_k=primary_top_k)

    pool_size = min(primary_top_k, 40)
    return rerank_keywords_sbert(
        candidates, sbert_model, top_m=pool_size, final_k=final_k, lambda_div=lambda_div
    )

# ----------------------------
# Metrics
# ----------------------------
def compute_global_metrics(X_2d: np.ndarray, labels: np.ndarray, probs: np.ndarray, clusterer) -> Dict[str, Any]:
    """Compute clustering-wide quality metrics.

    Args:
        X_2d: coordinates used for the silhouette and DBCV scores, one row per
            sample.
        labels: cluster id per sample, -1 for noise.
        probs: membership confidence per sample.
        clusterer: the fitted estimator (may be ``None``); DBCV and
            persistence are reported only for an ``hdbscan.HDBSCAN`` instance.

    Returns:
        Dict with ``n_clusters``, ``noise_ratio``, ``silhouette``, ``dbcv``,
        ``mean_prob``, ``persistence_mean`` and ``persistence_median``.
        Metrics that cannot be computed are ``None`` (``mean_prob`` is 0.0
        when every sample is noise).
    """
    mask = labels != -1
    n_clusters = len({l for l in set(labels) if l != -1})
    noise_ratio = float(np.mean(labels == -1))
    scorable = int(np.sum(mask)) > 2 and n_clusters > 1
    is_hdbscan = clusterer is not None and isinstance(clusterer, hdbscan.HDBSCAN)

    sil = None
    if scorable:
        try:
            sil = float(silhouette_score(X_2d[mask], labels[mask], metric='euclidean'))
        except Exception:
            sil = None

    dbcv = None
    if scorable and is_hdbscan:
        try:
            # validity_index needs float64 input; float32 coordinates (e.g.
            # UMAP output) make it fail, which silently left dbcv unset.
            value = float(validity_index(np.asarray(X_2d[mask], dtype=np.float64), labels[mask]))
            dbcv = value if math.isfinite(value) else None
        except Exception:
            dbcv = None

    mean_prob = float(np.mean(probs[mask])) if np.any(mask) else 0.0

    persistence_mean = None
    persistence_median = None
    if is_hdbscan and hasattr(clusterer, 'cluster_persistence_'):
        if len(clusterer.cluster_persistence_) > 0:
            persistence_mean = float(np.mean(clusterer.cluster_persistence_))
            persistence_median = float(np.median(clusterer.cluster_persistence_))

    return {
        "n_clusters": n_clusters,
        "noise_ratio": noise_ratio,
        "silhouette": sil,
        "dbcv": dbcv,
        "mean_prob": mean_prob,
        "persistence_mean": persistence_mean,
        "persistence_median": persistence_median,
    }

def cluster_similarity_metrics(embs: np.ndarray, labels: np.ndarray) -> Dict[int, Dict[str, float]]:
    """Measure cohesion and separation of every non-noise cluster.

    For each cluster:

    - ``intra_sim_mean``: mean dot product over all distinct member pairs;
    - ``inter_sim_max``: largest dot product between a member and any sample
      carrying a different label (noise included), 0.0 if there is none;
    - ``separation_margin``: ``intra_sim_mean - inter_sim_max``.

    Clusters with fewer than two members get the fixed values 1.0 / 0.0 / 1.0.
    """
    stats: Dict[int, Dict[str, float]] = {}
    for c in sorted(set(labels) - {-1}):
        in_cluster = labels == c
        members = embs[in_cluster]
        if len(members) < 2:
            stats[c] = {"intra_sim_mean": 1.0, "inter_sim_max": 0.0, "separation_margin": 1.0}
            continue

        gram = members @ members.T
        pair_sims = gram[np.triu_indices_from(gram, k=1)]
        intra = float(np.mean(pair_sims)) if pair_sims.size > 0 else 1.0

        outsiders = embs[~in_cluster]
        inter_max = float(np.max(outsiders @ members.T)) if len(outsiders) > 0 else 0.0

        stats[c] = {
            "intra_sim_mean": intra,
            "inter_sim_max": inter_max,
            "separation_margin": float(intra - inter_max),
        }
    return stats

def keyword_cohesion_diversity(keywords: Dict[int, List[KeywordInfo]] ) -> Dict[int, Dict[str, float]]:
    """Summarise each cluster's keyword list with two numbers.

    - ``keyword_diversity``: share of distinct terms in the list.
    - ``keyword_npmi``: mean of the first ten keywords' ``score`` values after
      min-max scaling them to [0, 1]; when all of them are equal the raw
      scores are averaged instead. 0.0 for an empty list.
    """
    summary: Dict[int, Dict[str, float]] = {}
    for c, kws in keywords.items():
        terms = [kw.term for kw in kws]
        diversity = len(set(terms)) / max(len(terms), 1)

        cohesion = 0.0
        head = kws[:10]
        if head:
            scores = np.array([kw.score for kw in head])
            lo, hi = scores.min(), scores.max()
            if np.ptp(scores) > 0:
                scores = (scores - lo) / (hi - lo)
            cohesion = float(scores.mean())

        summary[c] = {"keyword_diversity": diversity, "keyword_npmi": cohesion}
    return summary

# ----------------------------
# Salience & policy relevance
# ----------------------------
def compute_growth(dates: pd.Series, labels: np.ndarray, window_days: int = 7) -> Dict[int, float]:
    """Compute each cluster's relative growth between two adjacent time windows.

    The recent window is the last ``window_days`` days up to the latest date;
    the previous window is the ``window_days`` days before that. Growth is
    ``(recent - previous) / previous``. With an empty previous window it is
    1.0 if the recent window has any document and 0.0 otherwise.

    Returns:
        Growth per non-noise cluster; all 0.0 when ``dates`` is ``None``,
        entirely null, or cannot be parsed.
    """
    cluster_ids = [c for c in set(labels) if c != -1]

    def _no_growth() -> Dict[int, float]:
        return {c: 0.0 for c in cluster_ids}

    if dates is None or dates.isnull().all():
        return _no_growth()
    try:
        dt = pd.to_datetime(dates, errors='coerce')
    except Exception:
        return _no_growth()
    latest = dt.max()
    if pd.isna(latest):
        return _no_growth()

    window = pd.Timedelta(days=window_days)
    recent_start = latest - window
    prev_start = recent_start - window
    in_recent = (dt >= recent_start) & (dt <= latest)
    in_prev = (dt >= prev_start) & (dt < recent_start)

    growth_by_cluster: Dict[int, float] = {}
    for c in cluster_ids:
        member = labels == c
        n_recent = int((in_recent & member).sum())
        n_prev = int((in_prev & member).sum())
        if n_prev > 0:
            rate = (n_recent - n_prev) / n_prev
        elif n_recent > 0:
            rate = 1.0
        else:
            rate = 0.0
        growth_by_cluster[c] = float(rate)
    return growth_by_cluster

def compute_engagement(df: pd.DataFrame, labels: np.ndarray, like_col: str | None, comment_col: str | None,
                        share_col: str | None) -> Dict[int, float]:
    """Average the engagement of each cluster's documents.

    A document's engagement is the mean of whichever of the like/comment/share
    columns are named and present in ``df`` (missing values count as 0). A
    cluster's value is the mean over its documents.

    Returns:
        Engagement per non-noise cluster; all 0.0 when none of the columns is
        available.
    """
    cluster_ids = [c for c in set(labels) if c != -1]
    usable = [name for name in (like_col, comment_col, share_col) if name and name in df.columns]
    if not usable:
        return {c: 0.0 for c in cluster_ids}

    per_doc = df[usable].fillna(0).astype(float).mean(axis=1).values
    result: Dict[int, float] = {}
    for c in cluster_ids:
        member = labels == c
        result[c] = float(np.mean(per_doc[member])) if np.any(member) else 0.0
    return result

def compute_salience(labels: np.ndarray, growth: Dict[int, float], engagement: Dict[int, float],
                     kw_stats: Dict[int, Dict[str, float]], sim_stats: Dict[int, Dict[str, float]],
                     w=(0.35, 0.25, 0.20, 0.10, 0.10)) -> Dict[int, float]:
    """Combine five per-cluster signals into one salience score.

    The signals are cluster size, growth, engagement, keyword cohesion
    (``kw_stats[c]["keyword_npmi"]``) and separation margin
    (``sim_stats[c]["separation_margin"]``). Each is min-max scaled across
    clusters (a signal that is zero everywhere stays zero) and the scaled
    values are summed with the weights ``w`` in that order.

    Returns:
        Salience per non-noise cluster.
    """
    clusters = sorted([c for c in set(labels) if c != -1])

    def _scaled(signal: Dict[int, float]) -> Dict[int, float]:
        raw = np.array([signal[c] for c in clusters], dtype=float)
        if np.all(raw == 0):
            return {c: 0.0 for c in clusters}
        scaled = MinMaxScaler().fit_transform(raw.reshape(-1, 1)).ravel()
        return {c: float(value) for c, value in zip(clusters, scaled)}

    size = {c: int(np.sum(labels == c)) for c in clusters}
    cohesion = {c: kw_stats.get(c, {}).get("keyword_npmi", 0.0) for c in clusters}
    margin = {c: sim_stats.get(c, {}).get("separation_margin", 0.0) for c in clusters}

    # Same order as the weight tuple: size, growth, engagement, cohesion, margin.
    size_n, growth_n, eng_n, coh_n, sep_n = (
        _scaled(signal) for signal in (size, growth, engagement, cohesion, margin)
    )
    wf, wg, we, wk, ws = w
    return {
        c: (
            wf * size_n[c] +
            wg * growth_n[c] +
            we * eng_n[c] +
            wk * coh_n[c] +
            ws * sep_n[c]
        )
        for c in clusters
    }

def build_seed_vec(model: SentenceTransformer, seeds: List[str]) -> np.ndarray:
    """Encode ``seeds`` and return their mean vector, re-normalised, as float32."""
    encoded = model.encode(seeds, normalize_embeddings=True)
    centroid = np.mean(encoded, axis=0)
    length = np.linalg.norm(centroid) + 1e-9  # guard against a zero vector
    return (centroid / length).astype(np.float32)

def filter_by_privacy_semantics(texts: List[str], model: SentenceTransformer, seed_vec: np.ndarray,
                                thresh: float = 0.15) -> np.ndarray:
    """Return a boolean keep-mask for ``texts`` based on similarity to a seed vector.

    Each text is encoded with normalised embeddings; it is kept when its dot
    product with ``seed_vec`` is at least ``thresh``.
    """
    text_vecs = model.encode(texts, normalize_embeddings=True)
    similarity = text_vecs @ seed_vec
    keep = similarity >= thresh
    return keep

# ----------------------------
# Main pipeline
# ----------------------------
def run_pipeline_for_df(
    df_in: pd.DataFrame,
    base_name: str,
    title_cols: List[str],
    body_cols: List[str],
    date_col: str | None,
    like_col: str | None,
    comment_col: str | None,
    share_col: str | None,
    model: SentenceTransformer,
    model_name: str,
    umap_n_components: int,
    umap_metric: str,
    umap_n_neighbors: int,
    umap_min_dist: float,
    hdb_metric: str,
    hdb_grid: List[Tuple[int,int]] | None,
    kmeans_min_k: int,
    kmeans_max_k: int,
    keyword_top_k: int,
    keyword_min_df: int,
    mmr_k_docs: int,
    mmr_lambda: float,
    growth_window_days: int,
    refine_rounds: int,
    hetero_split_thresh: float,
    hetero_min_size: int,
    enable_ad_filter: bool,
    enable_ratio_filter: bool,
    emoji_ratio: float,
    url_ratio: float,
    enable_privacy_filter: bool,
    privacy_thresh: float,
    enable_hashtag_bucket: bool,
    enable_policy_relevance_rerank: bool,
    policy_rerank_weight: float,
    seed: int,
    category: str,
):
    assert category in CATEGORY_CHOICES, f"run_pipeline_for_df expects concrete category, got {category}"

    df = df_in.copy()
    log(f"[category]={category} | rows={len(df)}")

    # 1) Build text
    df['__text__'] = df.apply(lambda r: build_text(r, title_cols, body_cols), axis=1)
    df['__text__'] = df['__text__'].fillna("").astype(str)

    # 2) Filters
    if enable_ad_filter:
        df = drop_ads(df, text_col="__text__")
    if enable_ratio_filter:
        df = drop_by_ratio(df, "__text__", max_emoji_ratio=emoji_ratio, max_url_ratio=url_ratio)
    texts = df['__text__'].tolist()

    # 3) Embedding 
    embs = embed_with_model(model, texts, normalize=True)

    # 4) Seeds
    priv_seeds = get_privacy_seeds_by_category(category)
    policy_seeds = get_policy_seeds_by_category(category)
    log(f"[category]={category} | PRIV={len(priv_seeds)} terms, POLICY={len(policy_seeds)} terms")

    # 5) Privacy semantic filter (옵션)
    if enable_privacy_filter:
        log(f"Applying privacy semantic filter (thresh={privacy_thresh})...")
        priv_seed_vec = build_seed_vec(model, priv_seeds)
        keep_mask = filter_by_privacy_semantics(texts, model, priv_seed_vec, thresh=privacy_thresh)
        kept = int(np.sum(keep_mask))
        log(f"Privacy keep: {kept}/{len(df)}")
        df = df.loc[keep_mask].reset_index(drop=True)
        embs = embs[keep_mask]
        texts = [texts[i] for i, b in enumerate(keep_mask) if b]

    # 6) Optional hashtag pre-bucketing
    if enable_hashtag_bucket:
        log("Building initial hashtag buckets...")
        buckets = initial_hashtag_bucket(df, col="__text__", topk=200, k=None, seed=seed)
        df["hashtag_bucket"] = buckets

    # 7) UMAP
    log(f"UMAP reducing (components={umap_n_components}, neighbors={umap_n_neighbors}, min_dist={umap_min_dist})...")
    reducer = umap.UMAP(
        n_components=umap_n_components, metric=umap_metric,
        n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, random_state=seed
    )
    X_umap = reducer.fit_transform(embs)

    # 8) HDBSCAN grid search
    if hdb_grid is None:
        hdb_grid = [(8,4), (10,5), (12,6), (15,5), (15,8), (20,10)]
    log("HDBSCAN grid search...")
    best = try_hdbscan_grid(X_umap, grid=hdb_grid, metric=hdb_metric, random_state=seed)

    if best is not None and len(set(best["labels"]) - {-1}) >= 2:
        labels, probs, clusterer = best["labels"], best["probs"], best["model"]
        log(f"Chosen HDBSCAN: min_cluster_size={best['mcs']} min_samples={best['ms']} "
            f"dbcv={best['dbcv']:.3f} noise={best['noise']:.3f}")
    else:
        log("HDBSCAN produced too few clusters. Falling back to MiniBatchKMeans...")
        km_res = kmeans_fallback(X_umap, (kmeans_min_k, kmeans_max_k), batch_kmeans=True, random_state=seed)
        labels, probs, clusterer = km_res.labels, km_res.probs, km_res.clusterer

    # 9) Refinements
    if refine_rounds and refine_rounds > 0:
        log(f"Refining clusters recursively (rounds={refine_rounds})...")
        labels = refine_recursively(embs, labels, max_rounds=refine_rounds, seed=seed)

    if hetero_split_thresh is not None and hetero_min_size > 0:
        log(f"Splitting heterogeneous clusters (thresh={hetero_split_thresh}, min_size={hetero_min_size})...")
        labels = split_if_hetero(embs, labels, split_fn=split_cluster_once,
                                 thresh=hetero_split_thresh, min_size=hetero_min_size, seed=seed)

    df['cluster'] = labels
    clusters = sorted([l for l in set(labels) if l != -1])

    # 10) Keywords
    log("Extracting keywords (n-gram + NPMI/KL + SBERT MMR)...")
    keywords = extract_keywords_pipeline(
        texts=texts,
        labels=labels,
        sbert_model=model,
        min_df=keyword_min_df,
        ngram_range=(1,2),
        primary_top_k=max(30, keyword_top_k + 10),
        final_k=keyword_top_k,
        lambda_div=0.5,
    )

    # 11) Representative docs via MMR
    log("Selecting representative documents (MMR)...")
    mmr_lambda = mmr_lambda  # keep variable
    rep_docs: Dict[int, List[int]] = {}
    for c in clusters:
        idx = np.where(labels == c)[0]
        if len(idx) == 0:
            rep_docs[c] = []
            continue
        centroid = np.mean(embs[idx], axis=0)
        centroid /= (np.linalg.norm(centroid) + 1e-9)
        sims_q = (embs[idx] @ centroid.reshape(-1,1)).ravel()
        selected: List[int] = []
        remaining = list(range(len(idx)))
        while remaining and len(selected) < min(mmr_k_docs, len(idx)):
            if not selected:
                i = int(np.argmax(sims_q[remaining]))
                chosen = remaining[i]
            else:
                sel_vecs = embs[idx][selected]
                sim_to_sel = np.max(sel_vecs @ embs[idx][remaining].T, axis=0)
                mmr_scores = (1 - mmr_lambda) * sims_q[remaining] - mmr_lambda * sim_to_sel
                i = int(np.argmax(mmr_scores))
                chosen = remaining[i]
            selected.append(chosen)
            remaining.remove(chosen)
        rep_docs[c] = [int(idx[i]) for i in selected]

    # 12) Metrics
    log("Computing metrics...")
    reducer2d = umap.UMAP(n_components=2, metric=umap_metric, random_state=seed)
    X2 = reducer2d.fit_transform(embs)

    global_metrics = compute_global_metrics(X2, labels, probs=np.ones_like(labels, dtype=float), clusterer=clusterer)
    sim_metrics = cluster_similarity_metrics(embs, labels)
    kw_stats = keyword_cohesion_diversity(keywords)

    growth = compute_growth(df[date_col] if date_col and date_col in df.columns else None,
                            labels, window_days=growth_window_days)
    engagement = compute_engagement(df, labels, like_col, comment_col, share_col)

    salience = compute_salience(labels, growth, engagement, kw_stats, sim_metrics,
                                w=(0.35, 0.25, 0.20, 0.10, 0.10))

    # 13) Policy relevance (옵션)
    policy_relevance = {}
    if enable_policy_relevance_rerank:
        pol_seed_vec = build_seed_vec(model, get_policy_seeds_by_category(category))
        for c in clusters:
            idx = np.where(labels == c)[0]
            if len(idx) == 0:
                policy_relevance[c] = 0.0
                continue
            cent = embs[idx].mean(0); cent /= (np.linalg.norm(cent)+1e-9)
            policy_relevance[c] = float(cent @ pol_seed_vec)

    # 14) Assemble tables
    rows = []
    for c in clusters:
        kw_list = ", ".join([k.term for k in keywords.get(c, [])])
        evid_idx = rep_docs.get(c, [])
        evid_titles = []
        for i in evid_idx:
            title = None
            for tcol in title_cols:
                if tcol in df.columns and isinstance(df.loc[i, tcol], str) and df.loc[i, tcol].strip():
                    title = df.loc[i, tcol]
                    break
            if not title:
                title = df.loc[i, '__text__'][:120]
            evid_titles.append(title)
        row = {
            'cluster': c,
            'size': int(np.sum(labels == c)),
            'keywords': kw_list,
            'rep_titles': evid_titles,
            'intra_sim_mean': sim_metrics.get(c, {}).get('intra_sim_mean', None),
            'inter_sim_max': sim_metrics.get(c, {}).get('inter_sim_max', None),
            'separation_margin': sim_metrics.get(c, {}).get('separation_margin', None),
            'keyword_npmi': kw_stats.get(c, {}).get('keyword_npmi', None),
            'keyword_diversity': kw_stats.get(c, {}).get('keyword_diversity', None),
            'growth': growth.get(c, 0.0),
            'engagement': engagement.get(c, 0.0),
            'salience': salience.get(c, 0.0),
            'policy_relevance': policy_relevance.get(c, 0.0) if enable_policy_relevance_rerank else None,
        }
        rows.append(row)
    table_df = pd.DataFrame(rows)

    # 15) Rerank by policy relevance (옵션)
    if enable_policy_relevance_rerank:
        sc = MinMaxScaler()
        table_df["relevance_n"] = sc.fit_transform(table_df[["policy_relevance"]].fillna(0.0))
        table_df["salience_n"]  = sc.fit_transform(table_df[["salience"]].fillna(0.0))
        w_rel = min(max(policy_rerank_weight, 0.0), 1.0)
        table_df["final_rank_score"] = (1.0 - w_rel)*table_df["salience_n"] + w_rel*table_df["relevance_n"]
        table_df = table_df.sort_values("final_rank_score", ascending=False).reset_index(drop=True)
    else:
        table_df = table_df.sort_values('salience', ascending=False).reset_index(drop=True)

    # Save
    top5 = table_df.head(5).copy()
    top5.to_csv(f"{base_name}_issue_with_rank5.csv", index=False, encoding='utf-8-sig')

    cat_metrics = pd.DataFrame([global_metrics])
    cat_metrics.to_csv(f"{base_name}_cat_metrics.csv", index=False, encoding='utf-8-sig')

    outputs = {
        "global_metrics": global_metrics,
        "clusters": table_df.to_dict(orient='records'),
        "category": category,
        "privacy_seed_count": len(get_privacy_seeds_by_category(category)),
        "policy_seed_count": len(get_policy_seeds_by_category(category)),
    }
    with open(f"{base_name}_outputs.json", "w", encoding="utf-8") as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)

    with pd.ExcelWriter(f"{base_name}_table.xlsx", engine="openpyxl") as w:
        table_df.to_excel(w, index=False, sheet_name="clusters")
        evid_records = []
        for c in clusters:
            for rank, i in enumerate(rep_docs.get(c, []), start=1):
                rec = {'cluster': c, 'rank': rank, 'row_id': int(i)}
                for col in title_cols + body_cols:
                    if col in df.columns:
                        rec[col] = df.loc[i, col]
                evid_records.append(rec)
        evid_df = pd.DataFrame(evid_records)
        if not evid_df.empty:
            evid_df.to_excel(w, index=False, sheet_name="evidence")
        cat_metrics.to_excel(w, index=False, sheet_name="global_metrics")

    log(f"Saved: {base_name}_table.xlsx, {base_name}_outputs.json, {base_name}_cat_metrics.csv, {base_name}_issue_with_rank5.csv")

def run_pipeline(
    excel_paths: List[str],
    base_name: str,
    title_cols: List[str],
    body_cols: List[str],
    date_col: str | None,
    like_col: str | None,
    comment_col: str | None,
    share_col: str | None,
    model_name: str = "paraphrase-multilingual-mpnet-base-v2",
    umap_n_components: int = 30,
    umap_metric: str = "cosine",
    umap_n_neighbors: int = 15,
    umap_min_dist: float = 0.05,
    hdb_metric: str = "euclidean",
    hdb_grid: List[Tuple[int,int]] | None = None,
    kmeans_min_k: int = 8,
    kmeans_max_k: int = 28,
    keyword_top_k: int = 20,
    keyword_min_df: int = 2,
    mmr_k_docs: int = 7,
    mmr_lambda: float = 0.65,
    growth_window_days: int = 7,
    refine_rounds: int = 2,
    hetero_split_thresh: float = 0.30,
    hetero_min_size: int = 20,
    enable_ad_filter: bool = True,
    enable_ratio_filter: bool = True,
    emoji_ratio: float = 0.15,
    url_ratio: float = 0.05,
    enable_privacy_filter: bool = True,
    privacy_thresh: float = 0.15,
    enable_hashtag_bucket: bool = False,
    enable_policy_relevance_rerank: bool = True,
    policy_rerank_weight: float = 0.30,
    seed: int = 42,
    category: str = "AUTO",   # default: AUTO
):
    """Load the input workbooks and run the per-category pipeline on them.

    All Excel files in ``excel_paths`` are read and concatenated. If the
    merged frame has a ``category`` column it is mapped to the project's
    categories (stored in ``__cat4__``) and used to select rows:

    * ``category`` in {"AUTO", "ALL"}: one run per category that actually
      occurs in the data. When several categories are detected each run
      writes to ``{base_name}_{category}`` so that the runs do not
      overwrite each other's output files; when only one category is
      detected the plain ``base_name`` is kept.
    * a concrete category: a single run on the rows mapped to it (or on
      all rows when nothing is mapped to it), written to
      ``{base_name}_{category}``.

    Every remaining argument is forwarded unchanged to
    ``run_pipeline_for_df``; one SentenceTransformer instance is shared
    across all runs.

    Raises:
        ValueError: if ``excel_paths`` is empty, if ``category`` is rejected
            by ``validate_category``, or if AUTO/ALL is requested but no row
            carries a mapped category.
    """
    set_seed(seed)
    category = validate_category(category)

    # 1) Load & merge
    if not excel_paths:
        raise ValueError("excel_paths is empty: at least one Excel file is required.")
    frames = []
    for p in excel_paths:
        log(f"Reading Excel: {p}")
        frames.append(pd.read_excel(p))
    df_all = pd.concat(frames, ignore_index=True)
    log(f"Loaded rows = {len(df_all)}")

    # 2) Map the Excel 'category' column (only when present)
    if "category" in df_all.columns:
        df_all["__cat4__"] = df_all["category"].apply(map_excel_category_to_cat4)
        mapped_cnt = df_all["__cat4__"].notna().sum()
        log(f"Excel 'category' mapped to 4-class: {mapped_cnt}/{len(df_all)} rows")
    else:
        df_all["__cat4__"] = None
        log("Excel has no 'category' column; using CLI --category as single class.")
    cat4 = df_all["__cat4__"]
    has_mapping = bool(cat4.notna().any())

    # 3) Decide which runs to perform *before* loading the model, so that a
    #    configuration error surfaces without paying for the model load.
    #    Each job is (row filter or None for "all rows", category, suffix).
    jobs = []
    if category in {"AUTO", "ALL"}:
        if not has_mapping:
            # Nothing to split on: ask the caller for a concrete category.
            raise ValueError("AUTO 모드인데 입력 엑셀에 'category' 컬럼이 없거나 매핑이 되지 않았습니다. --category를 구체적으로 지정하세요.")
        present_cats = [c for c in sorted(CATEGORY_CHOICES) if (cat4 == c).any()]
        log(f"AUTO categories detected: {present_cats}")
        # With more than one category an empty suffix would make every run
        # write the same files; a single category keeps the unsuffixed name.
        needs_suffix = len(present_cats) > 1
        for c in present_cats:
            jobs.append((c, c, f"_{c}" if needs_suffix else ""))
    else:
        # Run only the requested category.
        row_filter = category if has_mapping else None
        if has_mapping and not (cat4 == category).any():
            log(f"Warning: Excel category mapping has no rows for '{category}'. Fallback to all rows.")
            row_filter = None
        jobs.append((row_filter, category, f"_{category}"))

    # 4) Load the model once; it is shared by every run
    log("Loading SBERT (normalize=True)...")
    model = SentenceTransformer(model_name)

    # 5) Execute the planned runs
    def do_run(sub_df: pd.DataFrame, cat: str, suffix: str = ""):
        bn = f"{base_name}{suffix}"
        run_pipeline_for_df(
            df_in=sub_df,
            base_name=bn,
            title_cols=title_cols,
            body_cols=body_cols,
            date_col=date_col,
            like_col=like_col,
            comment_col=comment_col,
            share_col=share_col,
            model=model,
            model_name=model_name,
            umap_n_components=umap_n_components,
            umap_metric=umap_metric,
            umap_n_neighbors=umap_n_neighbors,
            umap_min_dist=umap_min_dist,
            hdb_metric=hdb_metric,
            hdb_grid=hdb_grid,
            kmeans_min_k=kmeans_min_k,
            kmeans_max_k=kmeans_max_k,
            keyword_top_k=keyword_top_k,
            keyword_min_df=keyword_min_df,
            mmr_k_docs=mmr_k_docs,
            mmr_lambda=mmr_lambda,
            growth_window_days=growth_window_days,
            refine_rounds=refine_rounds,
            hetero_split_thresh=hetero_split_thresh,
            hetero_min_size=hetero_min_size,
            enable_ad_filter=enable_ad_filter,
            enable_ratio_filter=enable_ratio_filter,
            emoji_ratio=emoji_ratio,
            url_ratio=url_ratio,
            enable_privacy_filter=enable_privacy_filter,
            privacy_thresh=privacy_thresh,
            enable_hashtag_bucket=enable_hashtag_bucket,
            enable_policy_relevance_rerank=enable_policy_relevance_rerank,
            policy_rerank_weight=policy_rerank_weight,
            seed=seed,
            category=cat,
        )

    for row_filter, cat, suffix in jobs:
        # Slice lazily so only one category subset is materialised at a time.
        sub = df_all if row_filter is None else df_all[cat4 == row_filter]
        do_run(sub.reset_index(drop=True), cat, suffix=suffix)

# ----------------------------
# No-CLI Config Runner (argparse 제거 버전)
# ----------------------------
from dataclasses import dataclass, field
from typing import Optional, List, Tuple, Union

# 주의: 위쪽에 이미 정의된 run_pipeline, validate_category 를 사용합니다.

@dataclass
class Config:
    """Settings consumed by ``run_from_config`` in place of CLI arguments.

    Each field is forwarded to the ``run_pipeline`` parameter of the
    corresponding name, except where noted in the comments below.
    """
    # Input / output
    excels: List[str]
    # NOTE: run_from_config does not read this field; it uses the file stem
    # of the first entry in `excels` as the output base name.
    base: str
    title_cols: List[str] = field(default_factory=lambda: ['title_norm', 'title'])
    body_cols: List[str] = field(default_factory=lambda: ['content_norm', 'content', 'body'])
    date_col: Optional[str] = 'date'
    like_col: Optional[str] = None
    comment_col: Optional[str] = None
    share_col: Optional[str] = None

    # Model (passed to run_pipeline as `model_name`)
    model: str = 'paraphrase-multilingual-mpnet-base-v2'

    # UMAP
    umap_components: int = 30
    umap_metric: str = 'cosine'
    umap_neighbors: int = 15
    umap_min_dist: float = 0.05

    # HDBSCAN
    hdb_metric: str = 'euclidean'
    # Either a list of strings such as '8,4' or a list of (8, 4) tuples is
    # accepted; run_from_config normalizes it with _parse_hdb_grid.
    hdb_grid: Optional[List[Union[str, Tuple[int, int]]]] = None

    # KMeans fallback
    kmeans_min_k: int = 8
    kmeans_max_k: int = 28

    # Keywords / MMR
    keyword_top_k: int = 20
    keyword_min_df: int = 2
    mmr_k_docs: int = 7
    mmr_lambda: float = 0.65

    # Growth / refinement / heterogeneity split
    growth_window_days: int = 7
    refine_rounds: int = 2
    hetero_split_thresh: float = 0.30
    hetero_min_size: int = 20

    # Tri-state filter switches: None behaves the same as the default (True)
    enable_ad_filter: Optional[bool] = None
    enable_ratio_filter: Optional[bool] = None
    emoji_ratio: float = 0.15
    url_ratio: float = 0.05

    enable_privacy_filter: Optional[bool] = None
    privacy_thresh: float = 0.15

    # Hashtag bucket (default stays False)
    enable_hashtag_bucket: bool = False

    # Policy-relevance re-ranking (tri-state as above: None -> True)
    enable_policy_relevance_rerank: Optional[bool] = None
    policy_rerank_weight: float = 0.30

    # Seed / category
    seed: int = 42
    category: str = 'AUTO'  # AUTO|ALL|개인정보|중대재해|아동복지|금융


def _parse_hdb_grid(grid_arg: Optional[List[Union[str, Tuple[int, int]]]]
                    ) -> Optional[List[Tuple[int, int]]]:
    """'8,4' 문자열 리스트 또는 (8,4) 튜플 리스트를 [(8,4), ...]로 정규화"""
    if not grid_arg:
        return None
    out: List[Tuple[int, int]] = []
    for g in grid_arg:
        if isinstance(g, tuple) and len(g) == 2:
            try:
                out.append((int(g[0]), int(g[1])))
            except Exception:
                pass
        elif isinstance(g, str):
            try:
                a, b = g.split(',')
                out.append((int(a), int(b)))
            except Exception:
                pass
    return out or None


def _resolve_tristate(flag: Optional[bool], default_true: bool = True) -> bool:
    """
    argparse의 (--enable_x / --disable_x) 쌍을 대체:
    - None  -> 기본 True
    - True  -> True
    - False -> False
    """
    if flag is None:
        return default_true
    return bool(flag)


def run_from_config(cfg: Config) -> None:
    """Run the pipeline with the settings held in ``cfg``.

    Maps every ``Config`` field onto the matching ``run_pipeline`` argument:

    * the tri-state ``enable_*`` switches are resolved with
      ``_resolve_tristate`` (``None`` means enabled);
    * ``hdb_grid`` is normalized with ``_parse_hdb_grid``;
    * ``category`` is checked with ``validate_category``;
    * the output base name is the file stem of the first Excel path
      (``cfg.base`` is not consulted).

    Raises:
        ValueError: if ``cfg.excels`` is empty, or if ``cfg.category`` is
            rejected by ``validate_category``.
    """
    # Fail with a clear message instead of an IndexError on cfg.excels[0].
    if not cfg.excels:
        raise ValueError("Config.excels is empty: at least one Excel path is required.")

    run_pipeline(
        excel_paths=cfg.excels,
        # Use the first input workbook's file stem as the output base name.
        base_name=Path(cfg.excels[0]).stem,
        title_cols=cfg.title_cols,
        body_cols=cfg.body_cols,
        date_col=cfg.date_col or None,  # empty string -> None
        like_col=cfg.like_col,
        comment_col=cfg.comment_col,
        share_col=cfg.share_col,
        model_name=cfg.model,
        umap_n_components=cfg.umap_components,
        umap_metric=cfg.umap_metric,
        umap_n_neighbors=cfg.umap_neighbors,
        umap_min_dist=cfg.umap_min_dist,
        hdb_metric=cfg.hdb_metric,
        hdb_grid=_parse_hdb_grid(cfg.hdb_grid),
        kmeans_min_k=cfg.kmeans_min_k,
        kmeans_max_k=cfg.kmeans_max_k,
        keyword_top_k=cfg.keyword_top_k,
        keyword_min_df=cfg.keyword_min_df,
        mmr_k_docs=cfg.mmr_k_docs,
        mmr_lambda=cfg.mmr_lambda,
        growth_window_days=cfg.growth_window_days,
        refine_rounds=cfg.refine_rounds,
        hetero_split_thresh=cfg.hetero_split_thresh,
        hetero_min_size=cfg.hetero_min_size,
        # Tri-state switches: None keeps the default (enabled).
        enable_ad_filter=_resolve_tristate(cfg.enable_ad_filter, default_true=True),
        enable_ratio_filter=_resolve_tristate(cfg.enable_ratio_filter, default_true=True),
        emoji_ratio=cfg.emoji_ratio,
        url_ratio=cfg.url_ratio,
        enable_privacy_filter=_resolve_tristate(cfg.enable_privacy_filter, default_true=True),
        privacy_thresh=cfg.privacy_thresh,
        enable_hashtag_bucket=cfg.enable_hashtag_bucket,
        enable_policy_relevance_rerank=_resolve_tristate(
            cfg.enable_policy_relevance_rerank, default_true=True
        ),
        policy_rerank_weight=cfg.policy_rerank_weight,
        seed=cfg.seed,
        category=validate_category(cfg.category),
    )


if __name__ == '__main__':

    # Script entry point: build the run configuration inline and execute it.
    CONFIG = Config(
        excels=['preprocess_twitter_part_1.xlsx'],
        base='',
        category='AUTO',

        # Filters: None means the default (True). The switches that used to be
        # turned on from the CLI are set to True explicitly here.
        enable_ad_filter=True,
        enable_ratio_filter=True,

        enable_privacy_filter=True,
        privacy_thresh=0.20,

        # UMAP
        umap_components=30,
        umap_neighbors=15,
        umap_min_dist=0.05,

        # HDBSCAN
        hdb_metric='euclidean',
        hdb_grid=['8,4', '10,5', '12,6', '15,5', '15,8', '20,10'],

        # KMeans
        kmeans_min_k=8,
        kmeans_max_k=28,

        # Keywords / MMR
        keyword_top_k=20,
        mmr_k_docs=7,

        # Refinement / heterogeneity split
        refine_rounds=2,
        hetero_split_thresh=0.30,
        hetero_min_size=20,

        # Policy re-ranking
        enable_policy_relevance_rerank=True,
        policy_rerank_weight=0.30,

        # Seed
        seed=42,
    )

    run_from_config(CONFIG)


[INFO] Reading Excel: preprocess_community_part_1.xlsx
[INFO] Loaded rows = 45535
[INFO] Excel 'category' mapped to 4-class: 45535/45535 rows
[INFO] Loading SBERT (normalize=True)...
[INFO] AUTO categories detected: ['개인정보']
[INFO] [category]=개인정보 | rows=45535


  mask = ~df[text_col].fillna("").astype(str).str.contains(pat)


[INFO] Ad/Spam dropped: 15680 rows removed
[INFO] Ratio-based dropped: 2 rows removed


Batches: 100%|██████████| 467/467 [01:14<00:00,  6.27it/s]


[INFO] [category]=개인정보 | PRIV=14 terms, POLICY=18 terms
[INFO] Applying privacy semantic filter (thresh=0.2)...
[INFO] Privacy keep: 29247/29853
[INFO] UMAP reducing (components=30, neighbors=15, min_dist=0.05)...


  warn(


[INFO] HDBSCAN grid search...


  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
  warn(
  warn(
  warn(
  warn(


[INFO] Chosen HDBSCAN: min_cluster_size=10 min_samples=5 dbcv=-1.000 noise=0.221
[INFO] Refining clusters recursively (rounds=2)...


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[INFO] Splitting heterogeneous clusters (thresh=0.3, min_size=20)...


  warn(
  warn(


[INFO] Extracting keywords (n-gram + NPMI/KL + SBERT MMR)...


Tokenizing: 100%|██████████| 29247/29247 [00:10<00:00, 2808.45it/s]


KeyboardInterrupt: 