
[뉴스·SNS 기반 입법 수요 탐지 - SBERT + 고도화 키워드 + 정량지표 통합 버전]

- 임베딩: SBERT
- 차원축소/클러스터링: UMAP + HDBSCAN
- 키워드: c-TF-IDF → 조사/어미 제거 + 중복 제거 → SBERT 기반 재랭킹(MMR)
- 대표문장/대표타이틀: SBERT 임베딩 중심성 기반
- 정량지표:
  · 전역(카테고리): DBCV, Silhouette, Noise ratio, n_clusters, mean_membership_prob, persistence_mean/median
  · 군집: intra_sim_mean, inter_sim_max, separation_margin, keyword_npmi, keyword_diversity
  · 이슈 중요도(salience_score): 빈도·성장률·참여·키워드결속도·분리마진 가중합

- 결과: CSV / JSON / Excel
  · {base}_table.xlsx           : 군집별 상세 + 정량지표 + 중요도(final_rank_score), Excel 형식
  · {base}_outputs.json         : 예시 출력(JSON)
  · {base}_cat_metrics.csv      : 카테고리 전역 지표

-------------------------------------------------
[Conda 환경 설치 가이드]
```
conda create -n law-issue python=3.10 -y
conda activate law-issue
conda install -c conda-forge numpy pandas scikit-learn openpyxl umap-learn hdbscan -y
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install sentence-transformers tqdm
```
-------------------------------------------------


In [6]:

import os, re, json, argparse, warnings
from datetime import timedelta
from collections import defaultdict

import numpy as np
import pandas as pd
import torch, umap, hdbscan

from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore", category=RuntimeWarning)

# -----------------------------
# SBERT 임베딩
# -----------------------------
_SBERT_MODEL = None
# (model_name, device) pair the cached model above was loaded with.
_SBERT_MODEL_KEY = None


def embed_texts_sbert(texts, model_name="paraphrase-multilingual-mpnet-base-v2",
                      batch_size=64, device=None):
    """
    Encode texts with a cached SentenceTransformer model.

    The model is kept in a module-level cache so repeated calls do not reload
    it. The cache is keyed by (model_name, device): asking for a different
    model or device reloads instead of silently reusing whatever was loaded
    first.

    Args:
        texts: sequence of strings to encode.
        model_name: SentenceTransformer model identifier.
        batch_size: batch size forwarded to ``encode``.
        device: device string; when None, "cuda" is used if
            ``torch.cuda.is_available()`` and "cpu" otherwise.

    Returns:
        The result of ``encode`` called with ``convert_to_numpy=True`` and
        ``normalize_embeddings=True``.
    """
    global _SBERT_MODEL, _SBERT_MODEL_KEY
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    cache_key = (model_name, device)
    if _SBERT_MODEL is None or _SBERT_MODEL_KEY != cache_key:
        print(f"[INFO] Loading SBERT model: {model_name} on {device}")
        _SBERT_MODEL = SentenceTransformer(model_name, device=device)
        _SBERT_MODEL_KEY = cache_key

    return _SBERT_MODEL.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False
    )

# -----------------------------
# 조사/어미 제거 + 토크나이즈
# -----------------------------
_JOSA_SUFFIXES = [
    "은","는","이","가","을","를","과","와","도","만","으로","로","에서","에게","한테",
    "께","에","의","라고","라는","이라","이나","이며","인데","하다","했다","하는","되다","된다","되는"
]

# Longest suffixes first, so that e.g. "에서" is tried before "에".
# Sorted once at import time instead of on every strip_josa() call; the sort
# is stable, so suffixes of equal length keep their order from the list above.
_JOSA_SUFFIXES_LONGEST_FIRST = tuple(sorted(_JOSA_SUFFIXES, key=len, reverse=True))


def strip_josa(token):
    """
    Remove one trailing particle/ending from a token.

    Suffixes are tried longest first. A suffix is stripped only when at least
    two characters remain afterwards; otherwise the next suffix is tried.

    Args:
        token: string to strip.

    Returns:
        The token without the first applicable suffix, or the token unchanged
        when no suffix applies.
    """
    for suffix in _JOSA_SUFFIXES_LONGEST_FIRST:
        if token.endswith(suffix) and len(token) > len(suffix) + 1:
            return token[: -len(suffix)]
    return token

def tokenize_ko(text):
    """
    Split text into tokens and strip a trailing particle from each one.

    Tokens are runs of two or more Hangul syllables, ASCII letters or digits.
    Each run goes through strip_josa(); results shorter than two characters
    are dropped.

    Args:
        text: string to tokenize.

    Returns:
        List of stripped tokens in order of appearance.
    """
    stems = (strip_josa(chunk) for chunk in re.findall(r"[가-힣A-Za-z0-9]{2,}", text))
    return [stem for stem in stems if stem and len(stem) >= 2]

def reduce_terms_by_containment(terms, top_k=8):
    """
    Keep terms, in order, that do not overlap with an already kept term.

    A term is skipped when it equals, contains, or is contained in any term
    kept so far. Scanning stops once top_k terms have been kept; the size
    check happens after appending, so at least one term is kept whenever the
    input is non-empty.

    Args:
        terms: iterable of candidate terms, best first.
        top_k: number of kept terms at which scanning stops.

    Returns:
        List of kept terms in their original order.
    """
    kept = []
    for term in terms:
        overlaps = False
        for prev in kept:
            if (term == prev) or (term in prev) or (prev in term):
                overlaps = True
                break
        if overlaps:
            continue
        kept.append(term)
        if len(kept) >= top_k:
            break
    return kept

# -----------------------------
# SBERT 기반 키워드 재랭킹 (MMR)
# -----------------------------
def rerank_keywords_with_sbert(cluster_texts, candidates, model_name="paraphrase-multilingual-mpnet-base-v2",
                               top_k=8, diversity=0.7):
    """
    Re-rank candidate keywords with a greedy MMR selection.

    Relevance of a candidate is its cosine similarity to the mean embedding of
    cluster_texts. The first pick is the most relevant candidate; each later
    pick maximises ``(1 - diversity) * relevance - diversity * redundancy``,
    where redundancy is the highest similarity to any already chosen keyword.

    Args:
        cluster_texts: documents of the cluster.
        candidates: candidate keyword strings.
        model_name: SBERT model name forwarded to embed_texts_sbert().
        top_k: maximum number of keywords to return.
        diversity: weight of the redundancy penalty.

    Returns:
        Selected keywords in selection order; an empty list when there are no
        candidates.
    """
    if not candidates:
        return []

    doc_vectors = embed_texts_sbert(cluster_texts, model_name=model_name)
    center = doc_vectors.mean(axis=0, keepdims=True)
    cand_vectors = embed_texts_sbert(candidates, model_name=model_name)
    relevance_all = cosine_similarity(cand_vectors, center).ravel()

    remaining = list(range(len(candidates)))
    chosen = []
    while remaining and len(chosen) < top_k:
        scores = relevance_all[remaining]
        if chosen:
            # Penalise candidates that resemble something already picked.
            redundancy = cosine_similarity(cand_vectors[remaining], cand_vectors[chosen]).max(axis=1)
            scores = (1 - diversity) * scores - diversity * redundancy
        best = int(np.argmax(scores))
        chosen.append(remaining.pop(best))

    return [candidates[k] for k in chosen]

# -----------------------------
# 대표문장 (SBERT 기반)
# -----------------------------
def pick_representative_sentence_by_sbert(texts, model_name="paraphrase-multilingual-mpnet-base-v2"):
    """
    Return the text whose embedding is closest to the mean embedding.

    Args:
        texts: list of candidate strings.
        model_name: SBERT model name forwarded to embed_texts_sbert().

    Returns:
        The text with the smallest cosine distance to the centroid, or an
        empty string when texts is empty.
    """
    if not texts:
        return ""
    vectors = embed_texts_sbert(texts, model_name=model_name)
    center = vectors.mean(axis=0, keepdims=True)
    distances = cosine_distances(vectors, center).ravel()
    closest = int(np.argmin(distances))
    return texts[closest]

# -----------------------------
# c-TF-IDF (후보 키워드 추출)
# -----------------------------
def compute_c_tf_idf(docs_per_cluster, ngram_range=(1, 3), min_df=2, epsilon=1e-9):
    """
    Compute class-based TF-IDF term rankings, one per cluster.

    All documents of a cluster are joined into one "class document",
    tokenized with tokenize_ko(), and vectorized with CountVectorizer. Term
    counts are normalised by the class-document length and multiplied by
    ``log((n_clusters + 1) / (df + epsilon))``.

    Args:
        docs_per_cluster: mapping of cluster id -> list of document strings.
        ngram_range: n-gram range forwarded to CountVectorizer.
        min_df: minimum document frequency forwarded to CountVectorizer. Here
            a "document" is a whole cluster, so an integer value is clamped to
            the number of clusters; without the clamp a single-cluster input
            with min_df=2 makes CountVectorizer raise.
        epsilon: small constant added to the document frequency.

    Returns:
        Mapping of cluster id -> (terms, scores), both arrays sorted by
        descending score. Empty mapping for empty input; empty arrays for
        every cluster when no vocabulary could be built.
    """
    cluster_ids = sorted(docs_per_cluster.keys())
    if not cluster_ids:
        return {}

    preproc_docs = [
        " ".join(tokenize_ko(" ".join(docs_per_cluster[cid])))
        for cid in cluster_ids
    ]

    n_class_docs = len(cluster_ids)
    effective_min_df = min_df
    if isinstance(min_df, (int, np.integer)) and not isinstance(min_df, bool):
        effective_min_df = max(1, min(int(min_df), n_class_docs))

    cv = CountVectorizer(ngram_range=ngram_range, min_df=effective_min_df)
    try:
        X = cv.fit_transform(preproc_docs)
    except ValueError as err:
        # Raised e.g. when pruning leaves no terms; report it and return no
        # keywords rather than aborting the caller.
        print(f"[WARN] c-TF-IDF vocabulary could not be built: {err}")
        return {
            cid: (np.array([], dtype=str), np.array([], dtype=float))
            for cid in cluster_ids
        }
    terms = np.array(cv.get_feature_names_out())

    df_t = np.asarray((X > 0).sum(axis=0)).ravel()
    idf = np.log((n_class_docs + 1) / (df_t + epsilon))
    row_sums = np.asarray(X.sum(axis=1)).ravel().astype(float)
    row_sums[row_sums == 0] = 1  # avoid division by zero for empty class documents
    ctfidf_mat = X.multiply(1.0 / row_sums[:, None]).multiply(idf).tocsr()

    ctfidf = {}
    for i, cid in enumerate(cluster_ids):
        row = ctfidf_mat.getrow(i).toarray().ravel()
        order = np.argsort(row)[::-1]
        ctfidf[cid] = (terms[order], row[order])
    return ctfidf

# -----------------------------
# 시간/텍스트 유틸
# -----------------------------
def to_datetime_ymdhms(x):
    """
    Parse a date value that is expected to look like YYYYMMDDHHMMSS.

    Parsing order:
      1. If ``int(x)`` succeeds, its digits are parsed as ``%Y%m%d%H%M%S``.
      2. If that fails and there are exactly 8 digits, they are parsed as
         ``%Y%m%d``.
      3. Strings and other non-numeric values fall back to ``pd.to_datetime``.

    A non-string numeric value that matches neither format yields NaT. It is
    deliberately not handed to ``pd.to_datetime``, which would read a bare
    number as an offset from the Unix epoch and return a bogus 1970 date.

    Args:
        x: int, float, string, or datetime-like value.

    Returns:
        A pandas Timestamp, or ``pd.NaT`` when the value cannot be parsed.
    """
    if x is None:
        return pd.NaT

    try:
        digits = str(int(x))
    except (TypeError, ValueError, OverflowError):
        digits = None

    if digits is not None:
        try:
            return pd.to_datetime(digits, format="%Y%m%d%H%M%S")
        except (ValueError, TypeError):
            pass
        if len(digits) == 8:
            try:
                return pd.to_datetime(digits, format="%Y%m%d")
            except (ValueError, TypeError):
                pass
        if not isinstance(x, str):
            return pd.NaT

    try:
        return pd.to_datetime(x)
    except Exception:
        # Best effort: anything pandas cannot parse counts as missing.
        return pd.NaT

def _first_present_text(row, primary, fallback):
    """Return str(row[primary]) if present, else str(row[fallback]), else ""."""
    for key in (primary, fallback):
        value = row.get(key)
        if value is not None and pd.notna(value):
            return str(value)
    return ""


def build_text(row, max_len=600):
    """
    Build the text to embed from a row's title and content.

    The normalized columns ("title_norm", "content_norm") are preferred; the
    raw columns ("title", "content") are used when the normalized value is
    missing. A value missing in both places contributes an empty string, so a
    NaN or None is never stringified into a literal "nan" or "None".

    Args:
        row: mapping-like object with ``.get`` (a DataFrame row or a dict).
        max_len: maximum number of content characters kept; the title is not
            truncated.

    Returns:
        ``"<title> <content[:max_len]>"`` with surrounding whitespace
        stripped; an empty string when both parts are missing.
    """
    title = _first_present_text(row, "title_norm", "title")
    content = _first_present_text(row, "content_norm", "content")
    return (title + " " + content[:max_len]).strip()

def safe_sum_frame(sub_df, cols):
    """
    Sum the given columns of a frame, treating non-numeric cells as zero.

    Args:
        sub_df: DataFrame holding the columns.
        cols: list of column names; an empty list yields 0.

    Returns:
        The grand total as an int (the float total is truncated by ``int``).
    """
    if not cols:
        return 0
    numeric = sub_df[cols].apply(pd.to_numeric, errors="coerce")
    per_row = numeric.fillna(0).sum(axis=1)
    return int(per_row.sum())

def find_cols(df, key):
    """
    Return the columns of df whose name contains key, ignoring case.

    Column labels are converted with ``str`` before matching, so non-string
    labels (e.g. integer headers) no longer raise AttributeError.

    Args:
        df: DataFrame whose columns are searched.
        key: substring to look for.

    Returns:
        List of matching column labels in column order.
    """
    needle = str(key).lower()
    return [c for c in df.columns if needle in str(c).lower()]

def compute_growth(sub_df, date_col="date_dt", window_days=7):
    """
    Compare the document count of the latest window with the window before it.

    With t_max the latest date, the recent window is
    (t_max - window_days, t_max] and the previous window is
    (t_max - 2 * window_days, t_max - window_days]. The previous count is
    floored at 1, both as the divisor and as the value subtracted.

    Args:
        sub_df: DataFrame holding the date column.
        date_col: name of the datetime column.
        window_days: window length in days.

    Returns:
        ``(recent - max(1, prev)) / max(1, prev)``, or NaN when the column is
        absent or holds no valid date.
    """
    if date_col not in sub_df.columns:
        return np.nan
    stamps = sub_df[date_col]
    if stamps.isna().all():
        return np.nan
    stamps = stamps.dropna()
    if stamps.empty:
        return np.nan

    latest = stamps.max()
    recent_floor = latest - timedelta(days=window_days)
    prev_floor = latest - timedelta(days=2 * window_days)

    n_recent = int(((stamps > recent_floor) & (stamps <= latest)).sum())
    n_prev = int(((stamps > prev_floor) & (stamps <= recent_floor)).sum())

    baseline = max(1, n_prev)
    return (n_recent - baseline) / baseline

# -----------------------------
# 정량지표: 전역/군집 품질
# -----------------------------
def cluster_validity_indices(emb_umap, labels):
    """
    Compute the silhouette score over non-noise points.

    Points labelled -1 are excluded. The score is computed only when more
    than one point remains and they span more than one cluster.

    Args:
        emb_umap: array of point coordinates, row-aligned with labels.
        labels: numpy array of cluster labels (-1 = noise).

    Returns:
        ``{"silhouette": value}``, where value is NaN when the score is not
        computed.
    """
    keep = labels != -1
    score = np.nan
    if keep.sum() > 1 and len(np.unique(labels[keep])) > 1:
        score = float(silhouette_score(emb_umap[keep], labels[keep], metric="euclidean"))
    return {"silhouette": score}

def hdbscan_global_validity(emb_raw, labels):
    """
    Compute the DBCV validity index with ``hdbscan.validity_index``.

    Args:
        emb_raw: array of embeddings, row-aligned with labels.
        labels: cluster labels (-1 = noise).

    Returns:
        ``{"dbcv": value}``; value is NaN when fewer than two clusters exist
        or when the computation fails.
    """
    labels = np.asarray(labels)
    # With fewer than two clusters there is no inter-cluster separation to
    # measure, so skip the expensive call.
    if len(np.unique(labels[labels != -1])) < 2:
        return {"dbcv": np.nan}

    try:
        # NOTE(review): validity_index is understood to require
        # double-precision input, and SBERT embeddings are typically float32.
        # The original broad except hid any resulting failure, which would
        # leave DBCV always NaN. Confirm against the installed hdbscan version.
        data = np.asarray(emb_raw, dtype=np.float64)
        v = float(hdbscan.validity_index(data, labels))
    except Exception as err:
        # Best effort: a failed index must not abort the pipeline, but the
        # failure is reported instead of being swallowed silently.
        print(f"[WARN] DBCV computation failed: {err}")
        v = np.nan
    return {"dbcv": v}

def cluster_membership_stats(probabilities, labels, clusterer):
    """
    Summarise membership probabilities, persistence and noise for one run.

    Args:
        probabilities: numpy array of per-point membership probabilities.
        labels: numpy array of cluster labels (-1 = noise).
        clusterer: object that may expose ``cluster_persistence_``; when the
            attribute is missing or empty the persistence stats are NaN.

    Returns:
        Dict with keys mean_membership_prob (mean over non-noise points),
        persistence_mean, persistence_median, noise_ratio (share of points
        labelled -1) and n_clusters (number of distinct non-noise labels).
    """
    clustered = labels != -1

    if clustered.any():
        mean_prob = float(np.nanmean(probabilities[clustered]))
    else:
        mean_prob = np.nan

    persistence = getattr(clusterer, "cluster_persistence_", None)
    has_persistence = persistence is not None and len(persistence) > 0
    persist_mean = float(np.mean(persistence)) if has_persistence else np.nan
    persist_median = float(np.median(persistence)) if has_persistence else np.nan

    if len(labels) > 0:
        noise_ratio = float(np.mean(~clustered))
    else:
        noise_ratio = np.nan

    n_clusters = int(np.unique(labels[clustered]).size)

    return {
        "mean_membership_prob": mean_prob,
        "persistence_mean": persist_mean,
        "persistence_median": persist_median,
        "noise_ratio": noise_ratio,
        "n_clusters": n_clusters,
    }

def per_cluster_intra_inter_similarity(emb_raw, labels):
    """
    Compute cohesion and separation per cluster from cosine similarity.

    For every non-noise cluster:
      - intra_sim_mean: mean cosine similarity over distinct member pairs
        (NaN for a single-member cluster).
      - inter_sim_max: highest cosine similarity between a member and any
        point of another non-noise cluster (NaN when there is no other
        cluster). Lower means better separated.
      - separation_margin: intra_sim_mean - inter_sim_max (NaN when either
        part is NaN).

    Rows are L2-normalised once and similarities are computed per cluster
    block, so no full N x N similarity matrix is held in memory.

    Args:
        emb_raw: 2-D array of embeddings, row-aligned with labels.
        labels: cluster labels (-1 = noise).

    Returns:
        Dict mapping int cluster id -> dict with the three metrics above.
    """
    labels = np.asarray(labels)
    vectors = np.asarray(emb_raw)
    if not np.issubdtype(vectors.dtype, np.floating):
        vectors = vectors.astype(float)

    # Cosine similarity is the dot product of unit vectors; all-zero rows are
    # left as zeros (their norm is replaced by 1 to avoid dividing by zero).
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit = vectors / np.where(norms == 0, 1, norms)

    clustered = labels != -1
    stats = {}
    for cid in np.unique(labels[clustered]):
        member_mask = labels == cid
        members = unit[member_mask]
        n_members = len(members)

        if n_members >= 2:
            block = members @ members.T
            intra_mean = float(np.mean(block[np.triu_indices(n_members, k=1)]))
        else:
            intra_mean = np.nan

        rivals = unit[clustered & ~member_mask]
        if len(rivals) > 0:
            inter_max = float(np.max(members @ rivals.T))
        else:
            inter_max = np.nan

        if np.isnan(intra_mean) or np.isnan(inter_max):
            sep_margin = np.nan
        else:
            sep_margin = float(intra_mean - inter_max)

        stats[int(cid)] = {
            "intra_sim_mean": intra_mean,
            "inter_sim_max": inter_max,
            "separation_margin": sep_margin
        }
    return stats

# -----------------------------
# 키워드 결속도(NPMI) & 다양성
# -----------------------------
def keyword_npmi_coherence(corpus_tokens, top_terms, window=None):
    """
    Mean NPMI over keyword pairs, using document-level co-occurrence.

    Only the first 10 terms are used. A term counts as present in a document
    when its whitespace-separated words occur as a contiguous token sequence,
    compared case-insensitively. This lets multi-word keywords match
    tokenized documents; testing the whole phrase against a list of single
    tokens never matched. Probabilities are add-one smoothed:
    ``(count + 1) / (n_docs + 1)``.

    Args:
        corpus_tokens: list of token lists, one per document.
        top_terms: list of keyword strings.
        window: unused; co-occurrence is always document-level.

    Returns:
        Mean NPMI over all term pairs as a float, or NaN when there are fewer
        than two terms, no documents, or no usable pair.
    """
    from math import log

    if not top_terms or len(top_terms) < 2:
        return np.nan
    n_docs = len(corpus_tokens)
    if n_docs == 0:
        return np.nan

    # Each keyword becomes a tuple of lowercase words (an n-gram key).
    term_keys = [tuple(str(t).lower().split()) for t in top_terms[:10]]
    wanted = set(term_keys)
    gram_sizes = sorted({len(k) for k in wanted if k})

    # For every document, record which of the wanted n-grams it contains.
    doc_hits = []
    for toks in corpus_tokens:
        lowered = [str(t).lower() for t in toks]
        found = set()
        for size in gram_sizes:
            for start in range(len(lowered) - size + 1):
                gram = tuple(lowered[start:start + size])
                if gram in wanted:
                    found.add(gram)
        doc_hits.append(found)

    def smoothed(count):
        return (count + 1) / (n_docs + 1)

    p_term = {k: smoothed(sum(k in hits for hits in doc_hits)) for k in wanted}

    npmis = []
    for i, a in enumerate(term_keys):
        for b in term_keys[i + 1:]:
            p_ab = smoothed(sum((a in hits) and (b in hits) for hits in doc_hits))
            if p_ab <= 0 or p_ab >= 1.0:
                continue  # -log(1) == 0 would make the denominator zero
            pmi = log(p_ab / (p_term[a] * p_term[b]))
            npmis.append(pmi / (-log(p_ab)))
    return float(np.mean(npmis)) if npmis else np.nan

def keyword_diversity_score(terms):
    """
    Return the share of distinct entries among the given terms.

    Args:
        terms: list of keyword strings.

    Returns:
        ``len(set(terms)) / len(terms)`` as a float, or NaN for empty input.
    """
    if not terms:
        return np.nan
    distinct = len(set(terms))
    return float(distinct / max(1, len(terms)))

# -----------------------------
# 중요도(우선순위) 종합 점수
# -----------------------------
def zscore(x):
    """
    Standardise values with NaN-aware mean and standard deviation.

    Args:
        x: sequence of numbers; None entries become NaN.

    Returns:
        Array of ``(x - nanmean) / (nanstd + 1e-9)``; NaN inputs stay NaN.
    """
    x = np.array(x, dtype=float)
    m = np.nanmean(x); s = np.nanstd(x)
    return (x - m) / (s + 1e-9)


def compute_salience_per_cluster(rows):
    """
    Attach a weighted salience score to every cluster row of one category.

    The score is a weighted sum of z-scores computed across the given rows:
    0.35 * cluster_size + 0.25 * growth_rate(7d/prev7d)
    + 0.20 * (eng_like + eng_reply + eng_share)
    + 0.10 * keyword_npmi + 0.10 * separation_margin.

    A missing component (None/NaN, e.g. an unknown growth rate or the
    separation margin of a single-cluster category) contributes 0, i.e. the
    category average, so it no longer turns the whole score into NaN.

    Args:
        rows: list of cluster dicts belonging to the same category.

    Returns:
        The same list, with ``"salience_score"`` set on every dict.
    """
    if not rows:
        return rows

    def neutral_z(values):
        # z-score in which missing values count as average (0).
        arr = np.array(values, dtype=float)
        if np.isnan(arr).all():
            return np.zeros(len(arr))
        return np.nan_to_num(zscore(arr), nan=0.0)

    def column(key):
        return [r.get(key, np.nan) for r in rows]

    engagement = [
        (r.get("eng_like") or 0) + (r.get("eng_reply") or 0) + (r.get("eng_share") or 0)
        for r in rows
    ]

    weighted_components = (
        (0.35, column("cluster_size")),
        (0.25, column("growth_rate(7d/prev7d)")),
        (0.20, engagement),
        (0.10, column("keyword_npmi")),
        (0.10, column("separation_margin")),
    )

    score = np.zeros(len(rows))
    for weight, values in weighted_components:
        score = score + weight * neutral_z(values)

    for i, r in enumerate(rows):
        r["salience_score"] = float(score[i])
    return rows

# -----------------------------
# 메인 파이프라인
# -----------------------------
def run_pipeline_by_category(
    excel_path: str,
    sbert_model="paraphrase-multilingual-mpnet-base-v2",
    umap_n_components=10,
    umap_n_neighbors=15,
    umap_min_dist=0.1,
    hdbscan_min_cluster_size=25,
    hdbscan_min_samples=10,
    ctfidf_top_k=8,
    seed=42
):
    """
    Run issue detection per category and write the result files.

    For every non-empty string category with at least 10 documents:
    SBERT embedding -> UMAP -> HDBSCAN -> category-level metrics ->
    per-cluster keywords (c-TF-IDF, containment filter, SBERT MMR),
    representative sentence/titles, growth, engagement, quality metrics ->
    salience score.

    Files written to the current directory, where ``base`` is the input file
    name without its extension:
      - ``{base}_table.xlsx``: one row per cluster; the salience column is
        renamed to ``final_rank_score``.
      - ``{base}_outputs.json``: example output objects.
      - ``{base}_cat_metrics.csv``: category-level metrics.

    Args:
        excel_path: path of the input Excel file. A "category" column is
            required; "date", "title(_norm)", "content(_norm)" and the
            engagement columns are used when present.
        sbert_model: SentenceTransformer model name.
        umap_n_components, umap_n_neighbors, umap_min_dist: UMAP parameters.
        hdbscan_min_cluster_size, hdbscan_min_samples: HDBSCAN parameters.
        ctfidf_top_k: number of keywords kept per cluster.
        seed: random_state passed to UMAP.

    Returns:
        None.
    """
    df = pd.read_excel(excel_path)
    if "date" in df.columns:
        df["date_dt"] = df["date"].apply(to_datetime_ymdhms)
    else:
        df["date_dt"] = pd.NaT
    df["text"] = df.apply(build_text, axis=1)
    df = df[df["text"].str.len() > 0].copy()

    # Column lookups do not depend on the cluster, so resolve them once.
    like_cols = find_cols(df, "like")
    share_cols = find_cols(df, "share")
    reply_cols = [
        c for c in df.columns
        if "reply" in str(c).lower() or "comment" in str(c).lower()
    ]
    # build_text() accepts either title column, so representative titles
    # should not require the raw "title" column to exist.
    if "title" in df.columns:
        title_col = "title"
    elif "title_norm" in df.columns:
        title_col = "title_norm"
    else:
        title_col = None

    results, json_outputs, cat_metrics_rows = [], [], []

    for cat, g in df.groupby("category"):
        if not isinstance(cat, str) or len(cat.strip()) == 0:
            continue
        print(f"[INFO] Processing category group: {cat} | size={len(g)}")
        if len(g) < 10:
            continue

        # ---------------- Embedding & UMAP ----------------
        texts = g["text"].tolist()
        emb = embed_texts_sbert(texts, model_name=sbert_model)
        reducer = umap.UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components,
                            min_dist=umap_min_dist, metric="cosine", random_state=seed)
        X_umap = reducer.fit_transform(emb)

        # ---------------- HDBSCAN ----------------
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=hdbscan_min_cluster_size,
            min_samples=hdbscan_min_samples,
            metric="euclidean",
            prediction_data=True
        )
        labels = clusterer.fit_predict(X_umap)
        g = g.copy()
        g["cluster_id"] = labels
        prob = getattr(clusterer, "probabilities_", np.ones(len(g)))

        # ---- Category-level metrics ----
        glob = {}
        glob.update(cluster_validity_indices(X_umap, labels))
        glob.update(hdbscan_global_validity(emb, labels))  # computed on the raw embeddings
        glob.update(cluster_membership_stats(prob, labels, clusterer))
        cat_metrics_rows.append({"category": cat, **glob})

        # ---- Per-cluster cohesion/separation (raw embeddings) ----
        per_c_stats = per_cluster_intra_inter_similarity(emb, labels)

        # ---- c-TF-IDF candidate keywords ----
        cluster_ids = [int(c) for c in np.unique(labels) if c != -1]
        if not cluster_ids:
            continue
        # Row positions of each cluster inside g / emb / texts.
        positions = {cid: np.where(labels == cid)[0] for cid in cluster_ids}
        docs_per_cluster = {cid: [texts[p] for p in pos] for cid, pos in positions.items()}
        ctfidf = compute_c_tf_idf(docs_per_cluster, ngram_range=(1, 3), min_df=2)

        # ---- Cluster loop ----
        cat_rows = []
        for cid in cluster_ids:
            pos = positions[cid]
            sub = g.iloc[pos]
            sub_texts = docs_per_cluster[cid]
            size = len(sub)

            # Keywords: c-TF-IDF -> containment filter -> SBERT MMR re-ranking.
            terms, _ = ctfidf.get(cid, (np.array([]), np.array([])))
            raw_terms = reduce_terms_by_containment(terms.tolist(), top_k=50)
            keywords = rerank_keywords_with_sbert(sub_texts, raw_terms, model_name=sbert_model,
                                                  top_k=ctfidf_top_k, diversity=0.7)

            # Representative sentence and titles: reuse the category
            # embeddings instead of encoding the same texts again.
            sub_emb = emb[pos]
            centroid = sub_emb.mean(axis=0, keepdims=True)
            d = cosine_distances(sub_emb, centroid).ravel()
            rep_sent = sub_texts[int(np.argmin(d))]
            if title_col is not None:
                rep_titles = sub.iloc[np.argsort(d)[:3]][title_col].tolist()
            else:
                rep_titles = []

            # Growth and engagement.
            growth_rate = compute_growth(sub)
            like_total = safe_sum_frame(sub, like_cols)
            reply_total = safe_sum_frame(sub, reply_cols)
            share_total = safe_sum_frame(sub, share_cols)
            growth_value = None if pd.isna(growth_rate) else float(np.round(growth_rate, 4))

            # Cluster quality metrics.
            stats_c = per_c_stats.get(
                cid,
                {"intra_sim_mean": np.nan, "inter_sim_max": np.nan, "separation_margin": np.nan},
            )

            # Keyword coherence / diversity over this cluster's documents.
            corpus_c = [tokenize_ko(t) for t in sub_texts]
            kw_npmi = keyword_npmi_coherence(corpus_c, keywords)
            kw_div = keyword_diversity_score(keywords)

            # Example JSON output.
            json_outputs.append({
                "example_output": {
                    "category": cat, "cluster_id": cid,
                    "issue_summary": {"keywords": keywords, "representative_sentence": rep_sent},
                    "representative_titles": rep_titles, "size": size
                },
                "metrics": {
                    "freq": size,
                    "growth_rate": growth_value,
                    "engagement": {"share": int(share_total), "reply": int(reply_total), "like": int(like_total)}
                }
            })

            # Table row.
            cat_rows.append({
                "category": cat, "cluster_id": cid,
                "keywords": " ".join(keywords),
                "representative_sentence": rep_sent,
                "rep_title_1": rep_titles[0] if len(rep_titles) > 0 else "",
                "rep_title_2": rep_titles[1] if len(rep_titles) > 1 else "",
                "rep_title_3": rep_titles[2] if len(rep_titles) > 2 else "",
                "cluster_size": size,
                "growth_rate(7d/prev7d)": growth_value,
                "eng_like": int(like_total), "eng_reply": int(reply_total), "eng_share": int(share_total),

                # Category-level metrics repeated on every cluster row.
                "dbcv": glob["dbcv"],
                "silhouette": glob["silhouette"],
                "mean_membership_prob": glob["mean_membership_prob"],
                "persistence_mean": glob["persistence_mean"],
                "persistence_median": glob["persistence_median"],
                "noise_ratio": glob["noise_ratio"],
                "n_clusters": glob["n_clusters"],

                # Cluster-level metrics.
                "intra_sim_mean": stats_c["intra_sim_mean"],
                "inter_sim_max": stats_c["inter_sim_max"],
                "separation_margin": stats_c["separation_margin"],
                "keyword_npmi": kw_npmi,
                "keyword_diversity": kw_div,
            })

        # ---- Salience is scored across the clusters of this category ----
        results.extend(compute_salience_per_cluster(cat_rows))

    # ---------------- Save ----------------
    base = os.path.splitext(os.path.basename(excel_path))[0]

    # 1) Cluster table (Excel)
    df_results = pd.DataFrame(results)
    if df_results.empty:
        # sort_values would raise KeyError because the frame has no columns.
        print("[WARN] No clusters were found in any category; writing empty outputs.")
    else:
        df_results = df_results.sort_values(
            ["category", "salience_score", "cluster_size"], ascending=[True, False, False]
        )
        df_results = df_results.rename(columns={"salience_score": "final_rank_score"})
    excel_path_out = f"{base}_table.xlsx"
    df_results.to_excel(excel_path_out, index=False, engine="openpyxl")
    print(f"[OK] Excel file saved: {excel_path_out}")

    # 2) Example JSON
    with open(f"{base}_outputs.json", "w", encoding="utf-8") as f:
        json.dump(json_outputs, f, ensure_ascii=False, indent=2)

    # 3) Category-level metrics
    df_cat_metrics = pd.DataFrame(cat_metrics_rows)
    if not df_cat_metrics.empty:
        df_cat_metrics = df_cat_metrics.sort_values(["category"])
    df_cat_metrics.to_csv(f"{base}_cat_metrics.csv", index=False, encoding="utf-8-sig")

    print(f"[OK] Saved table/json/cat-metrics for {excel_path}")

if __name__ == "__main__":
    # Run settings are fixed here in code (the argparse front end was removed).
    EXCEL_PATH = "preprocess_news_part_4_with_metrics.xlsx"  # input Excel file path
    SBERT_MODEL = "paraphrase-multilingual-mpnet-base-v2"

    # UMAP / HDBSCAN / keyword settings forwarded to the pipeline as-is.
    PIPELINE_OPTIONS = {
        "umap_n_components": 10,
        "umap_n_neighbors": 15,
        "umap_min_dist": 0.1,
        "hdbscan_min_cluster_size": 25,
        "hdbscan_min_samples": 10,
        "ctfidf_top_k": 8,
        "seed": 42,
    }

    run_pipeline_by_category(excel_path=EXCEL_PATH, sbert_model=SBERT_MODEL, **PIPELINE_OPTIONS)
    


[INFO] Processing category group: 중대재해처벌법 | size=6969
[INFO] Loading SBERT model: paraphrase-multilingual-mpnet-base-v2 on cuda


  warn(


[OK] Excel file saved: preprocess_news_part_4_with_metrics_table.xlsx
[OK] Saved table/json/cat-metrics for preprocess_news_part_4_with_metrics.xlsx
