# Ulysses narrative GMM: boundary (entropy) report + turning points Top-K

このノートブックは、`ulysses_stream.csv`（特徴量CSV）と `Ulysses_fixed.json`（根拠文付きJSON）から、

- GMMで潜在状態（K個）を推定
- 各時点の posterior entropy を **boundaryness**（境界っぽさ）として算出
- entropy 上位（ALPHA%）のスパンを **前後evidence付きでレポート化**
- 本文・論文に貼れる **turning points TopK表** を生成

を **単独でColab実行**できる形でまとめます。


In [None]:
# ============================================================
# 0) Setup (Colab)
# ============================================================
# 基本はColab標準で動きます。もしsklearnが無い環境なら以下を実行してください。
# !pip -q install scikit-learn pandas numpy

from __future__ import annotations

import json
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, Tuple, Any, List, Optional

import numpy as np
import pandas as pd

from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# Colabで display を使うため
from IPython.display import display

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed as `random_state` when the GaussianMixture is constructed below.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)


In [None]:
# ============================================================
# 1) Config (this is the only cell you normally need to edit)
# ============================================================
# All tunable settings of the notebook live in this dataclass; the later
# cells read them through the module-level CFG instance created below.
@dataclass
class Config:
    # Input files (the optional upload cell may overwrite these paths).
    csv_path: str = "/content/ulysses_stream.csv"
    json_path: str = "/content/Ulysses_fixed.json"

    # Model (forwarded to sklearn's GaussianMixture; k is n_components)
    k: int = 8
    covariance_type: str = "diag"     # "diag" is recommended (stable, lightweight)
    n_init: int = 10
    max_iter: int = 500
    reg_covar: float = 1e-6

    # Boundary selection
    alpha: float = 0.05               # fraction of rows with the highest entropy to keep (0.05 = top 5%)

    # Output
    out_dir: str = "/content/out"
    turning_points_topk: int = 10
    strong_margin_max: float = 0.10     # upper bound on margin for a "strong boundary" (e.g. 0.10)
    strong_topk: int = 10               # TopK for strong boundaries
    transition_topk: int = 10           # TopK for transition points (state_change)
    shorten_text: bool = False
    maxlen: int = 160                 # only used when shorten_text=True

# Shared configuration instance used by every following cell.
CFG = Config()

print(CFG)


In [None]:
# ============================================================
# 2) (Option) Upload your data files
# ============================================================
# - Skip this cell if the files are already under /content.
# - Uploading through the "Files" pane on the left side of Colab works too.
#
# How to use:
# 1) Running the cell opens a file-picker dialog
# 2) Upload ulysses_stream.csv and Ulysses_fixed.json
# 3) CFG.csv_path / CFG.json_path are updated automatically

try:
    from google.colab import files

    uploaded = files.upload()
    # Only files whose name contains "ulysses" are considered; the extension
    # decides which config path gets overwritten.
    for fn in uploaded:
        fn_lower = fn.lower()
        if "ulysses" not in fn_lower:
            continue
        if fn_lower.endswith(".csv"):
            CFG.csv_path = f"/content/{fn}"
        if fn_lower.endswith(".json"):
            CFG.json_path = f"/content/{fn}"
    print("Updated paths:", CFG.csv_path, CFG.json_path)
except Exception as e:
    # Best effort: outside Colab the import fails and the configured paths stay as they are.
    print("Not running on Colab or upload skipped:", e)


In [None]:
# ============================================================
# 3) Helpers
# ============================================================
def pick_evidence(row: pd.Series) -> str:
    """Return the first non-blank evidence string found in ``row``.

    Candidate columns are tried in a fixed priority order, Japanese evidence
    first, then English, then more generic text columns. Values that are not
    strings (e.g. NaN) or that are blank after stripping are skipped.

    Returns:
        The stripped text of the first usable column, or "" if none qualifies.
    """
    candidates = ("evidence_ja", "evidence_en", "evidence", "quote", "text", "raw_text")
    for name in candidates:
        if name not in row.index:
            continue
        value = row[name]
        if not isinstance(value, str):
            continue
        stripped = value.strip()
        if stripped:
            return stripped
    return ""

def pick_text(d: Dict[str, Any], keys: List[str]) -> str:
    """Return the stripped value of the first key in ``keys`` holding non-blank text.

    Keys are checked in the given order; missing keys, non-string values and
    whitespace-only strings are ignored. Returns "" when nothing matches.
    """
    looked_up = (d.get(key) for key in keys)
    stripped = (value.strip() for value in looked_up if isinstance(value, str))
    # Both generators are lazy, so the lookup stops at the first usable value.
    return next((text for text in stripped if text), "")

def list_feature_cols(df: pd.DataFrame) -> List[str]:
    """Collect the feature columns of ``df`` in their original column order.

    A column counts as a feature when its name starts with one of the
    prefixes ``mode_``, ``cause_``, ``place_``, ``myth_`` or ``style_``.

    Raises:
        ValueError: if no column carries any of those prefixes.
    """
    feature_prefixes = ("mode_", "cause_", "place_", "myth_", "style_")
    matched: List[str] = []
    for column in df.columns:
        if column.startswith(feature_prefixes):
            matched.append(column)
    if len(matched) == 0:
        raise ValueError("No feature columns found. Expected prefixes: mode_/cause_/place_/myth_/style_.")
    return matched

def ensure_dir(p: str) -> Path:
    """Create directory ``p`` (including missing parents) and return it as a Path.

    An already existing directory is not an error.
    """
    target = Path(p)
    target.mkdir(exist_ok=True, parents=True)
    return target

def flatten_json_spans(js: Any) -> List[Dict[str, Any]]:
    """Flatten the supported JSON layouts into one list of span dicts.

    Accepted inputs:
      * a list of chapter dicts, each carrying a ``time_series_data`` list;
      * a list of raw span dicts (dicts without ``time_series_data``);
      * a single dict with a ``time_series_data`` list.

    A ``time_series_data`` value that is missing, ``None`` or not a list/tuple
    contributes no spans instead of raising. Entries that are not dicts are
    dropped. Any other top-level input yields an empty list.
    """

    def _span_list(container: Dict[str, Any]) -> List[Any]:
        # Treat null / malformed "time_series_data" as "no spans" in every
        # branch, matching the `or []` guard the dict branch always had.
        nested = container.get("time_series_data")
        return list(nested) if isinstance(nested, (list, tuple)) else []

    spans: List[Any] = []
    if isinstance(js, list):
        for item in js:
            if not isinstance(item, dict):
                continue
            if "time_series_data" in item:
                spans.extend(_span_list(item))
            else:
                spans.append(item)
    elif isinstance(js, dict):
        spans = _span_list(js)
    return [s for s in spans if isinstance(s, dict)]

def build_span_index(spans: List[Dict[str, Any]]) -> Dict[Tuple[int, int], Dict[str, Any]]:
    """Map ``(episode, global_step)`` integer pairs to their span dict.

    Spans lacking either key, or whose key values cannot be converted with
    ``int()``, are skipped. When several spans share a key the last one wins.
    """
    index: Dict[Tuple[int, int], Dict[str, Any]] = {}
    for span in spans:
        episode, step = span.get("episode"), span.get("global_step")
        if episode is None or step is None:
            continue
        try:
            key = (int(episode), int(step))
        except Exception:
            # Unconvertible key values: leave this span out of the index.
            continue
        index[key] = span
    return index

def infer_keys(df: pd.DataFrame) -> Tuple[str, str]:
    """Pick the ``(episode_col, step_col)`` pair to use on the CSV side.

    ``("episode", "global_step")`` is preferred; ``("chapter", "span_id")`` is
    the fallback. A pair is only chosen when both of its columns exist.
    Returns ``("", "")`` when neither pair is complete.
    """
    available = df.columns
    candidate_pairs = (
        ("episode", "global_step"),  # preferred naming
        ("chapter", "span_id"),      # alternative naming
    )
    for episode_name, step_name in candidate_pairs:
        if episode_name in available and step_name in available:
            return episode_name, step_name
    return "", ""

def shorten(s: str, maxlen: int) -> str:
    """Collapse ``s`` onto one line and cut it to ``maxlen`` characters.

    Newlines become spaces and surrounding whitespace is stripped. Text longer
    than ``maxlen`` is cut to its first ``maxlen`` characters with "…"
    appended. A falsy ``s`` is treated as the empty string.
    """
    flat = ("" if not s else s).replace("\n", " ").strip()
    if len(flat) > maxlen:
        return flat[:maxlen] + "…"
    return flat


In [None]:
# ============================================================
# 4) Load CSV
# ============================================================
csv_path = Path(CFG.csv_path)
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"CSV not found: {csv_path}")

# Feature matrix: every column whose name carries a known feature prefix.
feature_cols = list_feature_cols(df)
X = df.loc[:, feature_cols].to_numpy()

# Key columns used later to look up JSON spans ("" when the CSV has none).
episode_col, step_col = infer_keys(df)

print("rows:", len(df))
print("feature_cols:", len(feature_cols))
print("csv keys:", (episode_col, step_col))

df.head(3)


In [None]:
# ============================================================
# 5) Fit GMM + compute posterior entropy / margin
# ============================================================
# Standardize the features, then fit the mixture on the scaled matrix.
scaler = StandardScaler()
Xz = scaler.fit_transform(X)

gmm = GaussianMixture(
    n_components=CFG.k,
    covariance_type=CFG.covariance_type,
    n_init=CFG.n_init,
    max_iter=CFG.max_iter,
    reg_covar=CFG.reg_covar,
    random_state=RANDOM_SEED,
)
gmm.fit(Xz)

# Posterior responsibilities, shape (N, K); the 1e-12 keeps log() finite at 0.
resp = gmm.predict_proba(Xz)
entropy = -np.sum(resp * np.log(resp + 1e-12), axis=1)

# MAP state per row (0-based and 1-based) and whether it differs from the previous row.
gmm_state = np.argmax(resp, axis=1)
gmm_state_1based = gmm_state + 1
state_change = np.concatenate(([False], gmm_state[1:] != gmm_state[:-1]))

# Two most probable states per row and the gap between their probabilities.
order = np.argsort(-resp, axis=1)  # descending probability
s1, s2 = order[:, 0], order[:, 1]
row_ids = np.arange(len(df))
p1 = resp[row_ids, s1]
p2 = resp[row_ids, s2]
margin = p1 - p2

print("entropy stats:",
      "min=", float(entropy.min()),
      "median=", float(np.median(entropy)),
      "p95=", float(np.quantile(entropy, 0.95)),
      "max=", float(entropy.max()))

print("state_change rate:", float(state_change.mean()))


In [None]:
# ============================================================
# 6) boundary candidates: entropy top ALPHA%
# ============================================================
if not (0 < CFG.alpha <= 1):
    raise ValueError("alpha must be in (0, 1].")

# Number of rows to keep, rounded up so that at least one row is selected.
n_top = int(np.ceil(CFG.alpha * len(df)))

# Row positions ordered from highest to lowest entropy, cut to the first n_top.
entropy_desc = np.argsort(entropy)[::-1]
top_idx = entropy_desc[:n_top]

print("ALPHA:", CFG.alpha, "=> n_top:", n_top)
print("top entropy:", float(entropy[top_idx[0]]), "at row", int(top_idx[0]))


In [None]:
# ============================================================
# 7) Load JSON + index spans
# ============================================================
json_path = Path(CFG.json_path)
if not json_path.exists():
    raise FileNotFoundError(f"JSON not found: {json_path}")

js = json.loads(json_path.read_text(encoding="utf-8"))

# Flatten whichever layout the file uses, then index spans by (episode, global_step).
spans = flatten_json_spans(js)
span_index = build_span_index(spans)

print("json spans:", len(spans))
print("indexed spans:", len(span_index))


In [None]:
# ============================================================
# 8) Build boundary_report.csv (evidence prev/next + json span text)
# ============================================================
rows = []
missing_json = 0
has_transition_type = "transition_type" in df.columns
last_row = len(df) - 1

for i in map(int, top_idx):
    # CSV-side keys; without key columns fall back to -1 / the row position.
    ep = int(df.loc[i, episode_col]) if episode_col else -1
    gs = int(df.loc[i, step_col]) if step_col else i

    tt = df.loc[i, "transition_type"] if has_transition_type else ""

    # Evidence of the row itself and of its neighbours ("" at the table edges).
    ev = pick_evidence(df.loc[i])
    ev_prev = pick_evidence(df.loc[i - 1]) if i > 0 else ""
    ev_next = pick_evidence(df.loc[i + 1]) if i < last_row else ""

    # Matching JSON span; count the miss and continue with empty JSON fields.
    s = span_index.get((ep, gs))
    if s is None:
        missing_json += 1
        s = {}

    rows.append({
        "row_index": i,
        "episode": ep,
        "global_step": gs,
        "transition_type": tt,

        "entropy": float(entropy[i]),
        "gmm_state_1based": int(gmm_state_1based[i]),
        "state_change": bool(state_change[i]),

        "top1_state_1based": int(s1[i] + 1),
        "top1_p": float(p1[i]),
        "top2_state_1based": int(s2[i] + 1),
        "top2_p": float(p2[i]),
        "margin": float(margin[i]),

        "evidence_prev": ev_prev,
        "evidence": ev,
        "evidence_next": ev_next,

        "json_span_text_en": pick_text(s, ["span_text_en", "text_en"]),
        "json_span_text_ja": pick_text(s, ["span_text_ja", "text_ja"]),
        "json_evidence_en": pick_text(s, ["evidence_en"]),
        "json_evidence_ja": pick_text(s, ["evidence_ja"]),
    })

rep = pd.DataFrame(rows)

if CFG.shorten_text:
    text_cols = ["evidence_prev", "evidence", "evidence_next", "json_span_text_en", "json_span_text_ja"]
    for c in [name for name in text_cols if name in rep.columns]:
        rep[c] = rep[c].astype(str).map(lambda x: shorten(x, CFG.maxlen))

out_dir = ensure_dir(CFG.out_dir)
rep_path = out_dir / "boundary_report.csv"
rep.to_csv(rep_path, index=False, encoding="utf-8-sig")

print("saved:", rep_path)
print("json span missing for", missing_json, "rows (key mismatch or partial json).")

display(rep.head(10))


In [None]:
# ============================================================
# 9) turning_points_topK.csv/tsv (paper-ready table)
#    - sort by entropy desc, tie-break by margin asc
# ============================================================
# One row per CSV row. Without key columns, episode is -1 and global_step is
# the row position, mirroring the boundary report.
tp = pd.DataFrame({
    "episode": df[episode_col].astype(int) if episode_col else pd.Series([-1]*len(df)),
    "global_step": df[step_col].astype(int) if step_col else pd.Series(np.arange(len(df))),

    "transition_type": df["transition_type"] if "transition_type" in df.columns else "",
    "entropy": entropy,
    "gmm_state_1based": (gmm_state + 1).astype(int),

    "top1_state_1based": (s1 + 1).astype(int),
    "top1_p": p1.astype(float),
    "top2_state_1based": (s2 + 1).astype(int),
    "top2_p": p2.astype(float),
    "margin": margin.astype(float),

    "evidence_best": df.apply(pick_evidence, axis=1),
})

# Sorting keeps the original index labels, so rows can still be selected by
# their original row position with .loc afterwards.
tp = tp.sort_values(["entropy", "margin"], ascending=[False, True]).copy()

tp_top = tp.head(CFG.turning_points_topk).copy()
tp_top.insert(0, "rank", np.arange(1, len(tp_top) + 1))

# File names follow the configured K (identical to "top10" at the default of
# 10) so that a table with a different K is not saved under a "top10" name.
tp_stem = f"turning_points_top{CFG.turning_points_topk}"
tp_csv = out_dir / f"{tp_stem}.csv"
tp_tsv = out_dir / f"{tp_stem}.tsv"
tp_top.to_csv(tp_csv, index=False, encoding="utf-8-sig")
tp_top.to_csv(tp_tsv, index=False, sep="\t", encoding="utf-8-sig")

print("saved:", tp_csv)
print("saved:", tp_tsv)

display(tp_top)

print("sanity check (should be 1..K):",
      int(tp_top["gmm_state_1based"].min()),
      int(tp_top["gmm_state_1based"].max()))


## 追加1： “強い境界”だけ版（entropy上位 AND margin小）

- boundaryness（entropy）が高く、かつ top1/top2 の差（margin）が小さい点は  
  **「2状態で本当に割れている」**＝解釈しやすい境界になりやすいです。
- ここでは **entropyが上位ALPHA%** かつ **margin <= strong_margin_max** を満たす点だけ抽出します。


In [None]:
# ============================================================
# 9A) Strong boundaries: (entropy top ALPHA%) AND (margin small)
# ============================================================
# A row is "strong" when its entropy reaches the (1 - alpha) quantile and its
# top1/top2 probability gap is at most CFG.strong_margin_max.
entropy_thr = float(np.quantile(entropy, 1.0 - CFG.alpha))
strong_mask = (entropy >= entropy_thr) & (margin <= CFG.strong_margin_max)
strong_idx = np.where(strong_mask)[0]

print("entropy_thr (top ALPHA%):", entropy_thr)
print("strong_margin_max:", CFG.strong_margin_max)
print("strong boundary count:", int(len(strong_idx)))

# If nothing qualifies, fall back to the top n_top rows by
# (entropy desc, margin asc) so the outputs below remain usable.
if len(strong_idx) == 0:
    print("[fallback] no strong boundaries found -> take top n_top by (entropy desc, margin asc)")
    tmp = pd.DataFrame({"i": np.arange(len(df)), "entropy": entropy, "margin": margin})
    tmp = tmp.sort_values(["entropy", "margin"], ascending=[False, True]).head(n_top)
    strong_idx = tmp["i"].astype(int).to_numpy()

# Strong boundary report (same schema as boundary_report.csv)
rows2 = []
missing_json2 = 0
for i in strong_idx:
    i = int(i)
    # CSV-side keys; without key columns fall back to -1 / the row position.
    ep = int(df.loc[i, episode_col]) if episode_col else -1
    gs = int(df.loc[i, step_col]) if step_col else i
    tt = df.loc[i, "transition_type"] if "transition_type" in df.columns else ""

    # Evidence of the row itself and of its neighbours ("" at the table edges).
    ev = pick_evidence(df.loc[i])
    ev_prev = pick_evidence(df.loc[i - 1]) if i - 1 >= 0 else ""
    ev_next = pick_evidence(df.loc[i + 1]) if i + 1 < len(df) else ""

    # Matching JSON span; count the miss and continue with empty JSON fields.
    s = span_index.get((ep, gs))
    if s is None:
        missing_json2 += 1
        s = {}

    rows2.append({
        "row_index": i,
        "episode": ep,
        "global_step": gs,
        "transition_type": tt,
        "entropy": float(entropy[i]),
        "gmm_state_1based": int(gmm_state_1based[i]),
        "state_change": bool(state_change[i]),
        "top1_state_1based": int(s1[i] + 1),
        "top1_p": float(p1[i]),
        "top2_state_1based": int(s2[i] + 1),
        "top2_p": float(p2[i]),
        "margin": float(margin[i]),
        "evidence_prev": ev_prev,
        "evidence": ev,
        "evidence_next": ev_next,
        "json_span_text_en": pick_text(s, ["span_text_en", "text_en"]),
        "json_span_text_ja": pick_text(s, ["span_text_ja", "text_ja"]),
        "json_evidence_en": pick_text(s, ["evidence_en"]),
        "json_evidence_ja": pick_text(s, ["evidence_ja"]),
    })

strong_rep = pd.DataFrame(rows2)
if CFG.shorten_text:
    for c in ["evidence_prev", "evidence", "evidence_next", "json_span_text_en", "json_span_text_ja"]:
        if c in strong_rep.columns:
            strong_rep[c] = strong_rep[c].astype(str).map(lambda x: shorten(x, CFG.maxlen))

strong_rep = strong_rep.sort_values(["entropy", "margin"], ascending=[False, True]).copy()

strong_rep_path = out_dir / "strong_boundary_report.csv"
strong_rep.to_csv(strong_rep_path, index=False, encoding="utf-8-sig")
print("saved:", strong_rep_path, "| json missing:", missing_json2)

display(strong_rep.head(20))

# Paper-ready TopK table for strong boundaries
strong_tp = strong_rep.copy()
strong_tp = strong_tp.rename(columns={"evidence": "evidence_best"})
strong_tp = strong_tp[[
    "episode","global_step","transition_type",
    "entropy","gmm_state_1based",
    "top1_state_1based","top1_p",
    "top2_state_1based","top2_p",
    "margin","evidence_best"
]].copy()

strong_tp = strong_tp.sort_values(["entropy","margin"], ascending=[False, True]).head(CFG.strong_topk).copy()
strong_tp.insert(0, "rank", np.arange(1, len(strong_tp) + 1))

# File names follow the configured K (identical to "top10" at the default of
# 10) so that a table with a different K is not saved under a "top10" name.
strong_tp_stem = f"strong_turning_points_top{CFG.strong_topk}"
strong_tp_csv = out_dir / f"{strong_tp_stem}.csv"
strong_tp_tsv = out_dir / f"{strong_tp_stem}.tsv"
strong_tp.to_csv(strong_tp_csv, index=False, encoding="utf-8-sig")
strong_tp.to_csv(strong_tp_tsv, index=False, sep="\t", encoding="utf-8-sig")
print("saved:", strong_tp_csv)
print("saved:", strong_tp_tsv)
display(strong_tp)


## 追加2： 遷移点（state_change）優先版（MAPが変わった点のみランキング）

- `gmm_state = argmax(resp)` の **MAP状態が前後で変わった点**だけに絞ってランキングします。
- ランキングは **entropy降順**、同点なら **margin昇順**（=より混合が強い点を優先）です。


In [None]:
# ============================================================
# 9B) Transition points: state_change == True only
# ============================================================
# Row positions where the MAP state differs from the previous row.
trans_idx = np.where(state_change)[0]
print("transition points (state_change=True):", int(len(trans_idx)))

# `tp` was sorted but keeps its original index labels, so .loc selects these
# rows by their original row position.
trans_tp = tp.loc[trans_idx].copy()
trans_tp = trans_tp.sort_values(["entropy", "margin"], ascending=[False, True]).head(CFG.transition_topk).copy()
trans_tp.insert(0, "rank", np.arange(1, len(trans_tp) + 1))

# File names follow the configured K (identical to "top10" at the default of
# 10) so that a table with a different K is not saved under a "top10" name.
trans_stem = f"transition_points_top{CFG.transition_topk}"
trans_csv = out_dir / f"{trans_stem}.csv"
trans_tsv = out_dir / f"{trans_stem}.tsv"
trans_tp.to_csv(trans_csv, index=False, encoding="utf-8-sig")
trans_tp.to_csv(trans_tsv, index=False, sep="\t", encoding="utf-8-sig")

print("saved:", trans_csv)
print("saved:", trans_tsv)
display(trans_tp)


In [None]:
# ============================================================
# 10) Quick sanity summary (optional)
# ============================================================
# Headline numbers of the run, shown as a one-row table.
boundary_n = len(rep)
json_missing_rate = missing_json / max(1, boundary_n)  # max() avoids dividing by zero

summary = {
    "N": int(len(df)),
    "K": int(CFG.k),
    "alpha": float(CFG.alpha),
    "boundary_n": int(boundary_n),
    "entropy_max": float(entropy.max()),
    "entropy_p95": float(np.quantile(entropy, 0.95)),
    "entropy_median": float(np.median(entropy)),
    "state_change_rate": float(state_change.mean()),
    "boundary_state_change_rate": float(rep["state_change"].mean()),
    "json_missing_rate_in_boundary": float(json_missing_rate),
}
pd.DataFrame([summary])


## 出力物

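- `out/boundary_report.csv`  
  entropy上位(ALPHA%)の候補。前後evidence、top2 states、margin、JSON側span_text付き。

- `out/turning_points_top10.csv` / `out/turning_points_top10.tsv`  
  本文に貼る用の TopK 表。

- `out/strong_boundary_report.csv`  
  追加1の“強い境界”（entropy上位ALPHA% かつ margin <= strong_margin_max）のレポート。列構成は `boundary_report.csv` と同じ。

- `out/strong_turning_points_top10.csv` / `out/strong_turning_points_top10.tsv`  
  “強い境界”だけを対象にした TopK 表。

- `out/transition_points_top10.csv` / `out/transition_points_top10.tsv`  
  追加2の遷移点（state_change == True）だけを対象にした TopK 表。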

## GitHubに上げるときのおすすめ

- ノートブック名（例）: `01_ulysses_boundary_report.ipynb`
- 依存を減らすため、外部自作モジュール無しで完結させています。
- データファイルは `.gitignore` で除外し、READMEに「Colabでアップロードして動かす」手順を書くのが安全です。
