In [1]:
# -*- coding: utf-8 -*-
import os, json, torch
import numpy as np
import torch.nn.functional as F
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from typing import Dict, Any
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from peft import PeftModel
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score
from sklearn.preprocessing import label_binarize

# ================== 配置 ==================
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"
MODEL_DIR    = "qwen-test-on-pandora_new"                # 你的模型主目录
ADAPTER_DIR  = os.path.join(MODEL_DIR, "lora_adapter")   # LoRA 适配器路径

VAL_PATH     = "val.json"     # 新的验证集路径（你改这里）
TEST_PATH    = "test.json"      # 新的测试集路径（你改这里）

OUTPUT_DIR   = "eval_new_val_test"  # 输出目录（混淆矩阵、ROC）

MAX_LEN      = 384
BATCH_SIZE   = 16

MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# ================== 工具函数 ==================
def truncate_to_budget(tok, text, budget):
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"][: budget]
    return tok.decode(ids)

BUDGET = {"posts_cleaned": 192, "semantic_view": 64, "sentiment_view": 32, "linguistic_view": 24}

def build_input(item: Dict[str, Any], tok) -> str:
    p_raw = item.get("posts_cleaned", item.get("posts","")) or item.get("text","") or ""
    sem   = item.get("semantic_view","")  or ""
    sen   = item.get("sentiment_view","") or ""
    lin   = item.get("linguistic_view","") or ""

    p   = truncate_to_budget(tok, p_raw, BUDGET["posts_cleaned"])
    sem = truncate_to_budget(tok, sem,   BUDGET["semantic_view"])
    sen = truncate_to_budget(tok, sen,   BUDGET["sentiment_view"])
    lin = truncate_to_budget(tok, lin,   BUDGET["linguistic_view"])

    return (
        f"[POSTS]\n{p}\n[SEMANTIC]\n{sem}\n[SENTIMENT]\n{sen}\n[LINGUISTIC]\n{lin}\n"
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    )

def load_rows(path: str):
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        t = (r.get("type") or r.get("label") or "").upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

class MBTIDataset(torch.utils.data.Dataset):
    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok  = tokenizer
        self.max_len = max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, idx):
        it   = self.rows[idx]
        text = build_input(it, self.tok)
        y    = MBTI2ID[it["type"]]
        enc  = self.tok(text, truncation=True, max_length=self.max_len)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": y}

def plot_confusion_and_roc(y_true, y_prob, class_names, out_dir, tag="eval"):
    os.makedirs(out_dir, exist_ok=True)
    y_pred = np.argmax(y_prob, axis=-1)
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 8), dpi=150)
    disp.plot(ax=ax_cm, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax_cm.set_title(f"Confusion Matrix ({tag})")
    fig_cm.tight_layout()
    fig_cm.savefig(os.path.join(out_dir, f"{tag}_confusion_matrix.png"))
    plt.close(fig_cm)

    # ROC
    Y_true_bin = label_binarize(y_true, classes=list(range(len(class_names))))
    fpr, tpr, roc_auc = {}, {}, {}
    valid = []
    for i in range(len(class_names)):
        if Y_true_bin[:, i].sum() == 0:
            continue
        fpr[i], tpr[i], _ = roc_curve(Y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        valid.append(i)
    if len(valid) >= 2:
        fpr["micro"], tpr["micro"], _ = roc_curve(
            Y_true_bin[:, valid].ravel(), y_prob[:, valid].ravel()
        )
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in valid]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in valid:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(valid)
        fpr["macro"] = all_fpr; tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        fig_roc, ax_roc = plt.subplots(figsize=(7, 7), dpi=150)
        ax_roc.plot(fpr["micro"], tpr["micro"],
                    label=f"micro-average ROC (AUC = {roc_auc['micro']:.3f})", linewidth=2)
        ax_roc.plot(fpr["macro"], tpr["macro"],
                    label=f"macro-average ROC (AUC = {roc_auc['macro']:.3f})", linewidth=2)
        ax_roc.plot([0, 1], [0, 1], "k--", linewidth=1)
        ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
        ax_roc.set_xlabel("False Positive Rate"); ax_roc.set_ylabel("True Positive Rate")
        ax_roc.set_title(f"Multiclass ROC ({tag})")
        ax_roc.legend(loc="lower right")
        fig_roc.tight_layout()
        fig_roc.savefig(os.path.join(out_dir, f"{tag}_roc_micro_macro.png"))
        plt.close(fig_roc)

# ================== 主流程 ==================
def main():
    # Tokenizer
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    # 加载基座 + LoRA
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True)
    base_cfg.num_labels = len(MBTI_16)
    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL, config=base_cfg, device_map="auto", trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)
    model = model.merge_and_unload()  # 合并权重，推理更快
    model.eval()

    # 构建数据集
    val_rows  = load_rows(VAL_PATH)
    test_rows = load_rows(TEST_PATH)
    val_ds  = MBTIDataset(val_rows,  tok, max_len=MAX_LEN)
    test_ds = MBTIDataset(test_rows, tok, max_len=MAX_LEN)
    collator = DataCollatorWithPadding(tok, pad_to_multiple_of=8)

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_eval_batch_size=BATCH_SIZE,
        report_to="none"
    )
    trainer = Trainer(model=model, args=args, tokenizer=tok, data_collator=collator)

    # 验证集评测
    val_out   = trainer.predict(val_ds)
    val_logits = val_out.predictions[0] if isinstance(val_out.predictions, (list, tuple)) else val_out.predictions
    val_probs = F.softmax(torch.tensor(val_logits, dtype=torch.float32), dim=-1).cpu().numpy()
    val_true  = val_out.label_ids
    val_pred  = np.argmax(val_probs, axis=-1)
    val_acc   = accuracy_score(val_true, val_pred)
    print(f"\n=== Validation Accuracy: {val_acc:.4f}")
    plot_confusion_and_roc(val_true, val_probs, MBTI_16, OUTPUT_DIR, tag="new_val")

    # 测试集评测
    test_out   = trainer.predict(test_ds)
    test_logits = test_out.predictions[0] if isinstance(test_out.predictions, (list, tuple)) else test_out.predictions
    test_probs = F.softmax(torch.tensor(test_logits, dtype=torch.float32), dim=-1).cpu().numpy()
    test_true  = test_out.label_ids
    test_pred  = np.argmax(test_probs, axis=-1)
    test_acc   = accuracy_score(test_true, test_pred)
    print(f"=== Test Accuracy: {test_acc:.4f}")
    plot_confusion_and_roc(test_true, test_probs, MBTI_16, OUTPUT_DIR, tag="new_test")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args, tokenizer=tok, data_collator=collator)


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

In [None]:
# -*- coding: utf-8 -*-
import os, json, torch
import numpy as np
import torch.nn.functional as F
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding
from peft import PeftModel
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# ===== 路径配置 =====
BASE_MODEL   = "Qwen/Qwen2.5-1.5B-Instruct"     # 训练用的基座
MODEL_DIR    = "qwen-test-on-pandora_new"       # 你的模型目录
ADAPTER_DIR  = os.path.join(MODEL_DIR, "lora_adapter")  # LoRA adapter
TEST_PATH    = "test对应的原始数据.json"     # 你要评测的新测试集
OUTPUT_DIR   = "eval_only_test"

MAX_LEN    = 384
BATCH_SIZE = 16

MBTI_16 = [
    "INTJ","INTP","ENTJ","ENTP","INFJ","INFP","ENFJ","ENFP",
    "ISTJ","ISFJ","ESTJ","ESFJ","ISTP","ISFP","ESTP","ESFP"
]
MBTI2ID = {t:i for i,t in enumerate(MBTI_16)}

# ===== 数据处理函数 =====
def truncate_to_budget(tok, text, budget):
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"][: budget]
    return tok.decode(ids)

BUDGET = {"posts_cleaned":192,"semantic_view":64,"sentiment_view":32,"linguistic_view":24}

def build_input(item, tok):
    p_raw = item.get("posts_cleaned", item.get("posts","")) or item.get("text","") or ""
    sem   = item.get("semantic_view","")  or ""
    sen   = item.get("sentiment_view","") or ""
    lin   = item.get("linguistic_view","") or ""

    p   = truncate_to_budget(tok, p_raw, BUDGET["posts_cleaned"])
    sem = truncate_to_budget(tok, sem,   BUDGET["semantic_view"])
    sen = truncate_to_budget(tok, sen,   BUDGET["sentiment_view"])
    lin = truncate_to_budget(tok, lin,   BUDGET["linguistic_view"])

    return (
        f"[POSTS]\n{p}\n[SEMANTIC]\n{sem}\n[SENTIMENT]\n{sen}\n[LINGUISTIC]\n{lin}\n"
        f"[TASK] Predict MBTI type among {', '.join(MBTI_16)}."
    )

def load_rows(path: str):
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)
    clean = []
    for r in rows:
        t = (r.get("type") or r.get("label") or "").upper().strip()
        if t in MBTI2ID:
            r["type"] = t
            clean.append(r)
    return clean

class MBTIDataset(torch.utils.data.Dataset):
    def __init__(self, rows, tokenizer, max_len=512):
        self.rows = rows
        self.tok  = tokenizer
        self.max_len = max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, idx):
        it   = self.rows[idx]
        text = build_input(it, self.tok)
        y    = MBTI2ID[it["type"]]
        enc  = self.tok(text, truncation=True, max_length=self.max_len)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": y}

# ===== 混淆矩阵可视化 =====
def plot_confusion(y_true, y_pred, class_names, out_dir, tag="test"):
    os.makedirs(out_dir, exist_ok=True)
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(8,8), dpi=150)
    disp.plot(ax=ax, xticks_rotation=45, cmap="Blues", colorbar=False)
    ax.set_title(f"Confusion Matrix ({tag})")
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, f"{tag}_confusion_matrix.png"))
    plt.close(fig)

# ===== 主流程 =====
def main():
    # Tokenizer
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    # 加载基座 + LoRA
    base_cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True)
    base_cfg.num_labels = len(MBTI_16)
    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL, config=base_cfg, device_map="auto", trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)
    model = model.merge_and_unload()
    model.config.pad_token_id = tok.pad_token_id  # 关键修复
    model.eval()

    # 数据
    test_rows = load_rows(TEST_PATH)
    test_ds   = MBTIDataset(test_rows, tok, max_len=MAX_LEN)

    # 推理
    all_logits, all_labels = [], []
    for batch in torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False,
                                             collate_fn=DataCollatorWithPadding(tok, pad_to_multiple_of=8)):
        batch = {k:v.to(model.device) for k,v in batch.items()}
        with torch.no_grad():
            out = model(**batch).logits
        all_logits.append(out.cpu())
        all_labels.append(batch["labels"].cpu())
    logits = torch.cat(all_logits).numpy()
    y_true = torch.cat(all_labels).numpy()
    y_pred = np.argmax(logits, axis=-1)

    # 结果
    acc = accuracy_score(y_true, y_pred)
    print(f"=== Test Accuracy on {TEST_PATH}: {acc:.4f}")
    plot_confusion(y_true, y_pred, MBTI_16, OUTPUT_DIR, tag="new_test")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== Test Accuracy on test对应的原始数据.json: 0.8054


: 

In [3]:
# add_case_ids.py
# -*- coding: utf-8 -*-
import json, os, hashlib

CASEBANK_IN  = "casebank_A_train_80_with_embeddings.json"   # 你的 casebank embedding 文件
CASEBANK_OUT = "casebank_A_train_80_with_ids.json"          # 输出：补了 case_id 的文件

def canon_key(text: str) -> str:
    """把文本规整成稳定键（去空白/大小写）再哈希，避免微小差异导致匹配失败。"""
    t = (text or "").strip().lower()
    t = " ".join(t.split())          # 折叠多空格/换行
    return hashlib.md5(t.encode("utf-8")).hexdigest()

with open(CASEBANK_IN, "r", encoding="utf-8") as f:
    bank = json.load(f)

for i, item in enumerate(bank):
    item["case_id"] = i  # 用顺序 index 当稳定 id

# 额外生成：文本->id 的查找表，方便下一步匹配
id_lookup = {}
for it in bank:
    key = canon_key(it.get("post_casebank") or it.get("embed_text") or it.get("posts_cleaned") or it.get("posts") or "")
    if key:
        id_lookup[key] = it["case_id"]

with open(CASEBANK_OUT, "w", encoding="utf-8") as f:
    json.dump(bank, f, ensure_ascii=False, indent=2)

with open("casebank_text2id.json", "w", encoding="utf-8") as f:
    json.dump(id_lookup, f, ensure_ascii=False, indent=2)

print(f"✅ 已写出：{CASEBANK_OUT}  和  casebank_text2id.json（文本->id 映射）")


✅ 已写出：casebank_A_train_80_with_ids.json  和  casebank_text2id.json（文本->id 映射）


In [1]:
# count_case_usage_prefix_exact.py
# -*- coding: utf-8 -*-
import json, csv, re
from collections import Counter, defaultdict

# ===== 路径（改成你的）=====
CASEBANK_FILE = "casebank_A_train_80_with_embeddings.json"  # 含 posts_cleaned 的 casebank
TOPK_FILE     = "A_test_top3.json"                          # topk=3 结果文件
OUT_COUNTS    = "case_usage_counts_prefix.csv"
OUT_PAIRS     = "test_to_cases_with_ids_prefix.jsonl"

# ===== 字段（按你的数据）=====
BANK_TEXT   = "posts_cleaned"     # casebank 的原文字段
TOPK_LIST   = "topk_cases"        # topk 列表字段
TOPK_TEXT   = "post_casebank"     # topk 每条候选里的原文字段
K           = 3                   # top-k

# ===== 匹配策略参数 =====
PREFIX_START = 80     # 诊断已证实：80 能全覆盖
PREFIX_STEP  = 40     # 不唯一时，每次增加的前缀长度
MAX_PREFIX   = 2000   # 前缀最长检查到多少字符（防止极端长文本）
CHOOSE_FIRST_IF_AMBIG = False  # True=在仍不唯一时取第一个；False=跳过以保证严格

# --- 文本归一化：只折叠空白，不改大小写 ---
WS = re.compile(r"\s+")
def norm_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())

# ---------- 读取 casebank，确保有 case_id ----------
with open(CASEBANK_FILE, "r", encoding="utf-8") as f:
    bank = json.load(f)

added_ids = 0
for i, it in enumerate(bank):
    if "case_id" not in it:
        it["case_id"] = i
        added_ids += 1

# 预计算：规范化全文 & 建立“前缀->候选id列表”的索引（只建起始前缀，后续按需收缩候选）
norm_text_by_id = {}
prefix_index = defaultdict(list)  # key = norm_text[:PREFIX_START] -> [case_id,...]

empty_cnt = 0
for it in bank:
    t = it.get(BANK_TEXT, "")
    if not isinstance(t, str) or not t:
        empty_cnt += 1
        continue
    nt = norm_space(t)
    norm_text_by_id[it["case_id"]] = nt
    key0 = nt[:PREFIX_START]
    prefix_index[key0].append(it["case_id"])

print(f"[CaseBank] 条数={len(bank)}；新增case_id={added_ids}；可索引文本={len(norm_text_by_id)}；空文本={empty_cnt}")
print(f"[Index] 唯一前缀键数={len(prefix_index)}")

# ---------- 读取 topk ----------
with open(TOPK_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# ---------- 匹配函数：空白折叠 + 递增前缀直到唯一 ----------
def resolve_case_id(raw_text: str):
    if not isinstance(raw_text, str) or not raw_text:
        return None, "empty"
    nt = norm_space(raw_text)
    pref_len = min(PREFIX_START, len(nt), MAX_PREFIX)
    # 初始候选：用起始前缀在索引里查
    cand = prefix_index.get(nt[:pref_len], [])
    if not cand:
        return None, "no_prefix_key"

    # 若只有一个候选，直接返回
    if len(cand) == 1:
        return cand[0], "ok@start"

    # 尝试递增前缀长度筛掉不匹配项
    while pref_len < min(len(nt), MAX_PREFIX):
        pref_len = min(pref_len + PREFIX_STEP, len(nt), MAX_PREFIX)
        pref = nt[:pref_len]
        cand = [cid for cid in cand if norm_text_by_id[cid].startswith(pref)]
        if len(cand) <= 1:
            break

    if len(cand) == 1:
        return cand[0], f"ok@{pref_len}"
    else:
        # 仍不唯一
        if CHOOSE_FIRST_IF_AMBIG and len(cand) > 0:
            return cand[0], f"ambig_choose_first@{pref_len}"
        return None, f"ambig_skip@{pref_len}"

# ---------- 统计 ----------
M = len(data)
total_slots = 0
matched = 0
ambig = 0
miss  = 0
counter = Counter()
pairs  = []

# 统计各匹配路径（便于排查）
route_counter = Counter()

for i, rec in enumerate(data):
    ids = []
    for c in rec.get(TOPK_LIST, [])[:K]:
        total_slots += 1
        cid, route = resolve_case_id(c.get(TOPK_TEXT, ""))
        route_counter[route] += 1
        if cid is None:
            ids.append(None)
            if route.startswith("ambig"): ambig += 1
            else: miss += 1
        else:
            ids.append(int(cid))
            counter[cid] += 1
            matched += 1
    pairs.append({"test_index": i, "topk_case_ids": ids})

cov = matched / total_slots if total_slots else 0.0
print(f"[覆盖率] matched={matched}, ambig={ambig}, miss={miss}, total={total_slots} → {cov:.2%}")
print("[匹配路径统计]（用于诊断）")
for k, v in route_counter.most_common():
    print(f"  {k:>22}: {v}")

# ---------- 输出 ----------
with open(OUT_COUNTS, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["case_id", "hits", "hit_rate_per_query", "hit_rate_over_slots"])
    for cid, hits in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        w.writerow([cid, hits, hits / M if M else 0.0, hits / total_slots if total_slots else 0.0])

with open(OUT_PAIRS, "w", encoding="utf-8") as f:
    for row in pairs:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"✅ 输出：{OUT_COUNTS} / {OUT_PAIRS}")


[CaseBank] 条数=27224；新增case_id=27224；可索引文本=27223；空文本=1
[Index] 唯一前缀键数=27120
[覆盖率] matched=10212, ambig=0, miss=0, total=10212 → 100.00%
[匹配路径统计]（用于诊断）
                ok@start: 10082
                  ok@120: 130
✅ 输出：case_usage_counts_prefix.csv / test_to_cases_with_ids_prefix.jsonl


In [2]:
# count_all_case_usage.py
# -*- coding: utf-8 -*-
import json, csv
from collections import Counter, defaultdict

# ======= 配置 =======
PAIRS_FILE    = "test_to_cases_with_ids_prefix.jsonl"   # 或 test_to_cases_with_ids_exact.jsonl
CASEBANK_FILE = "casebank_A_train_80_with_embeddings.json"  # 可选：若提供会输出 type/文本片段
TEXT_FIELD    = "posts_cleaned"     # 用于展示的文本字段（仅在提供 CASEBANK_FILE 时生效）
SHOW_TOPN     = 10                  # 终端打印前N名
OUT_CSV       = "case_usage_all.csv"

# ======= 读取 pairs（每行：{"test_index": i, "topk_case_ids": [cid0,cid1,cid2]}）=======
pairs = [json.loads(l) for l in open(PAIRS_FILE, "r", encoding="utf-8")]
M = len(pairs)
lists = [r["topk_case_ids"] for r in pairs]
total_slots = sum(len(x) for x in lists)

# 扁平化命中（忽略 None）
flat_ids = [cid for lst in lists for cid in lst if cid is not None]
hits_counter = Counter(flat_ids)

# 每个 case 被多少“不同样本”命中（去重）
unique_queries_counter = Counter()
for lst in lists:
    uniq = {cid for cid in lst if cid is not None}
    unique_queries_counter.update(uniq)

# Top1 命中次数 & 各位置分布
top1_counter = Counter()
pos_counters = defaultdict(Counter)  # pos_counters[cid][pos] 计数
for lst in lists:
    if not lst: continue
    for pos, cid in enumerate(lst):
        if cid is None: continue
        pos_counters[cid][pos] += 1
    cid0 = lst[0]
    if cid0 is not None:
        top1_counter[cid0] += 1

# ======= （可选）读取 casebank 做展示增强 =======
cid2type, cid2text = {}, {}
if CASEBANK_FILE:
    with open(CASEBANK_FILE, "r", encoding="utf-8") as f:
        bank = json.load(f)
    for i, it in enumerate(bank):
        cid = it.get("case_id", i)           # 若没写case_id，用顺序索引兜底
        cid2type[cid] = it.get("type", "")
        txt = it.get(TEXT_FIELD, "") or it.get("posts", "") or ""
        cid2text[cid] = (txt[:80].replace("\n", " ") if isinstance(txt, str) else "")

# ======= 写出总表（按 hits 降序）=======
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    header = ["case_id","hits","unique_queries","top1_hits","pos0","pos1","pos2",
              "rate_over_slots","rate_per_query","top1_rate"]
    if CASEBANK_FILE:
        header += ["type","text_head"]
    w.writerow(header)

    # 所有出现过的 id（也可以选择把未出现过的 id 一并写出，命中为0）
    all_ids = sorted(hits_counter.keys(), key=lambda x: hits_counter[x], reverse=True)

    for cid in all_ids:
        hits = hits_counter[cid]
        uq   = unique_queries_counter.get(cid, 0)
        t1   = top1_counter.get(cid, 0)
        p0   = pos_counters[cid].get(0, 0)
        p1   = pos_counters[cid].get(1, 0)
        p2   = pos_counters[cid].get(2, 0)

        rate_over_slots = hits / total_slots if total_slots else 0.0
        rate_per_query  = uq   / M if M else 0.0
        top1_rate       = t1   / M if M else 0.0

        row = [cid, hits, uq, t1, p0, p1, p2, rate_over_slots, rate_per_query, top1_rate]
        if CASEBANK_FILE:
            row += [cid2type.get(cid, ""), cid2text.get(cid, "")]
        w.writerow(row)

print(f"✅ 已写出：{OUT_CSV}")
print(f"样本数 M={M}, 总槽位={total_slots}")

# ======= 终端快速查看 TopN =======
print("\nTop cases by hits:")
for i, cid in enumerate(sorted(hits_counter.keys(), key=lambda x: hits_counter[x], reverse=True)[:SHOW_TOPN], 1):
    hits = hits_counter[cid]
    uq   = unique_queries_counter.get(cid, 0)
    t1   = top1_counter.get(cid, 0)
    p0   = pos_counters[cid].get(0, 0)
    p1   = pos_counters[cid].get(1, 0)
    p2   = pos_counters[cid].get(2, 0)
    print(f"{i:>2}. cid={cid:<6} hits={hits:<5} uq={uq:<5} top1={t1:<5} pos[0/1/2]=({p0}/{p1}/{p2})")


✅ 已写出：case_usage_all.csv
样本数 M=3404, 总槽位=10212

Top cases by hits:
 1. cid=5851   hits=13    uq=13    top1=4     pos[0/1/2]=(4/5/4)
 2. cid=6277   hits=11    uq=11    top1=7     pos[0/1/2]=(7/1/3)
 3. cid=10021  hits=10    uq=10    top1=6     pos[0/1/2]=(6/4/0)
 4. cid=18467  hits=9     uq=9     top1=3     pos[0/1/2]=(3/2/4)
 5. cid=6704   hits=9     uq=9     top1=2     pos[0/1/2]=(2/1/6)
 6. cid=1866   hits=9     uq=9     top1=2     pos[0/1/2]=(2/6/1)
 7. cid=20808  hits=9     uq=9     top1=3     pos[0/1/2]=(3/2/4)
 8. cid=3792   hits=9     uq=9     top1=3     pos[0/1/2]=(3/3/3)
 9. cid=10748  hits=8     uq=8     top1=7     pos[0/1/2]=(7/0/1)
10. cid=18716  hits=8     uq=8     top1=4     pos[0/1/2]=(4/2/2)


In [4]:
# cluster_and_highlight_from_train_topk.py
# -*- coding: utf-8 -*-
import json, csv, re, os
from collections import Counter, defaultdict
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

# ========= 路径（按需修改）=========
TRAIN_FILE   = "train.json"                                   # 权威原文库
TOPK_FILE    = "A_test_top3.json"                             # top-k 结果
EMB_FILE     = "casebank_A_train_80_with_embeddings.json"     # 含 embedding（已算好的）
OUT_PAIRS    = "pairs_case_ids.jsonl"                         # 每个样本命中的 case_id 列表
OUT_COUNTS   = "case_usage_all.csv"                           # 每个 id 的使用统计
OUT_MERGED   = "case_meta_with_hits.json"                     # id + type + text + hits + embedding
OUT_PLOT     = "cluster_highlight.png"                        # 聚类散点图

# ========= 字段名 =========
TRAIN_TEXT_FIELD = "posts_cleaned"    # train.json 用于对齐的文本字段
TOPK_LIST_FIELD  = "topk_cases"       # topk 列表字段
TOPK_TEXT_FIELD  = "post_casebank"    # topk 候选里的文本字段
TOPK_K           = 3

# ========= 匹配模式（与你的诊断一致）=========
# 'raw' 精确；'space' 仅折叠空白；'prefix80_space' 折叠空白后取前80字精确
MATCH_MODE = "prefix80_space"

_WS = re.compile(r"\s+")
def key_raw(s: str) -> str: return s
def key_space(s: str) -> str: return _WS.sub(" ", (s or "").strip())
def key_prefix80_space(s: str) -> str: return _WS.sub(" ", (s or "").strip())[:80]
KEY = {"raw": key_raw, "space": key_space, "prefix80_space": key_prefix80_space}[MATCH_MODE]

# ========= Step 1. 用 train 建立 文本->case_id 索引 =========
with open(TRAIN_FILE, "r", encoding="utf-8") as f:
    train = json.load(f)
N = len(train)

text2id = {}
id2type, id2text = {}, {}
dup_in_train = empty_in_train = 0
for i, it in enumerate(train):
    t = it.get(TRAIN_TEXT_FIELD, "")
    if not isinstance(t, str) or not t:
        empty_in_train += 1
        continue
    k = KEY(t)
    if k not in text2id:
        text2id[k] = i
    else:
        dup_in_train += 1
    id2type[i] = it.get("type", "")
    id2text[i] = t

print(f"[train] 总={N}，键数={len(text2id)}，重复文={dup_in_train}，空文本={empty_in_train}")

# ========= Step 2. 映射 topk -> case_id =========
with open(TOPK_FILE, "r", encoding="utf-8") as f:
    topk_data = json.load(f)

pairs = []
M = len(topk_data)
total_slots = matched = miss = 0
for i, rec in enumerate(topk_data):
    ids = []
    for c in rec.get(TOPK_LIST_FIELD, [])[:TOPK_K]:
        total_slots += 1
        t = c.get(TOPK_TEXT_FIELD, "")
        cid = text2id.get(KEY(t)) if isinstance(t, str) and t else None
        if cid is None:
            miss += 1; ids.append(None)
        else:
            matched += 1; ids.append(int(cid))
    pairs.append({"test_index": i, "topk_case_ids": ids})

with open(OUT_PAIRS, "w", encoding="utf-8") as f:
    for row in pairs:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
print(f"[映射topk] 覆盖率={matched/total_slots:.2%}  (matched={matched}, total={total_slots})")
print(f"→ 写出 {OUT_PAIRS}")

# ========= Step 3. 对所有 id 统计使用次数/命中率 =========
lists = [r["topk_case_ids"] for r in pairs]
flat_ids = [cid for lst in lists for cid in lst if cid is not None]

hits_counter = Counter(flat_ids)      # 槽位计数
unique_queries_counter = Counter()    # 被多少个不同样本命中
top1_counter = Counter()
pos_counters = defaultdict(Counter)

for lst in lists:
    uniq = {cid for cid in lst if cid is not None}
    unique_queries_counter.update(uniq)
    if lst and lst[0] is not None:
        top1_counter[lst[0]] += 1
    for pos, cid in enumerate(lst):
        if cid is not None:
            pos_counters[cid][pos] += 1

total_slots = sum(len(x) for x in lists)

with open(OUT_COUNTS, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["case_id","hits","unique_queries","top1_hits","pos0","pos1","pos2",
                "rate_over_slots","rate_per_query","top1_rate","type","text_head"])
    for cid in range(N):
        h  = hits_counter.get(cid, 0)
        uq = unique_queries_counter.get(cid, 0)
        t1 = top1_counter.get(cid, 0)
        p0 = pos_counters[cid].get(0, 0)
        p1 = pos_counters[cid].get(1, 0)
        p2 = pos_counters[cid].get(2, 0)
        w.writerow([
            cid, h, uq, t1, p0, p1, p2,
            h/total_slots if total_slots else 0.0,
            uq/M if M else 0.0,
            t1/M if M else 0.0,
            id2type.get(cid, ""),
            (id2text.get(cid, "")[:80].replace("\n"," "))
        ])
print(f"→ 写出 {OUT_COUNTS}")

# ========= Step 4. 合并 embedding，准备做聚类图 =========
# 说明：聚类/降维需要向量，这里直接复用你已有的 embedding 文件
with open(EMB_FILE, "r", encoding="utf-8") as f:
    emb_rows = json.load(f)

# 建立 key -> embedding 映射（与上面同一 KEY 规则）
key2emb = {}
for it in emb_rows:
    t = it.get(TRAIN_TEXT_FIELD, "") or it.get("posts", "")
    if isinstance(t, str) and t and "embedding" in it:
        k = KEY(t)
        key2emb[k] = it["embedding"]

# 组装：cid、type、text、hits、embedding
merged = []
miss_emb = 0
for cid in range(N):
    t = id2text.get(cid, "")
    k = KEY(t)
    emb = key2emb.get(k)
    if emb is None:
        miss_emb += 1
        continue
    merged.append({
        "case_id": cid,
        "type": id2type.get(cid, ""),
        "text": t,
        "hits": hits_counter.get(cid, 0),
        "embedding": emb
    })

with open(OUT_MERGED, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)
print(f"→ 合并完成：{len(merged)} 条（缺失向量 {miss_emb} 条），写出 {OUT_MERGED}")

# ========= Step 5. 降维 + 绘图（高使用高亮）=========
# 准备矩阵
X = np.array([m["embedding"] for m in merged], dtype=np.float32)
hits = np.array([m["hits"] for m in merged], dtype=np.int32)
types = [m["type"] for m in merged]
cids  = [m["case_id"] for m in merged]

# 先 PCA->50，再尝试 UMAP；没有 UMAP 就退化到 PCA2 或 TSNE(可能慢)
Z_2d = None
try:
    import umap
    X50 = PCA(n_components=50, random_state=0).fit_transform(X)
    reducer = umap.UMAP(n_neighbors=30, min_dist=0.1, metric="cosine", random_state=0)
    Z_2d = reducer.fit_transform(X50)
except Exception:
    try:
        Z_2d = PCA(n_components=2, random_state=0).fit_transform(X)
    except Exception:
        from sklearn.manifold import TSNE
        Z_2d = TSNE(n_components=2, random_state=0, perplexity=30, init="pca").fit_transform(X)

# 高亮规则：命中 TopN 或 hits >= 分位数阈值
TOPN_HIGHLIGHT = 300
Q_PERCENTILE   = 99  # 或设为 None 只用 TopN
thr = np.percentile(hits, Q_PERCENTILE) if Q_PERCENTILE is not None else None
order = np.argsort(-hits)
mask_high = np.zeros_like(hits, dtype=bool)
mask_high[order[:TOPN_HIGHLIGHT]] = True
if thr is not None:
    mask_high |= (hits >= thr)

# 绘图
plt.figure(figsize=(9, 7), dpi=150)
# 背景（低使用）
plt.scatter(Z_2d[~mask_high, 0], Z_2d[~mask_high, 1],
            s=5, alpha=0.25, linewidths=0, label="others")
# 高使用
plt.scatter(Z_2d[mask_high, 0], Z_2d[mask_high, 1],
            s=18, alpha=0.9, linewidths=0.5, edgecolors="k", label="high-usage")

plt.title("Casebank clustering (high-usage highlighted)")
plt.legend(loc="best")
plt.tight_layout()
plt.savefig(OUT_PLOT)
print(f"✅ 聚类图已保存：{OUT_PLOT}")

#（可选）标注前几十个典型点
ANNOTATE_TOPK = 40
for idx in order[:ANNOTATE_TOPK]:
    x, y = Z_2d[idx]
    lbl = f"{cids[idx]} | {types[idx]}"
    plt.text(x, y, lbl, fontsize=6)
plt.tight_layout()
plt.savefig(OUT_PLOT.replace(".png","_annotated.png"))
print(f"✅ 已保存带标注版本：{OUT_PLOT.replace('.png','_annotated.png')}")


[train] 总=27224，键数=27120，重复文=103，空文本=1
[映射topk] 覆盖率=100.00%  (matched=10212, total=10212)
→ 写出 pairs_case_ids.jsonl
→ 写出 case_usage_all.csv
→ 合并完成：27223 条（缺失向量 1 条），写出 case_meta_with_hits.json
✅ 聚类图已保存：cluster_highlight.png
✅ 已保存带标注版本：cluster_highlight_annotated.png


In [14]:
# print_topN_with_post.py
# -*- coding: utf-8 -*-
import json, csv

COUNTS_FILE = "case_usage_all.csv"   # 上一步输出
TRAIN_FILE  = "train.json"           # 权威原文库
TRAIN_TEXT_FIELD = "posts_cleaned"   # 如需换字段，改这里

TOPN = 10                            # 要查看的前N
EXPORT_ALL_JSONL = True              # 是否导出所有id到 JSONL（含完整post）
ALL_JSONL_FILE   = "all_cases_with_post.jsonl"
TOPN_TXT_FILE    = "topN_cases_with_post.txt"

# 1) 读 train.json，建立 id -> post / type
with open(TRAIN_FILE, "r", encoding="utf-8") as f:
    train = json.load(f)
id2post = {}
id2type = {}
for i, it in enumerate(train):
    id2post[i] = it.get(TRAIN_TEXT_FIELD, "") or it.get("posts", "") or ""
    id2type[i] = it.get("type", "")

# 2) 读统计表
rows = []
with open(COUNTS_FILE, "r", encoding="utf-8") as f:
    r = csv.DictReader(f)
    for row in r:
        row["case_id"] = int(row["case_id"])
        row["hits"] = int(row["hits"])
        row["unique_queries"] = int(row["unique_queries"])
        row["top1_hits"] = int(row["top1_hits"])
        row["pos0"] = int(row["pos0"])
        row["pos1"] = int(row["pos1"])
        row["pos2"] = int(row["pos2"])
        rows.append(row)

# 3) 排序：hits desc -> top1_hits desc -> unique_queries desc
rows.sort(key=lambda x: (x["hits"], x["top1_hits"], x["unique_queries"]), reverse=True)

# 4) 终端打印摘要 + 写入带post的TXT
print("Top cases by hits:")
out_lines = ["Top cases by hits:"]
for i, r in enumerate(rows[:TOPN], 1):
    cid = r["case_id"]
    post = id2post.get(cid, "")
    line = (f"{i:>2}. cid={cid}  "
            f"hits={r['hits']}  uq={r['unique_queries']}  top1={r['top1_hits']}  "
            f"pos[0/1/2]=({r['pos0']}/{r['pos1']}/{r['pos2']})")
    print(line)
    out_lines.append(line)
    out_lines.append(f"[type]={id2type.get(cid,'')}")
    out_lines.append(post)          # 完整 posts_cleaned
    out_lines.append("")            # 空行分隔

with open(TOPN_TXT_FILE, "w", encoding="utf-8") as f:
    f.write("\n".join(out_lines))
print(f"✅ 已保存：{TOPN_TXT_FILE}")

# 5) （可选）导出所有id到JSONL，便于后续做可视化/检索
if EXPORT_ALL_JSONL:
    with open(ALL_JSONL_FILE, "w", encoding="utf-8") as f:
        for r in rows:
            cid = r["case_id"]
            rec = {
                "case_id": cid,
                "type": id2type.get(cid, ""),
                "post": id2post.get(cid, ""),
                "hits": r["hits"],
                "unique_queries": r["unique_queries"],
                "top1_hits": r["top1_hits"],
                "pos0": r["pos0"], "pos1": r["pos1"], "pos2": r["pos2"],
                "rate_over_slots": float(r["rate_over_slots"]),
                "rate_per_query": float(r["rate_per_query"]),
                "top1_rate": float(r["top1_rate"]),
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"✅ 已保存：{ALL_JSONL_FILE}")


Top cases by hits:
 1. cid=5851  hits=13  uq=13  top1=4  pos[0/1/2]=(4/5/4)
 2. cid=6277  hits=11  uq=11  top1=7  pos[0/1/2]=(7/1/3)
 3. cid=10021  hits=10  uq=10  top1=6  pos[0/1/2]=(6/4/0)
 4. cid=698  hits=9  uq=8  top1=5  pos[0/1/2]=(5/3/1)
 5. cid=3792  hits=9  uq=9  top1=3  pos[0/1/2]=(3/3/3)
 6. cid=18467  hits=9  uq=9  top1=3  pos[0/1/2]=(3/2/4)
 7. cid=20808  hits=9  uq=9  top1=3  pos[0/1/2]=(3/2/4)
 8. cid=1866  hits=9  uq=9  top1=2  pos[0/1/2]=(2/6/1)
 9. cid=6704  hits=9  uq=9  top1=2  pos[0/1/2]=(2/1/6)
10. cid=10748  hits=8  uq=8  top1=7  pos[0/1/2]=(7/0/1)
✅ 已保存：topN_cases_with_post.txt
✅ 已保存：all_cases_with_post.jsonl


In [6]:
# top4_posts_ISTP_ISFJ_from_jsonl.py
import json

JSONL = "all_cases_with_post.jsonl"
TYPES = ["ISTP", "ISFJ"]
TOPK  = 4

# 读入
items = []
with open(JSONL, "r", encoding="utf-8") as f:
    for line in f:
        items.append(json.loads(line))

# 逐 type 过滤 + 排序 + 取前K
out_lines = []
for t in TYPES:
    sub = [x for x in items if x.get("type") == t]
    sub.sort(key=lambda x: (x.get("hits", 0), x.get("top1_hits", 0), x.get("unique_queries", 0)), reverse=True)
    top = sub[:TOPK]
    out_lines.append(f"===== {t} | top{TOPK} by hits =====")
    for i, r in enumerate(top, 1):
        out_lines.append(f"{i}. cid={r['case_id']} hits={r['hits']} uq={r.get('unique_queries',0)} top1={r.get('top1_hits',0)}")
        out_lines.append(r.get("post", ""))  # posts_cleaned
        out_lines.append("")
    out_lines.append("")

# 打印 + 保存
print("\n".join(out_lines))
with open("top4_posts_ISTP_ISFJ.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(out_lines))
print("✅ 已保存: top4_posts_ISTP_ISFJ.txt")


===== ISTP | top4 by hits =====
1. cid=18467 hits=9 uq=9 top1=3
people say istps cold unemotional lazy thinking process feeling display fix problem instead talking want drama elsewhere give task challenge space alone recharge show made need explain every move thought observing adapting core emotion come get measured analyzed either used discarded fuss extra noise yeah sometimes get impatient social nicety small talk waste time serve purpose prefer straight answer action want understand watch handle pressure fix something broken real story end fitting mold proving something others living life make sense cutting bullshit

2. cid=15075 hits=5 uq=5 top1=4
cut noise something need fixing figure broken fix waste time whining explaining people talk much little chase approval drama keep circle small quality quantity say done mean game feeling fact action want advice ready work otherwise save breath life short nonsense overthink past future focus front tool task result trust judgment anyone els

In [17]:
# plot_casebank_highusage_top10.py
# -*- coding: utf-8 -*-
"""
功能：
- 读取合并文件 case_meta_with_hits.json（若不存在则自动由 counts+embeddings+train 构建）
- PCA(50)->UMAP(2)（没装 umap 就退化到 PCA2；再不行用 t-SNE）
- 背景点淡化，高使用点高亮
- 仅标注 Top10，并用 adjustText 自动避让（无该库则回退到偏移+箭头）
- 导出 PDF/SVG/600dpi PNG

可修改参数见【参数区】。
"""
import os, json, csv, re, random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# ======================= 参数区 =======================
MERGED_JSON = "case_meta_with_hits.json"                 # 优先读取（不存在则自动构建）
COUNTS_CSV  = "case_usage_all.csv"                       # 构建所需：命中统计
EMB_JSON    = "casebank_A_train_80_with_embeddings.json" # 构建所需：embedding 文件
TRAIN_JSON  = "train.json"                               # 构建所需：原始文本
TEXT_FIELD  = "posts_cleaned"                            # 文本字段名（train/embedding）

TOPN_HIGHLIGHT = 300     # 高使用点高亮：命中次数Top-N（None仅按分位阈值）
Q_PERCENTILE   = 99      # 或命中次数≥该分位也高亮（None关闭）
LABEL_TOPN     = 10      # 只标注前10个
LABEL_SCOPE    = "global"  # "global" 在全体里选Top10；"mask" 只在高使用里选Top10
LABEL_FMT      = "{cid}|{typ}"  # 标注文本格式
SEED           = 0

OUT_PDF       = "casebank_umap_highusage.pdf"
OUT_SVG       = "casebank_umap_highusage.svg"
OUT_PNG       = "casebank_umap_highusage.png"
OUT_PNG_ANNOT = "casebank_umap_highusage_annotated.png"
# =====================================================

# 工具函数
WS = re.compile(r"\s+")
def key_prefix80_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())[:80]

def set_seeds(seed=0):
    random.seed(seed); np.random.seed(seed)

try:
    import umap  # noqa: F401
    HAS_UMAP = True
except Exception:
    HAS_UMAP = False

def build_merged_from_sources():
    """当 MERGED_JSON 不存在时，从 COUNTS_CSV + EMB_JSON + TRAIN_JSON 合并。"""
    # 1) train：id->type/text/key
    with open(TRAIN_JSON, "r", encoding="utf-8") as f:
        train = json.load(f)
    id2type, id2text, id2key = {}, {}, {}
    for i, it in enumerate(train):
        t = it.get(TEXT_FIELD, "") or it.get("posts", "")
        id2type[i] = it.get("type", "")
        id2text[i] = t
        id2key[i]  = key_prefix80_space(t)

    # 2) embeddings：key->emb
    with open(EMB_JSON, "r", encoding="utf-8") as f:
        emb_rows = json.load(f)
    key2emb = {}
    for it in emb_rows:
        t = it.get(TEXT_FIELD, "") or it.get("posts", "")
        if isinstance(t, str) and t and "embedding" in it:
            key2emb[key_prefix80_space(t)] = it["embedding"]

    # 3) 命中统计：cid->hits
    hits = {}
    with open(COUNTS_CSV, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            cid = int(row["case_id"])
            hits[cid] = int(row["hits"])

    # 4) 合并
    merged, miss_emb = [], 0
    for cid in range(len(train)):
        k = id2key.get(cid, None)
        emb = key2emb.get(k)
        if emb is None:
            miss_emb += 1
            continue
        merged.append({
            "case_id": cid,
            "type": id2type.get(cid, ""),
            "text": id2text.get(cid, ""),
            "hits": hits.get(cid, 0),
            "embedding": emb
        })
    with open(MERGED_JSON, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"[merge] 构建 merged：{len(merged)} 条（缺失向量 {miss_emb} 条）→ {MERGED_JSON}")
    return merged

def load_merged():
    if os.path.exists(MERGED_JSON):
        with open(MERGED_JSON, "r", encoding="utf-8") as f:
            merged = json.load(f)
        print(f"[load] 读取 merged：{len(merged)} 条 ← {MERGED_JSON}")
        return merged
    print("[load] 未找到 case_meta_with_hits.json，自动从三源合并构建…")
    return build_merged_from_sources()

def main():
    set_seeds(SEED)
    merged = load_merged()
    if not merged:
        raise RuntimeError("没有可用数据，请检查输入路径。")

    # 数据取出
    X = np.array([m["embedding"] for m in merged], dtype=np.float32)
    hits = np.array([m["hits"] for m in merged], dtype=np.int32)
    types = [m.get("type","") for m in merged]
    cids  = [m["case_id"] for m in merged]

    # 降维：PCA(50)->UMAP(2)；无 UMAP 则 PCA2；再不行用 TSNE
    try:
        X50 = PCA(n_components=min(50, max(2, X.shape[1]-1)), random_state=SEED).fit_transform(X)
        if HAS_UMAP:
            import umap
            reducer = umap.UMAP(n_neighbors=30, min_dist=0.1, metric="cosine", random_state=SEED)
            Z_2d = reducer.fit_transform(X50)
        else:
            Z_2d = PCA(n_components=2, random_state=SEED).fit_transform(X50)
    except Exception:
        from sklearn.manifold import TSNE
        Z_2d = TSNE(n_components=2, random_state=SEED, perplexity=30, init="pca").fit_transform(X)

    # 高使用集合 mask
    N = len(hits)
    order = np.argsort(-hits)
    mask = np.zeros(N, dtype=bool)
    if TOPN_HIGHLIGHT:
        mask[order[:min(TOPN_HIGHLIGHT, N)]] = True
    if Q_PERCENTILE is not None:
        thr = float(np.percentile(hits, Q_PERCENTILE))
        mask |= (hits >= thr)

    # 绘图（论文风格）
    fig = plt.figure(figsize=(6.5, 4.5), dpi=300)  # 单栏尺寸
    ax = plt.gca()
    ax.scatter(Z_2d[~mask, 0], Z_2d[~mask, 1], s=4,  alpha=0.15, linewidths=0, label="others")
    ax.scatter(Z_2d[mask, 0],  Z_2d[mask, 1],  s=18, alpha=0.9,  linewidths=0.4, edgecolors="k", label="high-usage")
    ax.set_xlabel("UMAP-1"); ax.set_ylabel("UMAP-2")
    ax.legend(frameon=False, loc="upper left")
    ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
    plt.tight_layout()
    plt.savefig(OUT_PDF); plt.savefig(OUT_SVG); plt.savefig(OUT_PNG, dpi=600)
    print(f"[save] 图已保存：{OUT_PDF} / {OUT_SVG} / {OUT_PNG}")

    # —— 只标注 Top10，并尽量避免遮挡 —— #
    if LABEL_TOPN and LABEL_TOPN > 0:
        if LABEL_SCOPE == "mask":
            mask_idx  = np.where(mask)[0]
            top_idx   = mask_idx[np.argsort(-hits[mask])][:min(LABEL_TOPN, len(mask_idx))]
        else:  # global
            top_idx = order[:min(LABEL_TOPN, N)]

        # 优先使用 adjustText 自动避让
        used_adjust = False
        try:
            from adjustText import adjust_text
            texts = []
            for idx in top_idx:
                x, y = Z_2d[idx]
                lbl = LABEL_FMT.format(cid=cids[idx], typ=types[idx])
                t = ax.text(
                    x, y, lbl, fontsize=7, zorder=5,
                    bbox=dict(facecolor="white", alpha=0.85, lw=0, pad=0.6)
                )
                texts.append(t)
            adjust_text(
                texts,
                only_move={'points': 'y', 'text': 'xy'},
                expand_points=(1.2, 1.2), expand_text=(1.2, 1.2),
                arrowprops=dict(arrowstyle='-', lw=0.6, color='0.25', alpha=0.8)
            )
            used_adjust = True
        except Exception:
            # 回退方案：固定少量偏移 + 细箭头
            offsets = [(12,6), (-12,6), (12,-6), (-12,-6), (18,0),
                       (-18,0), (0,10), (0,-10), (20,8), (-20,8)]
            for off, idx in zip(offsets, top_idx):
                x, y = Z_2d[idx]
                lbl = LABEL_FMT.format(cid=cids[idx], typ=types[idx])
                ax.annotate(
                    lbl, xy=(x, y), xycoords='data',
                    xytext=off, textcoords='offset points',
                    fontsize=7, zorder=5,
                    bbox=dict(facecolor="white", alpha=0.85, lw=0, pad=0.6),
                    arrowprops=dict(arrowstyle='-', lw=0.6, color='0.25', alpha=0.8)
                )

        plt.tight_layout()
        plt.savefig(OUT_PNG_ANNOT, dpi=600)
        print(f"[save] 带标注版本：{OUT_PNG_ANNOT}  ({'adjustText' if used_adjust else 'fallback offsets'})")

    # 图注模板（粘到论文里）
    desc = (
        "2D visualization of casebank embeddings using PCA(50)→UMAP(2) "
        f"(n_neighbors=30, min_dist=0.1, cosine, random_state={SEED}). "
        f"Orange points denote high-usage cases (Top-{TOPN_HIGHLIGHT} ∪ {Q_PERCENTILE}th percentile). "
        f"Labels show Top-{LABEL_TOPN} by hits with overlap avoidance."
    )
    print("\n[FIGURE CAPTION SUGGESTION]\n" + desc)

if __name__ == "__main__":
    main()


[load] 读取 merged：27223 条 ← case_meta_with_hits.json
[save] 图已保存：casebank_umap_highusage.pdf / casebank_umap_highusage.svg / casebank_umap_highusage.png
[save] 带标注版本：casebank_umap_highusage_annotated.png  (fallback offsets)

[FIGURE CAPTION SUGGESTION]
2D visualization of casebank embeddings using PCA(50)→UMAP(2) (n_neighbors=30, min_dist=0.1, cosine, random_state=0). Orange points denote high-usage cases (Top-300 ∪ 99th percentile). Labels show Top-10 by hits with overlap avoidance.


In [1]:
# plot_types_tsne_starburst_top1_id_only.py
# -*- coding: utf-8 -*-
"""
t-SNE（天女散花）可视化：
- 读取 case_meta_with_hits.json（若不存在则自动由 counts+embeddings+train 合并生成）
- 16个MBTI类型分别着色（颜色更鲜明），底层点较大且不透明度高一些
- 每个type仅高亮命中最多的1个样本（黑色细边），标签只显示 case_id
- 导出 PDF/SVG/600dpi PNG
"""

import os, json, csv, re, random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ====== 路径与字段 ======
MERGED_JSON = "case_meta_with_hits.json"                 # 优先读取
COUNTS_CSV  = "case_usage_all.csv"                       # 备选合并来源
EMB_JSON    = "casebank_A_train_80_with_embeddings.json" # 备选合并来源
TRAIN_JSON  = "train.json"                               # 备选合并来源
TEXT_FIELD  = "posts_cleaned"

# ====== t-SNE 参数（决定“天女散花”效果）======
SEED = 0
TSNE_PERPLEXITY = 30
TSNE_EE         = 12          # early_exaggeration
TSNE_LR         = 200         # learning_rate
TSNE_ITER       = 1500
PCA_DIM         = 50          # 先PCA再t-SNE更稳定

# ====== 绘图样式（更鲜明） ======
# 图例顺序（仅用于图例，不影响数据）
MBTI_16 = [
    "ENFJ","ENFP","ENTJ","ENTP","ESFJ","ESFP","ESTJ","ESTP",
    "INFJ","INFP","INTJ","INTP","ISFJ","ISFP","ISTJ","ISTP"
]
# 底层点（所有样本，按type着色）
BASE_SIZE  = 16
BASE_ALPHA = 0.85
BASE_EDGE_LW = 0.2            # 白色细边让群簇边界更清晰
# 高亮点（各type命中Top-1）
HL_SIZE    = 140
HL_EDGE_LW = 1.0
LABEL_FMT  = "{cid}"          # 只显示 case_id
LABEL_FONTSIZE = 9
LABEL_WITH_BOX = True         # 给标签加白底

OUT_PDF = "types_tsne_starburst_idonly.pdf"
OUT_SVG = "types_tsne_starburst_idonly.svg"
OUT_PNG = "types_tsne_starburst_idonly.png"

# ====== 小工具 ======
WS = re.compile(r"\s+")
def key_prefix80_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())[:80]

def set_seeds(seed=0):
    random.seed(seed); np.random.seed(seed)

def vivid_palette_16():
    """
    使用更鲜明的调色：以 tab10 为主色，补充 tab20 中对比较强的颜色，共16个。
    """
    tab10 = list(plt.get_cmap("tab10").colors)  # 10个、鲜明
    tab20 = list(plt.get_cmap("tab20").colors)
    # 从 tab20 里挑 6 个对比度高的颜色补足 16
    extra_idx = [1, 3, 5, 7, 9, 11]             # 交替抽取饱和度更高的一组
    extras = [tab20[i] for i in extra_idx[:6]]
    cols = tab10 + extras
    return {t: cols[i] for i, t in enumerate(MBTI_16)}

def build_merged_from_sources():
    # 1) train：id->type/text/key
    with open(TRAIN_JSON, "r", encoding="utf-8") as f:
        train = json.load(f)
    id2type, id2text, id2key = {}, {}, {}
    for i, it in enumerate(train):
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        id2type[i] = it.get("type","")
        id2text[i] = t
        id2key[i]  = key_prefix80_space(t)

    # 2) embedding：key->emb
    with open(EMB_JSON, "r", encoding="utf-8") as f:
        emb_rows = json.load(f)
    key2emb = {}
    for it in emb_rows:
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        if isinstance(t, str) and t and "embedding" in it:
            key2emb[key_prefix80_space(t)] = it["embedding"]

    # 3) hits：cid->hits
    hits = {}
    with open(COUNTS_CSV, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            hits[int(row["case_id"])] = int(row["hits"])

    # 4) 合并
    merged, miss = [], 0
    for cid in range(len(train)):
        emb = key2emb.get(id2key.get(cid,""))
        if emb is None:
            miss += 1
            continue
        merged.append({
            "case_id": cid,
            "type": id2type.get(cid,""),
            "text": id2text.get(cid,""),
            "hits": hits.get(cid, 0),
            "embedding": emb
        })
    with open(MERGED_JSON, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"[merge] 构建 merged：{len(merged)} 条（缺失向量 {miss}）→ {MERGED_JSON}")
    return merged

def load_merged():
    if os.path.exists(MERGED_JSON):
        with open(MERGED_JSON, "r", encoding="utf-8") as f:
            merged = json.load(f)
        print(f"[load] {len(merged)} 条 ← {MERGED_JSON}")
        return merged
    print("[load] 未找到 merged，自动从三源合并…")
    return build_merged_from_sources()

# ====== 主流程 ======
def main():
    set_seeds(SEED)
    merged = load_merged()
    assert merged, "没有可用数据"

    X   = np.array([m["embedding"] for m in merged], dtype=np.float32)
    H   = np.array([m["hits"] for m in merged], dtype=np.int32)
    T   = np.array([m.get("type","") for m in merged])
    CID = np.array([m["case_id"] for m in merged])

    # 先 PCA 再 t-SNE（更稳定、更容易出“放射状”）
    X50 = PCA(n_components=min(PCA_DIM, max(2, X.shape[1]-1)), random_state=SEED).fit_transform(X)
    tsne = TSNE(
        n_components=2,
        perplexity=TSNE_PERPLEXITY,
        early_exaggeration=TSNE_EE,
        learning_rate=TSNE_LR,
        n_iter=TSNE_ITER,
        init="pca",
        random_state=SEED,
        angle=0.5,         # Barnes–Hut
        verbose=1,
    )
    Z = tsne.fit_transform(X50)

    # 每个type找“命中最多”的那个索引；若并列取第一个
    top_idx_per_type = {}
    for typ in np.unique(T):
        mask = (T == typ)
        if not mask.any(): 
            continue
        idxs = np.where(mask)[0]
        best = idxs[np.argmax(H[idxs])]
        top_idx_per_type[typ] = best

    # 颜色（更鲜明）
    pal = vivid_palette_16()

    # 绘图
    fig = plt.figure(figsize=(7.5, 5.5), dpi=300)
    ax = plt.gca()

    # 底层点：颜色更“实”、稍大、加白色细边
    for t in MBTI_16:
        mt = (T == t)
        if not mt.any(): 
            continue
        ax.scatter(Z[mt,0], Z[mt,1],
                   s=BASE_SIZE, alpha=BASE_ALPHA,
                   linewidths=BASE_EDGE_LW, edgecolors="white",
                   color=pal[t], label=t)

    # 高亮每个type的Top-1：黑边+更大
    for t in MBTI_16:
        if t not in top_idx_per_type: 
            continue
        i = top_idx_per_type[t]
        ax.scatter(Z[i,0], Z[i,1],
                   s=HL_SIZE, alpha=1.0,
                   linewidths=HL_EDGE_LW, edgecolors="black",
                   color=pal.get(t, "k"), zorder=5)
        lbl = LABEL_FMT.format(cid=CID[i])
        kw = dict(fontsize=LABEL_FONTSIZE, zorder=6)
        if LABEL_WITH_BOX:
            kw["bbox"] = dict(facecolor="white", alpha=0.9, lw=0, pad=0.5)
        ax.text(Z[i,0], Z[i,1], lbl, **kw)

    ax.set_xlabel("t-SNE-1")
    ax.set_ylabel("t-SNE-2")
    ax.set_title("FEM Personality Embedding Space (t-SNE)\nTop-1 by hits per Type (ID labels)", fontsize=12)

    # 图例（只显示type，不再显示 n）
    lg = ax.legend(frameon=False, bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0.)
    for txt in lg.get_texts(): txt.set_fontsize(9)

    # 去掉上/右边框
    ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.savefig(OUT_PDF,  bbox_inches="tight")
    plt.savefig(OUT_SVG,  bbox_inches="tight")
    plt.savefig(OUT_PNG,  dpi=600, bbox_inches="tight")
    print(f"✅ 保存：{OUT_PDF} / {OUT_SVG} / {OUT_PNG}")

if __name__ == "__main__":
    main()


[load] 27223 条 ← case_meta_with_hits.json




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 27223 samples in 0.001s...
[t-SNE] Computed neighbors for 27223 samples in 0.455s...
[t-SNE] Computed conditional probabilities for sample 1000 / 27223
[t-SNE] Computed conditional probabilities for sample 2000 / 27223
[t-SNE] Computed conditional probabilities for sample 3000 / 27223
[t-SNE] Computed conditional probabilities for sample 4000 / 27223
[t-SNE] Computed conditional probabilities for sample 5000 / 27223
[t-SNE] Computed conditional probabilities for sample 6000 / 27223
[t-SNE] Computed conditional probabilities for sample 7000 / 27223
[t-SNE] Computed conditional probabilities for sample 8000 / 27223
[t-SNE] Computed conditional probabilities for sample 9000 / 27223
[t-SNE] Computed conditional probabilities for sample 10000 / 27223
[t-SNE] Computed conditional probabilities for sample 11000 / 27223
[t-SNE] Computed conditional probabilities for sample 12000 / 27223
[t-SNE] Computed conditional probabilities for sam

In [5]:
# plot_types_tsne_starburst_top1_hits_only.py
# -*- coding: utf-8 -*-
"""
t-SNE（天女散花）可视化：
- 读取 case_meta_with_hits.json（若不存在则自动由 counts+embeddings+train 合并生成）
- 16个MBTI类型分别着色（颜色更鲜明），底层点较大且不透明度高一些
- 每个type仅高亮命中最多的1个样本（黑色细边），标签显示“命中次数 hits”
- 右侧增加脚注图示：说明 label = hits
- 导出 PDF/SVG/600dpi PNG
"""

import os, json, csv, re, random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ====== 路径与字段 ======
MERGED_JSON = "case_meta_with_hits.json"                 # 优先读取
COUNTS_CSV  = "case_usage_all.csv"                       # 备选合并来源
EMB_JSON    = "casebank_A_train_80_with_embeddings.json" # 备选合并来源
TRAIN_JSON  = "train.json"                               # 备选合并来源
TEXT_FIELD  = "posts_cleaned"

# ====== t-SNE 参数（决定“天女散花”效果）======
SEED = 0
TSNE_PERPLEXITY = 30
TSNE_EE         = 12
TSNE_LR         = 200
TSNE_ITER       = 1500
PCA_DIM         = 50

# ====== 绘图样式（更鲜明） ======
MBTI_16 = [
    "ENFJ","ENFP","ENTJ","ENTP","ESFJ","ESFP","ESTJ","ESTP",
    "INFJ","INFP","INTJ","INTP","ISFJ","ISFP","ISTJ","ISTP"
]
BASE_SIZE   = 16
BASE_ALPHA  = 0.85
BASE_EDGE_LW = 0.2            # 白色细边让群簇边界更清晰

HL_SIZE     = 140             # 高亮点大小
HL_EDGE_LW  = 1.0             # 高亮黑边
LABEL_FONTSIZE = 9
LABEL_WITH_BOX = True         # 标签白底

# 想避免出现“0”，把它设为 1；保持 0 则也会显示 0 命中（当该类都未命中时）
MIN_HITS_FOR_HIGHLIGHT = 0

OUT_PDF = "types_tsne_starburst_hitsonly.pdf"
OUT_SVG = "types_tsne_starburst_hitsonly.svg"
OUT_PNG = "types_tsne_starburst_hitsonly.png"

# ====== 小工具 ======
WS = re.compile(r"\s+")
def key_prefix80_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())[:80]

def set_seeds(seed=0):
    random.seed(seed); np.random.seed(seed)

def vivid_palette_16():
    # tab10 主色 + tab20 里对比度高的色，拼成 16 色
    tab10 = list(plt.get_cmap("tab10").colors)
    tab20 = list(plt.get_cmap("tab20").colors)
    extras = [tab20[i] for i in [1,3,5,7,9,11]]
    cols = tab10 + extras
    return {t: cols[i] for i, t in enumerate(MBTI_16)}

def build_merged_from_sources():
    with open(TRAIN_JSON, "r", encoding="utf-8") as f:
        train = json.load(f)
    id2type, id2text, id2key = {}, {}, {}
    for i, it in enumerate(train):
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        id2type[i] = it.get("type","")
        id2text[i] = t
        id2key[i]  = key_prefix80_space(t)

    with open(EMB_JSON, "r", encoding="utf-8") as f:
        emb_rows = json.load(f)
    key2emb = {}
    for it in emb_rows:
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        if isinstance(t, str) and t and "embedding" in it:
            key2emb[key_prefix80_space(t)] = it["embedding"]

    hits = {}
    with open(COUNTS_CSV, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            hits[int(row["case_id"])] = int(row["hits"])

    merged, miss = [], 0
    for cid in range(len(train)):
        emb = key2emb.get(id2key.get(cid,""))
        if emb is None:
            miss += 1
            continue
        merged.append({
            "case_id": cid,
            "type": id2type.get(cid,""),
            "text": id2text.get(cid,""),
            "hits": hits.get(cid, 0),
            "embedding": emb
        })
    with open(MERGED_JSON, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"[merge] 构建 merged：{len(merged)} 条（缺失向量 {miss}）→ {MERGED_JSON}")
    return merged

def load_merged():
    if os.path.exists(MERGED_JSON):
        with open(MERGED_JSON, "r", encoding="utf-8") as f:
            return json.load(f)
    print("[load] 未找到 merged，自动从三源合并…")
    return build_merged_from_sources()

# ====== 主流程 ======
def main():
    set_seeds(SEED)
    merged = load_merged()
    assert merged, "没有可用数据"

    X   = np.array([m["embedding"] for m in merged], dtype=np.float32)
    H   = np.array([m["hits"] for m in merged], dtype=np.int32)
    T   = np.array([m.get("type","") for m in merged])

    # PCA -> t-SNE
    X50 = PCA(n_components=min(PCA_DIM, max(2, X.shape[1]-1)), random_state=SEED).fit_transform(X)
    Z = TSNE(n_components=2, perplexity=TSNE_PERPLEXITY, early_exaggeration=TSNE_EE,
             learning_rate=TSNE_LR, n_iter=TSNE_ITER, init="pca",
             random_state=SEED, angle=0.5, verbose=1).fit_transform(X50)

    # 每个type找“命中最多”的那个；若并列取第一个；可选过滤 min hits
    top_idx_per_type = {}
    for typ in np.unique(T):
        mask = (T == typ)
        if not mask.any():
            continue
        idxs_all = np.where(mask)[0]
        idxs_pos = idxs_all[H[idxs_all] >= MIN_HITS_FOR_HIGHLIGHT]
        if idxs_pos.size == 0:
            # 该类没有达到阈值的命中样本，则不高亮
            continue
        best = idxs_pos[np.argmax(H[idxs_pos])]
        top_idx_per_type[typ] = best

    pal = vivid_palette_16()

    # 绘图
    fig = plt.figure(figsize=(7.5, 5.5), dpi=300)
    ax = plt.gca()

    # 背景点（所有样本）
    type_handles = []
    for t in MBTI_16:
        mt = (T == t)
        if not mt.any(): 
            continue
        sc = ax.scatter(Z[mt,0], Z[mt,1],
                        s=BASE_SIZE, alpha=BASE_ALPHA,
                        linewidths=BASE_EDGE_LW, edgecolors="white",
                        color=pal[t], label=t)
        type_handles.append(sc)

    # 高亮每个type的Top-1：标签=hits
    for t in MBTI_16:
        if t not in top_idx_per_type:
            continue
        i = top_idx_per_type[t]
        ax.scatter(Z[i,0], Z[i,1],
                   s=HL_SIZE, alpha=1.0,
                   linewidths=HL_EDGE_LW, edgecolors="black",
                   color=pal.get(t, "k"), zorder=5)
        lbl = f"{int(H[i])}"          # ← 只显示命中次数
        kw  = dict(fontsize=LABEL_FONTSIZE, zorder=6)
        if LABEL_WITH_BOX:
            kw["bbox"] = dict(facecolor="white", alpha=0.9, lw=0, pad=0.5)
        ax.text(Z[i,0], Z[i,1], lbl, **kw)

    ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
    ax.set_title("FEM Personality Embedding Space (t-SNE)\nTop-1 by hits per Type (label = hits)", fontsize=12)

    # 图例：类型颜色
    leg1 = ax.legend(frameon=False, bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0., title="Types")
    for txt in leg1.get_texts(): txt.set_fontsize(9)
    ax.add_artist(leg1)

    # 右侧脚注：显示规则（样本 / Top-1，label=hits）
    sample_handle = Line2D([0],[0], marker='o', color='w',
                           markerfacecolor='0.6', markeredgecolor='white',
                           markeredgewidth=BASE_EDGE_LW, markersize=6, label='sample')
    top1_handle   = Line2D([0],[0], marker='o', color='black',
                           markerfacecolor='tab:orange', markeredgewidth=HL_EDGE_LW,
                           markersize=8, label='Top-1 (label = hits)')
    leg2 = ax.legend(handles=[sample_handle, top1_handle],
                     frameon=False, bbox_to_anchor=(1.02, 0.0), loc="lower left",
                     title="Display guide")
    for txt in leg2.get_texts(): txt.set_fontsize(9)

    ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
    plt.tight_layout()
    plt.savefig(OUT_PDF, bbox_inches="tight")
    plt.savefig(OUT_SVG, bbox_inches="tight")
    plt.savefig(OUT_PNG, dpi=600, bbox_inches="tight")
    print(f"✅ 保存：{OUT_PDF} / {OUT_SVG} / {OUT_PNG}")

if __name__ == "__main__":
    main()




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 27223 samples in 0.001s...
[t-SNE] Computed neighbors for 27223 samples in 0.426s...
[t-SNE] Computed conditional probabilities for sample 1000 / 27223
[t-SNE] Computed conditional probabilities for sample 2000 / 27223
[t-SNE] Computed conditional probabilities for sample 3000 / 27223
[t-SNE] Computed conditional probabilities for sample 4000 / 27223
[t-SNE] Computed conditional probabilities for sample 5000 / 27223
[t-SNE] Computed conditional probabilities for sample 6000 / 27223
[t-SNE] Computed conditional probabilities for sample 7000 / 27223
[t-SNE] Computed conditional probabilities for sample 8000 / 27223
[t-SNE] Computed conditional probabilities for sample 9000 / 27223
[t-SNE] Computed conditional probabilities for sample 10000 / 27223
[t-SNE] Computed conditional probabilities for sample 11000 / 27223
[t-SNE] Computed conditional probabilities for sample 12000 / 27223
[t-SNE] Computed conditional probabilities for sam

In [7]:
# plot_types_tsne_starburst_topk_hits_ring.py
# -*- coding: utf-8 -*-
"""
t-SNE（天女散花）可视化：
- 读取 case_meta_with_hits.json（若无则由 counts+embeddings+train 自动合并生成）
- 16 个 MBTI 类型着色（鲜明）
- 每个 type 高亮命中最多的 TOPK 个样本：彩色实心点 + 黑色细边的小圆点；标签仅显示“命中次数”
- 右侧双图例：类型颜色 & 形状说明（Top-K，label=hits）
- 导出 PDF/SVG/600dpi PNG
"""

import os, json, csv, re, random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ========= 路径与字段 =========
MERGED_JSON = "case_meta_with_hits.json"                 # 优先读取
COUNTS_CSV  = "case_usage_all.csv"                       # 备选合并来源
EMB_JSON    = "casebank_A_train_80_with_embeddings.json" # 备选合并来源
TRAIN_JSON  = "train.json"                               # 备选合并来源
TEXT_FIELD  = "posts_cleaned"

# ========= t-SNE 超参（决定“天女散花”感）=========
SEED = 0
TSNE_PERPLEXITY = 30
TSNE_EE         = 12
TSNE_LR         = 200
TSNE_ITER       = 1500
PCA_DIM         = 50

# ========= 绘图样式（鲜明）=========
MBTI_16 = [
    "ENFJ","ENFP","ENTJ","ENTP","ESFJ","ESFP","ESTJ","ESTP",
    "INFJ","INFP","INTJ","INTP","ISFJ","ISFP","ISTJ","ISTP"
]
BASE_SIZE    = 16
BASE_ALPHA   = 0.85
BASE_EDGE_LW = 0.2          # 白色细边让簇边界更清晰

TOPK_PER_TYPE = 1         # ★ 每类高亮几个
HL_RING_SIZE  = 80          # 高亮点尺寸（小圆点）
HL_EDGE_LW    = 0.9         # 黑色细边
LABEL_FONTSZ  = 9           # 标签字号
LABEL_BOX     = True        # 标签白底
USE_ADJUSTTEXT = True       # 若环境有 adjustText，则自动避让；否则用小偏移

OUT_PDF = "types_tsne_starburst_topk_hits_ring.pdf"
OUT_SVG = "types_tsne_starburst_topk_hits_ring.svg"
OUT_PNG = "types_tsne_starburst_topk_hits_ring.png"

# ========= 工具 =========
WS = re.compile(r"\s+")
def key_prefix80_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())[:80]

def set_seeds(seed=0):
    random.seed(seed); np.random.seed(seed)

def vivid_palette_16():
    """更鲜明的 16 色：tab10 + tab20 中饱和色"""
    tab10 = list(plt.get_cmap("tab10").colors)          # 10
    tab20 = list(plt.get_cmap("tab20").colors)          # 20
    extras = [tab20[i] for i in [1,3,5,7,9,11]]          # 取 6 个对比度高的补足
    cols = tab10 + extras
    return {t: cols[i] for i, t in enumerate(MBTI_16)}

def build_merged_from_sources():
    # 1) train：id->type/text/key
    with open(TRAIN_JSON, "r", encoding="utf-8") as f:
        train = json.load(f)
    id2type, id2text, id2key = {}, {}, {}
    for i, it in enumerate(train):
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        id2type[i] = it.get("type","")
        id2text[i] = t
        id2key[i]  = key_prefix80_space(t)

    # 2) embedding：key->emb
    with open(EMB_JSON, "r", encoding="utf-8") as f:
        emb_rows = json.load(f)
    key2emb = {}
    for it in emb_rows:
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        if isinstance(t, str) and t and "embedding" in it:
            key2emb[key_prefix80_space(t)] = it["embedding"]

    # 3) hits：cid->hits
    hits = {}
    with open(COUNTS_CSV, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            hits[int(row["case_id"])] = int(row["hits"])

    # 4) 合并
    merged, miss = [], 0
    for cid in range(len(train)):
        emb = key2emb.get(id2key.get(cid,""))
        if emb is None:
            miss += 1
            continue
        merged.append({
            "case_id": cid,
            "type": id2type.get(cid,""),
            "text": id2text.get(cid,""),
            "hits": hits.get(cid, 0),
            "embedding": emb
        })
    with open(MERGED_JSON, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"[merge] 构建 merged：{len(merged)} 条（缺失向量 {miss}）→ {MERGED_JSON}")
    return merged

def load_merged():
    if os.path.exists(MERGED_JSON):
        with open(MERGED_JSON, "r", encoding="utf-8") as f:
            merged = json.load(f)
        print(f"[load] {len(merged)} 条 ← {MERGED_JSON}")
        return merged
    print("[load] 未找到 merged，自动从三源合并…")
    return build_merged_from_sources()

# ========= 主流程 =========
def main():
    set_seeds(SEED)
    merged = load_merged()
    assert merged, "没有可用数据"

    X = np.array([m["embedding"] for m in merged], dtype=np.float32)
    H = np.array([m["hits"] for m in merged], dtype=np.int32)
    T = np.array([m.get("type","") for m in merged])

    # PCA -> t-SNE（稳定出“放射/花朵”）
    X50 = PCA(n_components=min(PCA_DIM, max(2, X.shape[1]-1)), random_state=SEED).fit_transform(X)
    Z = TSNE(
        n_components=2, perplexity=TSNE_PERPLEXITY,
        early_exaggeration=TSNE_EE, learning_rate=TSNE_LR,
        n_iter=TSNE_ITER, init="pca", random_state=SEED, angle=0.5, verbose=1
    ).fit_transform(X50)

    # 每个 type 取命中最多的 TOPK
    top_idx_per_type = {}
    for typ in np.unique(T):
        mt = (T == typ)
        if not mt.any(): 
            continue
        idx = np.where(mt)[0]
        order = idx[np.argsort(-H[idx])]
        top_idx_per_type[typ] = order[:min(TOPK_PER_TYPE, len(order))]

    pal = vivid_palette_16()

    fig = plt.figure(figsize=(7.5, 5.5), dpi=300)
    ax = plt.gca()

    # 先画“底层”所有点
    legends_type = []
    for t in MBTI_16:
        mt = (T == t)
        if not mt.any(): continue
        ax.scatter(
            Z[mt,0], Z[mt,1],
            s=BASE_SIZE, alpha=BASE_ALPHA,
            linewidths=BASE_EDGE_LW, edgecolors="white",
            color=pal[t]
        )
        legends_type.append(Line2D([0],[0], marker='o', color='w',
                                   markerfacecolor=pal[t], markeredgecolor='white',
                                   markeredgewidth=BASE_EDGE_LW, markersize=7, label=t))

    # 高亮 Top-K：彩色实心点 + 黑色细边；标签=命中次数
    texts = []
    use_adjust = False
    for t in MBTI_16:
        if t not in top_idx_per_type: 
            continue
        for i in top_idx_per_type[t]:
            x, y = Z[i,0], Z[i,1]
            ax.scatter(x, y,
                       s=HL_RING_SIZE, alpha=1.0,
                       linewidths=HL_EDGE_LW, edgecolors="black",
                       color=pal.get(t, "k"), zorder=6)
            # 仅 hits
            txt = f"{int(H[i])}"
            if USE_ADJUSTTEXT:
                texts.append(ax.text(x, y, txt,
                                     fontsize=LABEL_FONTSZ, zorder=7,
                                     bbox=dict(facecolor="white", alpha=0.95, lw=0, pad=0.5) if LABEL_BOX else None))
            else:
                # 简单偏移防遮挡
                ax.annotate(txt, xy=(x, y), xycoords='data',
                            xytext=(10, 6), textcoords='offset points',
                            fontsize=LABEL_FONTSZ, zorder=7,
                            bbox=dict(facecolor="white", alpha=0.95, lw=0, pad=0.5) if LABEL_BOX else None)

    # 调整标签避免遮挡（若可用）
    if USE_ADJUSTTEXT and texts:
        try:
            from adjustText import adjust_text
            adjust_text(texts,
                        only_move={'points': 'y', 'text': 'xy'},
                        expand_points=(1.2, 1.2), expand_text=(1.2, 1.2))
            use_adjust = True
        except Exception:
            pass

    # ===== 图例：类型颜色 =====
    leg1 = ax.legend(handles=legends_type, frameon=False,
                     bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0., title="Types")
    for txt in leg1.get_texts(): txt.set_fontsize(9)
    ax.add_artist(leg1)

    # ===== 图例：形状说明 =====
    sample_handle = Line2D([0],[0], marker='o', color='w',
                           markerfacecolor='0.5', markeredgecolor='white',
                           markeredgewidth=BASE_EDGE_LW, markersize=6, label='sample')
    topk_handle = Line2D([0],[0], marker='o', color='black',
                         markerfacecolor='tab:orange', markeredgewidth=HL_EDGE_LW,
                         markersize=8, label=f'Top-{TOPK_PER_TYPE} (label = hits)')
    leg2 = ax.legend(handles=[sample_handle, topk_handle],
                     frameon=False, bbox_to_anchor=(1.02, 0.0), loc="lower left",
                     title="Display guide")
    for txt in leg2.get_texts(): txt.set_fontsize(9)

    ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
    ax.set_title(f"FEM Personality Embedding Space (t-SNE)\nTop-{TOPK_PER_TYPE} per Type (label shows hits)", fontsize=12)
    ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.savefig(OUT_PDF, bbox_inches="tight")
    plt.savefig(OUT_SVG, bbox_inches="tight")
    plt.savefig(OUT_PNG, dpi=600, bbox_inches="tight")
    print(f"✅ 保存：{OUT_PDF} / {OUT_SVG} / {OUT_PNG}   （标签自动避让：{'√' if use_adjust else '×'}）")

if __name__ == "__main__":
    main()


[load] 27223 条 ← case_meta_with_hits.json




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 27223 samples in 0.001s...
[t-SNE] Computed neighbors for 27223 samples in 0.407s...
[t-SNE] Computed conditional probabilities for sample 1000 / 27223
[t-SNE] Computed conditional probabilities for sample 2000 / 27223
[t-SNE] Computed conditional probabilities for sample 3000 / 27223
[t-SNE] Computed conditional probabilities for sample 4000 / 27223
[t-SNE] Computed conditional probabilities for sample 5000 / 27223
[t-SNE] Computed conditional probabilities for sample 6000 / 27223
[t-SNE] Computed conditional probabilities for sample 7000 / 27223
[t-SNE] Computed conditional probabilities for sample 8000 / 27223
[t-SNE] Computed conditional probabilities for sample 9000 / 27223
[t-SNE] Computed conditional probabilities for sample 10000 / 27223
[t-SNE] Computed conditional probabilities for sample 11000 / 27223
[t-SNE] Computed conditional probabilities for sample 12000 / 27223
[t-SNE] Computed conditional probabilities for sam

In [8]:
# plot_mbti_4dims_tsne_top3.py
# -*- coding: utf-8 -*-
"""
四维度(E/I, S/N, T/F, J/P)可视化：
- 读取 case_meta_with_hits.json（如无则由 counts+embeddings+train 自动合并）
- 统一用 PCA->t-SNE 得到 2D 坐标
- 2x2 子图：分别画 E/I、S/N、T/F、J/P
- 每个子图：两侧各高亮 Top-3（hits 降序），小圆点+黑边，标签显示命中次数
- 导出 PDF/SVG/600dpi PNG
"""

import os, json, csv, re, random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ---------- 路径 ----------
MERGED_JSON = "case_meta_with_hits.json"
COUNTS_CSV  = "case_usage_all.csv"
EMB_JSON    = "casebank_A_train_80_with_embeddings.json"
TRAIN_JSON  = "train.json"
TEXT_FIELD  = "posts_cleaned"

# ---------- t-SNE 参数 ----------
SEED = 0
PCA_DIM = 50
TSNE_PERPLEXITY = 30
TSNE_EE = 12
TSNE_LR = 200
TSNE_ITER = 1500

# ---------- 绘图样式 ----------
BASE_SIZE = 12
BASE_ALPHA = 0.25
BASE_EDGE_LW = 0.0

HL_SIZE = 80            # 高亮圆点大小
HL_EDGE_LW = 0.9        # 黑色细边
LABEL_FONTSZ = 9
LABEL_BOX = True

TOPK_PER_SIDE = 3       # 每侧 Top-K
MIN_HITS_FOR_HL = 1     # 只高亮 hits >= 1，避免 0

OUT_PDF = "mbti_dims_tsne_top3.pdf"
OUT_SVG = "mbti_dims_tsne_top3.svg"
OUT_PNG = "mbti_dims_tsne_top3.png"

# ---------- 工具 ----------
WS = re.compile(r"\s+")
def key_prefix80_space(s: str) -> str:
    return WS.sub(" ", (s or "").strip())[:80]

def set_seeds(seed=0):
    random.seed(seed); np.random.seed(seed)

def vivid_pair_colors():
    """给四个维度各配两种对比色（可自定义）。"""
    tab10 = list(plt.get_cmap("tab10").colors)
    pairs = {
        "EI": (tab10[0], tab10[1]),   # E, I
        "SN": (tab10[2], tab10[3]),   # S, N
        "TF": (tab10[4], tab10[5]),   # T, F
        "JP": (tab10[6], tab10[7]),   # J, P
    }
    return pairs

def build_merged_from_sources():
    # train
    with open(TRAIN_JSON, "r", encoding="utf-8") as f:
        train = json.load(f)
    id2type, id2text, id2key = {}, {}, {}
    for i, it in enumerate(train):
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        id2type[i] = it.get("type","")
        id2text[i] = t
        id2key[i]  = key_prefix80_space(t)
    # embeddings
    with open(EMB_JSON, "r", encoding="utf-8") as f:
        emb_rows = json.load(f)
    key2emb = {}
    for it in emb_rows:
        t = it.get(TEXT_FIELD, "") or it.get("posts","")
        if isinstance(t, str) and t and "embedding" in it:
            key2emb[key_prefix80_space(t)] = it["embedding"]
    # hits
    hits = {}
    with open(COUNTS_CSV, "r", encoding="utf-8") as f:
        r = csv.DictReader(f)
        for row in r:
            hits[int(row["case_id"])] = int(row["hits"])
    # merge
    out, miss = [], 0
    for cid in range(len(train)):
        emb = key2emb.get(id2key.get(cid,""))
        if emb is None:
            miss += 1; continue
        out.append({
            "case_id": cid,
            "type": id2type.get(cid,""),
            "text": id2text.get(cid,""),
            "hits": hits.get(cid, 0),
            "embedding": emb
        })
    with open(MERGED_JSON, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"[merge] merged={len(out)}, miss_emb={miss} → {MERGED_JSON}")
    return out

def load_merged():
    if os.path.exists(MERGED_JSON):
        with open(MERGED_JSON, "r", encoding="utf-8") as f:
            return json.load(f)
    print("[load] not found merged, building…")
    return build_merged_from_sources()

def type_to_4d(t: str):
    t = (t or "").upper()
    # 返回四个维度字符（E/I, S/N, T/F, J/P）
    return t[0], t[1], t[2], t[3]

# ---------- 主程序 ----------
def main():
    set_seeds(SEED)
    data = load_merged()
    assert data, "没有数据"

    X = np.array([d["embedding"] for d in data], dtype=np.float32)
    H = np.array([d["hits"]       for d in data], dtype=np.int32)
    T = np.array([d["type"]       for d in data])

    # PCA -> t-SNE（一次坐标，四图共用）
    X50 = PCA(n_components=min(PCA_DIM, max(2, X.shape[1]-1)), random_state=SEED).fit_transform(X)
    Z = TSNE(n_components=2, perplexity=TSNE_PERPLEXITY, early_exaggeration=TSNE_EE,
             learning_rate=TSNE_LR, n_iter=TSNE_ITER, init="pca",
             random_state=SEED, angle=0.5, verbose=1).fit_transform(X50)

    # 解析四维字母
    letters = np.array([type_to_4d(t) for t in T])  # shape (N,4)
    EI = letters[:,0]   # 'E' or 'I'
    SN = letters[:,1]   # 'S' or 'N'
    TF = letters[:,2]   # 'T' or 'F'
    JP = letters[:,3]   # 'J' or 'P'

    DIM_SPECS = [
        ("EI", ("E","I"), EI),
        ("SN", ("S","N"), SN),
        ("TF", ("T","F"), TF),
        ("JP", ("J","P"), JP),
    ]
    pairs = vivid_pair_colors()

    fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300)
    axes = axes.ravel()

    for ax, (dim_tag, (a_char, b_char), side_arr) in zip(axes, DIM_SPECS):
        cA, cB = pairs[dim_tag]
        # 背景：两侧淡色
        mA = (side_arr == a_char)
        mB = (side_arr == b_char)
        ax.scatter(Z[mA,0], Z[mA,1], s=BASE_SIZE, alpha=BASE_ALPHA,
                   linewidths=BASE_EDGE_LW, edgecolors="none", color=cA)
        ax.scatter(Z[mB,0], Z[mB,1], s=BASE_SIZE, alpha=BASE_ALPHA,
                   linewidths=BASE_EDGE_LW, edgecolors="none", color=cB)

        # 各侧 Top-3（仅 hits≥1）
        handles = []
        for side_char, color in [(a_char, cA), (b_char, cB)]:
            m = (side_arr == side_char)
            idx_all = np.where(m)[0]
            idx_pos = idx_all[H[idx_all] >= MIN_HITS_FOR_HL]
            if idx_pos.size == 0:
                continue
            order = idx_pos[np.argsort(-H[idx_pos])]
            top_idx = order[:min(TOPK_PER_SIDE, len(order))]
            # 高亮小圆点 + 黑色细边 + 命中数标签
            for i in top_idx:
                x, y = Z[i,0], Z[i,1]
                ax.scatter(x, y, s=HL_SIZE, color=color, alpha=1.0,
                           linewidths=HL_EDGE_LW, edgecolors="black", zorder=5)
                lbl = f"{int(H[i])}"
                kw = dict(fontsize=LABEL_FONTSZ, zorder=6)
                if LABEL_BOX:
                    kw["bbox"] = dict(facecolor="white", alpha=0.95, lw=0, pad=0.4)
                ax.text(x, y, lbl, **kw)

            # 图例把两侧加进去
            handles.append(Line2D([0],[0], marker='o', color='black',
                                  markerfacecolor=color, markeredgewidth=HL_EDGE_LW,
                                  markersize=7, label=f"{side_char} (Top-3, label=hits)"))

        ax.legend(handles=handles, frameon=False, loc="upper left", fontsize=8)
        ax.set_title(f"{dim_tag} 维度", fontsize=11)
        ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
        ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)

    plt.suptitle("FEM Personality Embedding Space (t-SNE)\nTop-3 per side for each MBTI dimension (label=hits)", fontsize=13)
    plt.tight_layout(rect=[0,0,1,0.95])
    plt.savefig(OUT_PDF, bbox_inches="tight")
    plt.savefig(OUT_SVG, bbox_inches="tight")
    plt.savefig(OUT_PNG, dpi=600, bbox_inches="tight")
    print(f"✅ 保存：{OUT_PDF} / {OUT_SVG} / {OUT_PNG}")

if __name__ == "__main__":
    main()




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 27223 samples in 0.001s...
[t-SNE] Computed neighbors for 27223 samples in 0.426s...
[t-SNE] Computed conditional probabilities for sample 1000 / 27223
[t-SNE] Computed conditional probabilities for sample 2000 / 27223
[t-SNE] Computed conditional probabilities for sample 3000 / 27223
[t-SNE] Computed conditional probabilities for sample 4000 / 27223
[t-SNE] Computed conditional probabilities for sample 5000 / 27223
[t-SNE] Computed conditional probabilities for sample 6000 / 27223
[t-SNE] Computed conditional probabilities for sample 7000 / 27223
[t-SNE] Computed conditional probabilities for sample 8000 / 27223
[t-SNE] Computed conditional probabilities for sample 9000 / 27223
[t-SNE] Computed conditional probabilities for sample 10000 / 27223
[t-SNE] Computed conditional probabilities for sample 11000 / 27223
[t-SNE] Computed conditional probabilities for sample 12000 / 27223
[t-SNE] Computed conditional probabilities for sam

  plt.tight_layout(rect=[0,0,1,0.95])
  plt.tight_layout(rect=[0,0,1,0.95])
  plt.savefig(OUT_PDF, bbox_inches="tight")
  plt.savefig(OUT_PDF, bbox_inches="tight")
  plt.savefig(OUT_PDF, bbox_inches="tight")
  plt.savefig(OUT_PDF, bbox_inches="tight")
  plt.savefig(OUT_SVG, bbox_inches="tight")
  plt.savefig(OUT_SVG, bbox_inches="tight")
  plt.savefig(OUT_PNG, dpi=600, bbox_inches="tight")
  plt.savefig(OUT_PNG, dpi=600, bbox_inches="tight")


✅ 保存：mbti_dims_tsne_top3.pdf / mbti_dims_tsne_top3.svg / mbti_dims_tsne_top3.png


In [11]:
import json
with open("train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(json.dumps(data[0], indent=2, ensure_ascii=False))
print(json.dumps(data[1], indent=2, ensure_ascii=False))


{
  "type": "ISFJ",
  "posts": "Hello everyone, and thank you so much for welcoming me here. It means a lot to find a place where I can share and learn alongside others who understand the quiet strength and careful kindness that often come with being an ISFJ. I’ve always found comfort in routines and in helping those around me, even when it means putting my own needs aside for a while. Sometimes it’s hard to speak up, especially when emotions run deep inside and I don’t want to burden anyone. But I’m learning that it’s okay to ask for support, too.  \n\nI’m currently studying veterinary medicine, which feels like a calling for me—it combines my love for caring with a practical path to make a difference. It’s a journey filled with late nights and lots of organization, but also moments of pure joy when I see the comfort a little animal can find through care and attention.  \n\nTo anyone else who sometimes feels overwhelmed by the weight of expectations or the chaos of the world, please k