In [15]:
#!pip install rapidfuzz
from rapidfuzz.distance import Levenshtein
import re
from pathlib import Path
import pandas as pd

In [None]:

# ===== Your file paths =====
# CSV passed to process_one_csv for the WAT run.
wat_path = "###/annotated data.csv"
# One CSV per LLM, keyed by a display name for the model.
# NOTE(review): the "Gemini" entry starts with "/" while the other entries do
# not -- confirm this is intentional and not a path typo.
llm_paths = {
    "Qwen": "###/cleaned1_qwen.csv",
    "Aya-23": "###/annotation/aya8b/aya23_clean.csv",
    "DeepSeek": "###/deepseek_clean2_output.csv",
    "GPT-4o mini": "###/rephrased_preview_gpt4o-mini.csv",
    "LLaMA3": "###/llama3_processed_output.csv",
    "Gemini": "/###/Gemini.cleaned.csv"
}

In [17]:
# ===== Levenshtein (prefer rapidfuzz) =====
try:
    from rapidfuzz.distance import Levenshtein as RFLev

    def lev_distance(a: str, b: str) -> int:
        """Return the Levenshtein edit distance between ``a`` and ``b`` (rapidfuzz)."""
        return RFLev.distance(a, b)

except Exception:
    # Pure-Python dynamic-programming fallback (slow, but usable).
    def lev_distance(a: str, b: str) -> int:
        """Return the Levenshtein edit distance between ``a`` and ``b``.

        Row-by-row DP that keeps only the previous row, so memory is
        proportional to ``len(b)``.
        """
        # An empty side means the distance is simply the other side's length.
        if len(a) == 0:
            return len(b)
        if len(b) == 0:
            return len(a)

        # Row 0: turning the empty prefix of ``a`` into each prefix of ``b``.
        previous_row = list(range(len(b) + 1))
        for row_no, ch_a in enumerate(a, start=1):
            # Column 0: deleting the first ``row_no`` characters of ``a``.
            current_row = [row_no]
            for col_no, ch_b in enumerate(b, start=1):
                substitution = 0 if ch_a == ch_b else 1
                current_row.append(
                    min(
                        previous_row[col_no] + 1,                 # deletion
                        current_row[col_no - 1] + 1,              # insertion
                        previous_row[col_no - 1] + substitution,  # substitution
                    )
                )
            previous_row = current_row
        return previous_row[len(b)]

In [18]:
def lev_norm(a: str, b: str) -> float:
    """Return the Levenshtein distance scaled by the longer input's length.

    ``None`` is treated as the empty string and any other value is converted
    with ``str``. Two empty inputs give ``0.0``; otherwise the result is
    ``lev_distance(a, b) / max(len(a), len(b))``.
    """
    left, right = ("" if value is None else str(value) for value in (a, b))
    longest = max(len(left), len(right))
    if longest == 0:
        # Both sides are empty: identical by definition, and avoids dividing by zero.
        return 0.0
    return lev_distance(left, right) / longest

In [19]:
def process_one_csv(csv_path: str):
    """Write per-row normalized Levenshtein scores for one CSV file.

    The first column of the CSV is the reference text and every remaining
    column is a candidate text. Each candidate value is compared with the
    reference value of the same row via ``lev_norm``; a row where either value
    is missing gets NaN. The scores (candidate columns only, no index) are
    written next to the input as ``<stem>_levenshtein.csv``.

    Args:
        csv_path: Path of the input CSV file.

    Returns:
        None. If the file cannot be read, or has fewer than two columns, a
        skip message that includes the reason is printed and nothing is
        written.
    """
    p = Path(csv_path)

    # Try several encodings so that one badly encoded file does not abort the
    # run. ``None`` lets pandas pick its default. latin1 accepts any byte
    # sequence, so it is a last resort and may yield mis-decoded text.
    df = None
    last_error = None
    for enc in [None, "utf-8", "utf-8-sig", "latin1"]:
        try:
            df = pd.read_csv(p, encoding=enc)
            break
        except Exception as exc:
            # Deliberate best-effort: keep the cause for the skip message
            # instead of discarding it, then try the next encoding.
            last_error = exc

    if df is None or df.shape[1] < 2:
        # Say *why* the file was skipped (read error vs. too few columns).
        reason = repr(last_error) if df is None else f"columns={df.shape[1]}"
        print(f"[跳过] {p.name}: 读取失败或列数不足。({reason})")
        return

    original_col = df.columns[0]
    tool_cols = list(df.columns[1:])
    reference = df[original_col]

    # Score matrix: same rows as the input, one column per candidate column.
    # Missing pairs are stored as NaN (not None) so every column stays float.
    scores = {
        col: [
            float("nan") if pd.isna(a) or pd.isna(b) else lev_norm(str(a), str(b))
            for a, b in zip(reference, df[col])
        ]
        for col in tool_cols
    }
    result = pd.DataFrame(scores, index=df.index, columns=tool_cols, dtype=float)

    out_path = p.with_name(f"{p.stem}_levenshtein.csv")
    # Saved as plain numbers; NaN cells are written as empty fields.
    result.to_csv(out_path, index=False)
    print(f"✅ {p.name} → {out_path.name}")

In [20]:
# ===== Run the WAT file =====
# Writes "<stem>_levenshtein.csv" next to the input CSV.
process_one_csv(wat_path)

✅ annotated data.csv → annotated data_levenshtein.csv


In [21]:
# ===== Run all LLM files =====
# Only the path is passed on; the dict key (`name`) is not used in the loop body.
for name, path in llm_paths.items():
    process_one_csv(path)


✅ cleaned1_qwen.csv → cleaned1_qwen_levenshtein.csv
✅ aya23_clean.csv → aya23_clean_levenshtein.csv
✅ deepseek_clean2_output.csv → deepseek_clean2_output_levenshtein.csv
✅ rephrased_preview_gpt4o-mini.csv → rephrased_preview_gpt4o-mini_levenshtein.csv
✅ llama3_processed_output.csv → llama3_processed_output_levenshtein.csv
✅ Gemini.cleaned.csv → Gemini.cleaned_levenshtein.csv
