In [1]:
# Cell 1:
import json
from pathlib import Path


In [2]:
# Cell 2: 
def extract_lemma_pairs(conllu_file):
    """从 .conllu 文件里提取 (form, lemma) 对"""
    pairs = []
    with open(conllu_file, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            cols = line.strip().split("\t")
            if len(cols) < 3:
                continue
            form, lemma = cols[1], cols[2]
            if lemma != "_":  # 确保有 lemma
                pairs.append((form, lemma))
    return pairs


In [3]:
# Cell 3: 
# 你的 UD 文件路径
ud_dir = Path("./ud_latvian")

files = [
    ud_dir / "lv_lvtb-ud-train.conllu",
    ud_dir / "lv_lvtb-ud-dev.conllu",
    ud_dir / "lv_lvtb-ud-test.conllu"
]

lemma_pairs = []
for f in files:
    lemma_pairs.extend(extract_lemma_pairs(f))

print(f"总共提取到 {len(lemma_pairs)} 个 (form, lemma) 对")


总共提取到 329882 个 (form, lemma) 对


In [4]:
# Cell 4:
# 转换为 dict: word -> lemma
lookup_dict = {}
for form, lemma in lemma_pairs:
    if form not in lookup_dict:
        lookup_dict[form] = lemma


In [5]:
# Cell 5:
with open("lookup.json", "w", encoding="utf-8") as f:
    json.dump({"lemma_lookup": lookup_dict}, f, ensure_ascii=False, indent=2)

print("✅ 已经生成 lookup.json")



✅ 已经生成 lookup.json


In [6]:
# Cell 6:
with open("lookup.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for i, (word, lemma) in enumerate(data["lemma_lookup"].items()):
    print(word, "->", lemma)
    if i > 20:  # 打印前 20 个
        break


Hēra -> Hēra
valda -> valdīt
augstajā -> augsts
Olimpā -> Olimps
. -> .
Viņa -> viņa
, -> ,
tāpat -> tāpat
kā -> kā
viņas -> viņa
vīrs -> vīrs
Zevs -> Zevs
ir -> būt
pavēlniece -> pavēlniece
pār -> pār
pērkoniem -> pērkons
un -> un
zibeņiem -> zibens
; -> ;
paklausot -> paklausīt
vārdam -> vārds
tumšo -> tumšs
