In [1]:
# ========================
# Cell 1: Basic checks
# ========================
import spacy
from pathlib import Path
import os

# Print the installed spaCy version, install location, platform and the
# pipeline packages currently available in this environment.
spacy.info()

{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {'lv_xlmr_base_without_lemma': '1.0.0',
  'lv_spacy_without_lemma': '1.0.0',
  'lv_bert_without_lemma': '1.0.0'}}

In [2]:
# ========================
# Cell 2: Create the directory layout
# ========================
# Every folder the notebook writes to, listed in workflow order:
# converted corpus -> raw training output (one sub-folder per model)
# -> config files -> packaged models.
# mkdir(parents=True, exist_ok=True) makes this cell safe to re-run.
for folder in (
    "./corpus",
    "./models",
    "./models/spacy_lv",
    "./models/xlmr_large_lv",
    "./config",
    "./packages",
):
    Path(folder).mkdir(parents=True, exist_ok=True)

print("✅ 基础目录结构已创建完成")

✅ 基础目录结构已创建完成


In [3]:
# ========================
# Cell 3: Convert the CoNLL-U treebank files to spaCy's binary format
# ========================
# Each split (train / dev / test) is read from ud_latvian/ and written to
# ./corpus as a .spacy file. `-n 10` groups every 10 sentences into one
# document, so the pipeline sees multi-sentence docs.
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10


[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [4]:
# ========================
# Cell 4: 初始化 XLM-R Large (transformer) config
# ========================
!python -m spacy init config ./config/config_xlmr_large.cfg --lang lv --pipeline transformer,tagger,morphologizer,parser,senter --optimize efficiency --gpu


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: GPU
- Transformer: bert-base-multilingual-uncased
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_xlmr_large.cfg
You can now add your data and train your pipeline:
python -m spacy train config_xlmr_large.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
# ========================
# Cell 5: Patch the config: data paths + transformer settings
# ========================
CONFIG_PATH = Path("./config/config_xlmr_large.cfg")

# (text to find, replacement) pairs, applied in order.
CONFIG_EDITS = [
    # Point the training / dev corpora at the converted treebank files.
    ("train = null", "train = ./corpus/lv_lvtb-ud-train.spacy"),
    ("dev = null", "dev = ./corpus/lv_lvtb-ud-dev.spacy"),
    # Swap the template's default transformer for XLM-R Large.
    ("bert-base-multilingual-uncased", "xlm-roberta-large"),
    # Enable mixed-precision training.
    ("mixed_precision = false", "mixed_precision = true"),
]

cfg_text = CONFIG_PATH.read_text(encoding="utf-8")

for old, new in CONFIG_EDITS:
    if old in cfg_text:
        cfg_text = cfg_text.replace(old, new)
    elif new not in cfg_text:
        # Neither the template text nor the patched text is present, so the
        # edit cannot be applied. Fail here rather than print a success
        # message and train with an unintended setting.
        raise ValueError(
            f"Cannot patch {CONFIG_PATH}: expected to find {old!r} "
            f"(or the already-patched {new!r}) in the config."
        )
    # Otherwise the file already holds the patched value (cell re-run): skip.

CONFIG_PATH.write_text(cfg_text, encoding="utf-8")
print("✅ 已经修改 config_xlmr_large.cfg 的数据路径和 transformer 模型 (开启 mixed_precision)")


✅ 已经修改 config_xlmr_large.cfg 的数据路径和 transformer 模型 (开启 mixed_precision)


In [7]:
# ========================
# Cell 6: Train the XLM-R Large model
# ========================
# Trains from the patched config on GPU 0 and writes results under
# ./models/xlmr_large_lv (Cell 7 evaluates the model-best folder from there).
# The --paths.train / --paths.dev flags pass the corpus locations explicitly
# on the command line.
!python -m spacy train ./config/config_xlmr_large.cfg --output ./models/xlmr_large_lv --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: models/xlmr_large_lv[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
tokenizer_config.json: 100%|██████████████████| 25.0/25.0 [00:00<00:00, 300kB/s]
config.json: 100%|█████████████████████████████| 616/616 [00:00<00:00, 7.78MB/s]
sentencepiece.bpe.model: 100%|█████████████| 5.07M/5.07M [00:03<00:00, 1.30MB/s]
tokenizer.json: 100%|██████████████████████| 9.10M/9.10M [00:01<00:00, 5.64MB/s]
model.safetensors: 100%|███████████████████| 2.24G/2.24G [02:17<00:00, 16.3MB/s]
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  -------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0 

In [8]:
# ========================
# Cell 7: Evaluate the model
# ========================
# Scores the best checkpoint on the held-out test split (not the dev split
# used during training), running on GPU 0.
!python -m spacy evaluate ./models/xlmr_large_lv/model-best ./corpus/lv_lvtb-ud-test.spacy --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      93.27
POS      98.14
MORPH    96.38
UAS      92.42
LAS      89.20
SENT P   98.07
SENT R   97.79
SENT F   97.93
SPEED    13502

[1m

                P       R       F
ExtPos      88.97   87.05   88.00
Case        98.64   97.87   98.25
Gender      97.85   96.97   97.41
Number      98.49   97.78   98.13
Person      99.05   99.03   99.04
PronType    98.65   98.20   98.43
Evident     99.27   99.14   99.21
Mood        98.74   98.74   98.74
Polarity    99.44   99.47   99.46
Tense       97.99   98.03   98.01
VerbForm    99.22   99.25   99.24
Voice       99.25   99.32   99.29
Definite    97.96   98.47   98.22
Degree      98.73   98.92   98.82
Poss       100.00   99.45   99.73
NumType    100.00   76.91   86.95
Reflex      99.35   99.35   99.35
Aspect      98.59   98.90   98.75
Foreign     94.74   85.26   89.75
Typo         0.00    0.00    0.00
Abbr        96.79   82.65   89.16

[1m

                    P        R        F
mark          

In [9]:
# ========================
# Cell 8: Save the model without a lemmatizer
# ========================
import spacy

# XLM-R Large: load the best checkpoint and write a copy under the name the
# packaging cell reads from.
best_checkpoint_dir = "./models/xlmr_large_lv/model-best"
no_lemma_dir = "./models/xlmr_large_lv/model-no-lemma"

nlp_xlmr_large = spacy.load(best_checkpoint_dir)
nlp_xlmr_large.to_disk(no_lemma_dir)
print("✅ XLM-R Large 模型已保存到 ./models/xlmr_large_lv/model-no-lemma")


  from .autonotebook import tqdm as notebook_tqdm


✅ XLM-R Large 模型已保存到 ./models/xlmr_large_lv/model-no-lemma


In [10]:
# ========================
# Cell 8.5: Package the model
# ========================
from spacy.cli.package import package

# Put the project root first on PYTHONPATH.
# NOTE(review): presumably this is for subprocesses spawned while packaging;
# confirm it is still needed.
# os.pathsep keeps this portable (":" on POSIX, ";" on Windows). Dropping
# empty and duplicate entries avoids a trailing separator when PYTHONPATH is
# unset and stops the variable growing each time the cell is re-run.
project_root = Path(".").resolve()
inherited_entries = [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != str(project_root)
]
os.environ["PYTHONPATH"] = os.pathsep.join([str(project_root), *inherited_entries])

# Package XLM-R Large. The result lands in
# ./packages/lv_xlmr_large_without_lemma-1.0.0 (Cell 9 installs the sdist from
# its dist/ folder). force=True lets a re-run replace an existing package
# directory.
package(
    input_dir=Path("./models/xlmr_large_lv/model-no-lemma"),
    output_dir=Path("./packages"),
    name="xlmr_large_without_lemma",
    version="1.0.0",
    force=True
)
print("✅ 打包完成，可以在 packages 文件夹里找到")


/home/jesse/Projects/myenvs/spacy_lv/bin/python: No module named build


running sdist
running egg_info
creating lv_xlmr_large_without_lemma.egg-info
writing lv_xlmr_large_without_lemma.egg-info/PKG-INFO
writing dependency_links to lv_xlmr_large_without_lemma.egg-info/dependency_links.txt
writing entry points to lv_xlmr_large_without_lemma.egg-info/entry_points.txt
writing requirements to lv_xlmr_large_without_lemma.egg-info/requires.txt
writing top-level names to lv_xlmr_large_without_lemma.egg-info/top_level.txt
writing manifest file 'lv_xlmr_large_without_lemma.egg-info/SOURCES.txt'
reading manifest file 'lv_xlmr_large_without_lemma.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_xlmr_large_without_lemma.egg-info/SOURCES.txt'
running check
creating lv_xlmr_large_without_lemma-1.0.0
creating lv_xlmr_large_without_lemma-1.0.0/lv_xlmr_large_without_lemma
creating lv_xlmr_large_without_lemma-1.0.0/lv_xlmr_large_without_lemma.egg-info
creating lv_xlmr_large_without_lemma-1.0.0/lv_xlmr_large_without_lemma/lv_xlmr_large_w

In [11]:
# ========================
# Cell 9: Install and smoke-test the packaged model
# ========================
import subprocess
import sys
import spacy
import pandas as pd

# Install the XLM-R Large package.
# `sys.executable -m pip` installs into the interpreter this kernel runs on;
# a bare `pip` resolved from PATH may belong to a different environment.
# check=True raises CalledProcessError if pip fails, so a broken install
# surfaces here rather than as a confusing error from spacy.load below.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./packages/lv_xlmr_large_without_lemma-1.0.0/dist/lv_xlmr_large_without_lemma-1.0.0.tar.gz",
    ],
    check=True,
)

# Load by package name to confirm the install is importable, then list the
# pipeline components.
nlp_xlmr_large = spacy.load("lv_xlmr_large_without_lemma")
print("XLM-R Large Pipeline组件：", nlp_xlmr_large.pipe_names)

Processing ./packages/lv_xlmr_large_without_lemma-1.0.0/dist/lv_xlmr_large_without_lemma-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_xlmr_large_without_lemma
  Building wheel for lv_xlmr_large_without_lemma (setup.py): started


[33m  DEPRECATION: Building 'lv_xlmr_large_without_lemma' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_xlmr_large_without_lemma'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_xlmr_large_without_lemma (setup.py): still running...
  Building wheel for lv_xlmr_large_without_lemma (setup.py): finished with status 'done'
  Created wheel for lv_xlmr_large_without_lemma: filename=lv_xlmr_large_without_lemma-1.0.0-py3-none-any.whl size=1837770620 sha256=30d4de90d8c9d15fe532d2db453ffb437c5f04e6b0100f4ff82394e4aa4427a9
  Stored in directory: /home/jesse/.cache/pip/wheels/42/27/5e/d340d268b0de92e07470b2db27f79d5e2aebfa9983d45ae150
Successfully built lv_xlmr_large_without_lemma
Installing collected packages: lv_xlmr_large_without_lemma
Successfully installed lv_xlmr_large_without_lemma-1.0.0
XLM-R Large Pipeline组件： ['transformer', 'tagger', 'morphologizer', 'parser', 'senter']


In [12]:
# ========================
# Cell 10: Try the model: per-token table (no lemma column) + sentence split
# ========================
import spacy
import pandas as pd

nlp = spacy.load("lv_xlmr_large_without_lemma")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# Build the token table first: one row per token, visited sentence by sentence.
rows = [
    {
        "Text": tok.text,
        "POS": tok.pos_,
        "Dependency": tok.dep_,
        "Head": tok.head.text,
    }
    for sentence in doc.sents
    for tok in sentence
]

df = pd.DataFrame(rows)

# Lift pandas' row and column-width limits so the whole table is rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
display(df)  # show the token table

# Then list the detected sentences separately, numbered from 1.
print("\n✅ 分句结果：")
for number, sentence in enumerate(doc.sents, 1):
    print(f"Sentence {number}: {sentence.text}")

Unnamed: 0,Text,POS,Dependency,Head
0,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,AUX,cop,galvaspilsēta
2,Latvijas,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,CCONJ,cc,viens
5,viens,NUM,conj,galvaspilsēta
6,no,ADP,case,centriem
7,galvenajiem,ADJ,amod,centriem
8,rūpniecības,NOUN,nmod,centriem
9,",",PUNCT,punct,darījumu



✅ 分句结果：
Sentence 1: Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta.
Sentence 2: Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā.
Sentence 3: Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju.
Sentence 4: Pilsētas teritorijas platība ir 307,17 km2.
Sentence 5: Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
