In [1]:
# ========================
# Cell 1: Basic environment check
# ========================
import spacy
from pathlib import Path
import os

# Show the spaCy version and the installed pipelines
spacy.info()

{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {'lv_spacy_without_lemma': '1.0.0',
  'lv_bert_without_lemma': '1.0.0'}}

In [2]:
# ========================
# Cell 2: Create the directory layout
# ========================
# Directories are created in the listed order; every call is idempotent
# (parents=True, exist_ok=True), so the cell can be re-run safely.
project_dirs = (
    "./corpus",                # corpus directory
    "./models",                # raw training output and its sub-directories
    "./models/spacy_lv",
    "./models/xlmr_base_lv",   # kept separate for the XLM-R run
    "./config",                # config files
    "./packages",              # packaged models
)
for dir_name in project_dirs:
    Path(dir_name).mkdir(parents=True, exist_ok=True)

print("✅ 基础目录结构已创建完成")

✅ 基础目录结构已创建完成


In [3]:
# ========================
# Cell 3: Convert the CoNLL-U files to spaCy's binary format
# ========================
# One command per split (train / dev / test); output goes to ./corpus.
# `-n 10` groups every 10 sentences into a single document.
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10

[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [4]:
# ========================
# Cell 4c: Initialise the XLM-R (transformer) config
# ========================
# NOTE: per the recorded output, the generated template names
# bert-base-multilingual-uncased as the transformer; Cell 5 rewrites that
# value to xlm-roberta-base.
!python -m spacy init config ./config/config_xlmr_base.cfg --lang lv --pipeline transformer,tagger,morphologizer,parser,senter --optimize efficiency --gpu


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: GPU
- Transformer: bert-base-multilingual-uncased
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_xlmr_base.cfg
You can now add your data and train your pipeline:
python -m spacy train config_xlmr_base.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# ========================
# Cell 5: Patch the config: data paths + transformer model + mixed precision
# ========================
cfg_path = Path("./config/config_xlmr_base.cfg")
cfg_text = cfg_path.read_text(encoding="utf-8")

# (text to find, replacement) pairs applied to the raw config text.
cfg_edits = [
    # Training / dev corpus paths
    ("train = null", "train = ./corpus/lv_lvtb-ud-train.spacy"),
    ("dev = null", "dev = ./corpus/lv_lvtb-ud-dev.spacy"),
    # Transformer model: switch to xlm-roberta-base
    ("bert-base-multilingual-uncased", "xlm-roberta-base"),
    # Enable mixed-precision training
    ("mixed_precision = false", "mixed_precision = true"),
]

# str.replace() silently does nothing when the pattern is absent, which would
# let training run with the wrong settings. Track every edit that neither
# applied now nor was already applied by an earlier run of this cell.
missing_edits = []
for old_value, new_value in cfg_edits:
    if old_value in cfg_text:
        cfg_text = cfg_text.replace(old_value, new_value)
    elif new_value not in cfg_text:
        missing_edits.append(old_value)

# Fail before writing so a partially patched config never reaches disk.
if missing_edits:
    raise ValueError(
        f"Could not patch {cfg_path}: expected setting(s) not found: {missing_edits}"
    )

cfg_path.write_text(cfg_text, encoding="utf-8")
print("✅ 已经修改 config_xlmr_base.cfg 的数据路径、transformer 模型，并启用混合精度")


✅ 已经修改 config_xlmr_base.cfg 的数据路径、transformer 模型，并启用混合精度


In [9]:
# ========================
# Cell 6b: Train the XLM-R model
# ========================
# Trains on GPU 0 and writes to ./models/xlmr_base_lv.
# The train/dev corpus paths are also passed explicitly on the command line.
!python -m spacy train ./config/config_xlmr_base.cfg --output ./models/xlmr_base_lv --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: models/xlmr_base_lv[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  -------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0        3945.17      1300.29        1299.83      2087.24       652.50     0.00     3.90       2.73     6.67     6.67     0.02     0.29     0.03    0.02
  1     200      451982.40    357207.15      356374.43    425616.31    176529.80    17.50    22.76      34.26    42.02    24.16     1.23     3.32     1.79    0.20
  2     400      222641.86    321050.97      293458.02    239305.30    139036.42    

In [10]:
# ========================
# Cell 7: Model evaluation
# ========================
# Scores the model-best checkpoint against the test split on GPU 0.
!python -m spacy evaluate ./models/xlmr_base_lv/model-best ./corpus/lv_lvtb-ud-test.spacy --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      92.46
POS      97.93
MORPH    95.89
UAS      91.06
LAS      87.59
SENT P   98.45
SENT R   98.21
SENT F   98.33
SPEED    19088

[1m

                P        R        F
ExtPos      91.24    89.93    90.58
Case        98.29    97.43    97.86
Gender      97.36    96.32    96.84
Number      98.32    97.46    97.89
Person      98.60    98.71    98.65
PronType    98.60    98.23    98.41
Evident     99.00    98.68    98.84
Mood        98.13    98.38    98.25
Polarity    99.13    99.25    99.19
Tense       97.40    97.25    97.33
VerbForm    98.73    98.85    98.79
Voice       98.91    99.12    99.01
Definite    97.95    97.95    97.95
Degree      98.62    98.56    98.59
Poss       100.00   100.00   100.00
NumType     99.23    77.36    86.94
Reflex      98.92    98.92    98.92
Aspect      98.42    98.04    98.23
Foreign     95.76    83.16    89.01
Typo         0.00     0.00     0.00
Abbr        97.31    82.65    89.38

[1m

          

In [11]:
# ========================
# Cell 8: Save the model without a lemmatizer
# ========================
import spacy

# XLM-R: load the best checkpoint and write it back out under a new
# directory name. No component is removed here; the trained pipeline has no
# lemmatizer component to begin with.
best_model_dir = "./models/xlmr_base_lv/model-best"
no_lemma_dir = "./models/xlmr_base_lv/model-no-lemma"

nlp_xlmr = spacy.load(best_model_dir)
nlp_xlmr.to_disk(no_lemma_dir)
print(f"✅ XLM-R 模型已保存到 {no_lemma_dir}")


  from .autonotebook import tqdm as notebook_tqdm


✅ XLM-R 模型已保存到 ./models/xlmr_base_lv/model-no-lemma


In [12]:
# ========================
# Cell 8.5: Package the model
# ========================
from spacy.cli.package import package

project_root = Path(".").resolve()

# Prepend the project root to PYTHONPATH.
# os.pathsep keeps this portable (":" on POSIX, ";" on Windows), and empty
# entries are dropped so an unset PYTHONPATH does not leave a trailing
# separator behind.
existing_pythonpath = os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] = os.pathsep.join(
    entry for entry in (str(project_root), existing_pythonpath) if entry
)

# Package the XLM-R pipeline into ./packages.
# NOTE(review): the recorded output shows "No module named build" followed by
# a setup.py sdist run; installing the `build` package presumably avoids that
# fallback -- confirm.
package(
    input_dir=Path("./models/xlmr_base_lv/model-no-lemma"),
    output_dir=Path("./packages"),
    name="xlmr_base_without_lemma",
    version="1.0.0",
    force=True,  # overwrite an existing package directory
)

print("✅ 打包完成，可以在 packages 文件夹里找到")

/home/jesse/Projects/myenvs/spacy_lv/bin/python: No module named build


running sdist
running egg_info
creating lv_xlmr_base_without_lemma.egg-info
writing lv_xlmr_base_without_lemma.egg-info/PKG-INFO
writing dependency_links to lv_xlmr_base_without_lemma.egg-info/dependency_links.txt
writing entry points to lv_xlmr_base_without_lemma.egg-info/entry_points.txt
writing requirements to lv_xlmr_base_without_lemma.egg-info/requires.txt
writing top-level names to lv_xlmr_base_without_lemma.egg-info/top_level.txt
writing manifest file 'lv_xlmr_base_without_lemma.egg-info/SOURCES.txt'
reading manifest file 'lv_xlmr_base_without_lemma.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_xlmr_base_without_lemma.egg-info/SOURCES.txt'
running check
creating lv_xlmr_base_without_lemma-1.0.0
creating lv_xlmr_base_without_lemma-1.0.0/lv_xlmr_base_without_lemma
creating lv_xlmr_base_without_lemma-1.0.0/lv_xlmr_base_without_lemma.egg-info
creating lv_xlmr_base_without_lemma-1.0.0/lv_xlmr_base_without_lemma/lv_xlmr_base_without_lemma-1.0.

In [13]:
# ========================
# Cell 9: Install the packaged model and smoke-test it
# ========================
import subprocess
import sys

import spacy
import pandas as pd

# Install the packaged XLM-R model.
# `sys.executable -m pip` targets the interpreter running this kernel, whereas
# a bare `pip` resolves through PATH and may belong to another environment.
# check=True raises CalledProcessError on a failed install instead of letting
# spacy.load() below fail later with a less obvious error.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./packages/lv_xlmr_base_without_lemma-1.0.0/dist/lv_xlmr_base_without_lemma-1.0.0.tar.gz",
    ],
    check=True,
)
nlp_xlmr = spacy.load("lv_xlmr_base_without_lemma")

print("XLM-R Pipeline组件：", nlp_xlmr.pipe_names)

Processing ./packages/lv_xlmr_base_without_lemma-1.0.0/dist/lv_xlmr_base_without_lemma-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_xlmr_base_without_lemma
  Building wheel for lv_xlmr_base_without_lemma (setup.py): started


[33m  DEPRECATION: Building 'lv_xlmr_base_without_lemma' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_xlmr_base_without_lemma'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_xlmr_base_without_lemma (setup.py): finished with status 'done'
  Created wheel for lv_xlmr_base_without_lemma: filename=lv_xlmr_base_without_lemma-1.0.0-py3-none-any.whl size=858169812 sha256=f3f190a6b43e0a4d3766a579c38a8677d10ec8839b6f3efc8f559aa68990003a
  Stored in directory: /home/jesse/.cache/pip/wheels/02/4b/59/5c26330bbc48926d830e8855d5915e09f9def49f6fba501d79
Successfully built lv_xlmr_base_without_lemma
Installing collected packages: lv_xlmr_base_without_lemma
Successfully installed lv_xlmr_base_without_lemma-1.0.0
XLM-R Pipeline组件： ['transformer', 'tagger', 'morphologizer', 'parser', 'senter']


In [14]:
# ========================
# Cell 10: Try the model: token table without lemmas, plus sentence split
# ========================
import spacy
import pandas as pd

nlp = spacy.load("lv_xlmr_base_without_lemma")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# Build the token table first: one record per token, visited sentence by
# sentence so the rows follow document order.
rows = [
    {
        "Text": tok.text,
        "POS": tok.pos_,
        "Dependency": tok.dep_,
        "Head": tok.head.text,
    }
    for sentence in doc.sents
    for tok in sentence
]

df = pd.DataFrame(rows)

# Lift the row and column-width limits so the whole table is rendered.
for option_name in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)
display(df)  # show the token table

# Then list the detected sentences separately, numbered from 1.
print("\n✅ 分句结果：")
for number, sentence in enumerate(doc.sents, start=1):
    print(f"Sentence {number}: {sentence.text}")

Unnamed: 0,Text,POS,Dependency,Head
0,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,AUX,cop,galvaspilsēta
2,Latvijas,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,CCONJ,cc,viens
5,viens,NUM,conj,galvaspilsēta
6,no,ADP,case,centriem
7,galvenajiem,ADJ,amod,centriem
8,rūpniecības,NOUN,nmod,centriem
9,",",PUNCT,punct,darījumu



✅ 分句结果：
Sentence 1: Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta.
Sentence 2: Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā.
Sentence 3: Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju.
Sentence 4: Pilsētas teritorijas platība ir 307,17 km2.
Sentence 5: Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
