In [15]:
# ========================
# Cell 1: Basic environment check
# ========================
import spacy
from pathlib import Path
import os

# Show the installed spaCy version, its install location, the platform /
# Python version, and the pipeline packages currently installed in this
# environment.
spacy.info()


{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {'lv_spacy_without_lemma': '1.0.0',
  'lv_bert_without_lemma': '1.0.0'}}

In [16]:
# ========================
# Cell 2: Create the directory layout
# ========================
# Every working directory the notebook uses, in creation order.
# Creation is idempotent (exist_ok=True), so re-running this cell is harmless.
for directory in (
    "./corpus",           # target directory of the `spacy convert` cell
    "./models",           # raw training results ...
    "./models/spacy_lv",  # ... and their per-model sub-directories
    "./models/bert_lv",
    "./config",           # config files
    "./packages",         # packaged models
):
    Path(directory).mkdir(parents=True, exist_ok=True)

print("✅ 基础目录结构已创建完成")

✅ 基础目录结构已创建完成


In [17]:
# ========================
# Cell 3: Convert CoNLL-U files to spaCy's binary format
# ========================
# Converts the train / dev / test splits found under ud_latvian/ and writes
# the resulting .spacy files into ./corpus.
# `-n 10` groups every 10 sentences into one document.
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10



[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [18]:
# ========================
# Cell 4b: Initialise the BERT (transformer) config
# ========================
# Generates ./config/config_bert.cfg for Latvian (`--lang lv`) with a
# transformer + tagger + morphologizer + parser + senter pipeline, optimised
# for efficiency and targeting GPU hardware. In the recorded run the template
# selected `bert-base-multilingual-uncased` as the transformer.
# NOTE(review): if config_bert.cfg already exists, this command may need
# `--force` to overwrite it when the notebook is re-run - confirm.
!python -m spacy init config ./config/config_bert.cfg --lang lv --pipeline transformer,tagger,morphologizer,parser,senter --optimize efficiency --gpu

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: GPU
- Transformer: bert-base-multilingual-uncased
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_bert.cfg
You can now add your data and train your pipeline:
python -m spacy train config_bert.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [19]:
# ========================
# Cell 5: Point the config at the corpus files
# ========================
config_file = Path("./config/config_bert.cfg")

# (text to find, text to put in its place) - applied in this order.
path_substitutions = (
    ("train = null", "train = ./corpus/lv_lvtb-ud-train.spacy"),
    ("dev = null", "dev = ./corpus/lv_lvtb-ud-dev.spacy"),
)

# Read the config, apply each substitution, then write it back in place.
cfg_text = config_file.read_text(encoding="utf-8")
for placeholder, replacement in path_substitutions:
    cfg_text = cfg_text.replace(placeholder, replacement)
config_file.write_text(cfg_text, encoding="utf-8")

print("✅ 已经修改 config_bert.cfg 的数据路径")


✅ 已经修改 config_bert.cfg 的数据路径


In [20]:
# ========================
# Cell 6b: Train the BERT model
# ========================
# Trains from ./config/config_bert.cfg on GPU 0 and saves the results to
# ./models/bert_lv. The --paths.train / --paths.dev overrides name the same
# corpus files that Cell 5 writes into the config.
!python -m spacy train ./config/config_bert.cfg --output ./models/bert_lv --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: models/bert_lv[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  -------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0        2650.49      1300.29        1299.83      1962.18       652.50     0.00     3.90       2.73     6.67     6.67     0.02     0.29     0.03    0.02
  1     200      388050.86    357593.28      357083.65    404859.50    176529.80    27.87    70.60      41.86    63.24    45.38    21.08    37.21    26.91    0.41
  2     400      180263.52    334162.86      315734.74    208746.28    139036.42    40.75

In [21]:
# ========================
# Cell 7: Evaluate the model
# ========================
# Scores the `model-best` checkpoint from ./models/bert_lv against the
# test split in ./corpus, running on GPU 0.
!python -m spacy evaluate ./models/bert_lv/model-best ./corpus/lv_lvtb-ud-test.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      89.77
POS      97.04
MORPH    93.43
UAS      87.97
LAS      83.41
SENT P   96.66
SENT R   95.53
SENT F   96.10
SPEED    14552

[1m

                P       R       F
ExtPos      92.54   89.21   90.84
Case        94.97   94.19   94.58
Gender      96.27   95.36   95.82
Number      96.95   96.17   96.56
Person      97.57   97.76   97.67
PronType    98.56   97.49   98.02
Evident     97.86   97.42   97.64
Mood        96.87   96.83   96.85
Polarity    98.68   98.51   98.59
Tense       94.71   94.26   94.49
VerbForm    97.66   97.49   97.58
Voice       98.06   97.86   97.96
Definite    92.85   92.95   92.90
Degree      97.43   97.62   97.52
Poss       100.00   99.45   99.73
NumType     99.62   77.81   87.37
Reflex      97.53   97.85   97.69
Aspect      97.38   96.39   96.88
Foreign     94.19   85.26   89.50
Typo         0.00    0.00    0.00
Abbr       100.00   78.08   87.69

[1m

                    P        R        F
mark          

In [22]:
# ========================
# Cell 8: Save the model without Lemma
# ========================
import spacy

# BERT: load the best checkpoint and write a copy of it to the directory
# that the packaging cell reads from.
best_model_dir = "./models/bert_lv/model-best"
no_lemma_dir = "./models/bert_lv/model-no-lemma"

nlp_bert = spacy.load(best_model_dir)
nlp_bert.to_disk(no_lemma_dir)
print("✅ BERT 模型已保存到 ./models/bert_lv/model-no-lemma")

  from .autonotebook import tqdm as notebook_tqdm


✅ BERT 模型已保存到 ./models/bert_lv/model-no-lemma


In [23]:
# ========================
# Cell 8.5: Package the model
# ========================
from spacy.cli.package import package

# Put the project root at the front of PYTHONPATH.
# - os.pathsep keeps this portable (":" on POSIX, ";" on Windows).
# - Empty entries are dropped, so an unset PYTHONPATH does not produce a
#   trailing separator.
# - Any earlier occurrence of the project root is removed first, so
#   re-running this cell does not keep growing the variable.
project_root = Path(".").resolve()
pythonpath_entries = [str(project_root)] + [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != str(project_root)
]
os.environ["PYTHONPATH"] = os.pathsep.join(pythonpath_entries)

# Package the BERT pipeline saved in model-no-lemma into ./packages.
# force=True is passed so that an earlier packaging run can be overwritten.
package(
    input_dir=Path("./models/bert_lv/model-no-lemma"),
    output_dir=Path("./packages"),
    name="bert_without_lemma",
    version="1.0.0",
    force=True
)

print("✅ 打包完成，可以在 packages 文件夹里找到")

/home/jesse/Projects/myenvs/spacy_lv/bin/python: No module named build


running sdist
running egg_info
creating lv_bert_without_lemma.egg-info
writing lv_bert_without_lemma.egg-info/PKG-INFO
writing dependency_links to lv_bert_without_lemma.egg-info/dependency_links.txt
writing entry points to lv_bert_without_lemma.egg-info/entry_points.txt
writing requirements to lv_bert_without_lemma.egg-info/requires.txt
writing top-level names to lv_bert_without_lemma.egg-info/top_level.txt
writing manifest file 'lv_bert_without_lemma.egg-info/SOURCES.txt'
reading manifest file 'lv_bert_without_lemma.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_bert_without_lemma.egg-info/SOURCES.txt'
running check
creating lv_bert_without_lemma-1.0.0
creating lv_bert_without_lemma-1.0.0/lv_bert_without_lemma
creating lv_bert_without_lemma-1.0.0/lv_bert_without_lemma.egg-info
creating lv_bert_without_lemma-1.0.0/lv_bert_without_lemma/lv_bert_without_lemma-1.0.0
creating lv_bert_without_lemma-1.0.0/lv_bert_without_lemma/lv_bert_without_lemma-1.

In [24]:
# ========================
# Cell 9: Install and test the packaged BERT model
# ========================
import subprocess
import sys

import spacy

# Install the BERT model package.
# - `sys.executable -m pip` installs into the interpreter this kernel is
#   running, rather than whichever `pip` happens to be first on PATH.
# - check=True raises CalledProcessError if the install fails, so the
#   spacy.load() below cannot silently pick up a stale, previously
#   installed copy of the package.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./packages/lv_bert_without_lemma-1.0.0/dist/lv_bert_without_lemma-1.0.0.tar.gz",
    ],
    check=True,
)
nlp_bert = spacy.load("lv_bert_without_lemma")

# Show which components the installed pipeline contains.
print("BERT Pipeline组件：", nlp_bert.pipe_names)

Processing ./packages/lv_bert_without_lemma-1.0.0/dist/lv_bert_without_lemma-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_bert_without_lemma
  Building wheel for lv_bert_without_lemma (setup.py): started


[33m  DEPRECATION: Building 'lv_bert_without_lemma' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_bert_without_lemma'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_bert_without_lemma (setup.py): finished with status 'done'
  Created wheel for lv_bert_without_lemma: filename=lv_bert_without_lemma-1.0.0-py3-none-any.whl size=632584490 sha256=79bc935719125226f164dcdd655017b79a34f0fd7df47db87ce0bae941649da4
  Stored in directory: /home/jesse/.cache/pip/wheels/75/ef/9c/f29fc872acb8e88f6868d8331ffb794ac82cb17eae0a643a9c
Successfully built lv_bert_without_lemma
Installing collected packages: lv_bert_without_lemma
  Attempting uninstall: lv_bert_without_lemma
    Found existing installation: lv_bert_without_lemma 1.0.0
    Uninstalling lv_bert_without_lemma-1.0.0:
      Successfully uninstalled lv_bert_without_lemma-1.0.0
Successfully installed lv_bert_without_lemma-1.0.0
BERT Pipeline组件： ['transformer', 'tagger', 'morphologizer', 'parser', 'senter']


In [25]:
# ========================
# Cell 10: Try out the model - no Lemma column, plus sentence segmentation
# ========================
import spacy
import pandas as pd

nlp = spacy.load("lv_bert_without_lemma")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# Token table first: one record per token, visited sentence by sentence.
rows = [
    {
        "Text": token.text,
        "POS": token.pos_,
        "Dependency": token.dep_,
        "Head": token.head.text,
    }
    for sentence in doc.sents
    for token in sentence
]
df = pd.DataFrame(rows)

# Lift the row-count and column-width display limits before showing the table.
for option_name in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)
display(df)  # show the token table

# Then list the detected sentences separately, numbered from 1.
print("\n✅ 分句结果：")
for number, sentence in enumerate(doc.sents, start=1):
    print(f"Sentence {number}: {sentence.text}")



Unnamed: 0,Text,POS,Dependency,Head
0,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,AUX,cop,galvaspilsēta
2,Latvijas,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,CCONJ,cc,viens
5,viens,NUM,conj,galvaspilsēta
6,no,ADP,case,centriem
7,galvenajiem,ADJ,amod,centriem
8,rūpniecības,NOUN,nmod,centriem
9,",",PUNCT,punct,darījumu



✅ 分句结果：
Sentence 1: Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta.
Sentence 2: Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā.
Sentence 3: Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju.
Sentence 4: Pilsētas teritorijas platība ir 307,17 km2.
Sentence 5: Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
