In [1]:
import cupy

# show_config() writes the runtime report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
cupy.show_config()


OS                           : Linux-6.14.0-29-generic-x86_64-with-glibc2.39
Python Version               : 3.12.3
CuPy Version                 : 12.3.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 1.26.4
SciPy Version                : None
Cython Build Version         : 0.29.36
Cython Runtime Version       : None
CUDA Root                    : /usr/local/cuda
nvcc PATH                    : /usr/local/cuda/bin/nvcc
CUDA Build Version           : 12020
CUDA Driver Version          : 12090
CUDA Runtime Version         : 12090
cuBLAS Version               : (available)
cuFFT Version                : 11401
cuRAND Version               : 10310
cuSOLVER Version             : (11, 7, 5)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 9)
Thrust Version               : 200101
CUB Build Version            : 200101
Jitify Build Version         : <unknown>
cuDNN Build Version          : (not loaded; try `import cupy.cuda.cudnn` first)
cuD

In [2]:
# ========================
# Cell 1: Basic environment check
# ========================
import spacy
from pathlib import Path
import os

# Inspect the spaCy version and the available components
spacy.info()


{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {'lv_spacy_without_lemma': '1.0.0'}}

In [3]:
# ========================
# Cell 2: Create the corpus, models, packages and config folders
# ========================
# Directories are created in the order listed; existing ones are left untouched.
required_dirs = (
    "./corpus",
    # raw training results and their sub-directories
    "./models",
    "./models/spacy_lv",
    "./models/bert_lv",
    # packaged models
    "./packages",
    # config files
    "./config",
)
for dir_name in required_dirs:
    Path(dir_name).mkdir(parents=True, exist_ok=True)

print("✅ 基础目录结构已创建完成")


✅ 基础目录结构已创建完成


In [4]:
# ========================
# Cell 3: 转换 conllu 到 spaCy 格式
# ========================
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10


[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [5]:
# ========================
# Cell 4: 初始化 config
# ========================
!python -m spacy init config ./config/config_spacy.cfg --lang lv --pipeline tok2vec,tagger,morphologizer,parser,senter --optimize efficiency


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_spacy.cfg
You can now add your data and train your pipeline:
python -m spacy train config_spacy.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
# ========================
# Cell 5: Point the config at the converted corpora
# ========================
config_path = Path("./config/config_spacy.cfg")
corpus_paths = {
    "train": "./corpus/lv_lvtb-ud-train.spacy",
    "dev": "./corpus/lv_lvtb-ud-dev.spacy",
}

cfg_text = config_path.read_text(encoding="utf-8")
for key, corpus_file in corpus_paths.items():
    patched_line = f"{key} = {corpus_file}"
    # Fill in the placeholder left by `spacy init config`.
    cfg_text = cfg_text.replace(f"{key} = null", patched_line)
    # str.replace() is a silent no-op when the placeholder is missing, so verify
    # the result instead of reporting success unconditionally. A config that was
    # already patched passes this check, which keeps the cell re-runnable.
    if patched_line not in cfg_text:
        raise ValueError(
            f"{config_path}: could not set '{key}' - neither '{key} = null' "
            f"nor '{patched_line}' was found"
        )

config_path.write_text(cfg_text, encoding="utf-8")
print("✅ 已经修改 config_spacy.cfg 的数据路径")


✅ 已经修改 config_spacy.cfg 的数据路径


In [7]:
# ========================
# Cell 6: Train on the GPU (drop --gpu-id 0 if no GPU is available)
# ========================
!python -m spacy train ./config/config_spacy.cfg --output ./models/spacy_lv --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: models/spacy_lv[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  ------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0          0.00       202.94         202.86       447.71       112.50    18.53    31.64      21.10     8.67     6.82     0.01     0.19     0.02    0.13
  0     200       3305.41     23149.60       21819.37     33044.39      2844.51    51.88    77.96      58.21    49.07    32.02    42.21    59.04    49.23    0.52
  0     400       6583.46     16279.65       14510.06     29139.23       258.41    65.10    85

In [8]:
# ========================
# Cell 7: Evaluate the model
# ========================
# Scores ./models/spacy_lv/model-best against the converted test corpus.
!python -m spacy evaluate ./models/spacy_lv/model-best ./corpus/lv_lvtb-ud-test.spacy


[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      99.53
TAG      86.65
POS      94.77
MORPH    90.55
UAS      80.83
LAS      75.28
SENT P   96.63
SENT R   95.87
SENT F   96.25
SPEED    14141

[1m

                P       R       F
ExtPos      92.00   82.73   87.12
Case        91.86   91.38   91.62
Gender      93.33   92.72   93.02
Number      93.92   92.99   93.45
Person      94.71   93.79   94.24
PronType    98.33   97.74   98.04
Evident     93.76   93.00   93.38
Mood        93.10   92.03   92.56
Polarity    93.66   92.79   93.22
Tense       90.73   89.35   90.03
VerbForm    92.58   91.72   92.15
Voice       93.53   91.89   92.70
Definite    90.17   87.94   89.04
Degree      92.85   90.79   91.81
Poss       100.00   98.91   99.45
NumType     99.22   76.76   86.56
Reflex      92.70   92.80   92.75
Aspect      88.96   86.66   87.79
Foreign     78.99   49.47   60.84
Typo         0.00    0.00    0.00
Abbr        94.02   79.00   85.86



In [9]:
# ========================
# Cell 8: Save the model as-is, without adding the Stanza lemmatizer
# ========================
import spacy

best_model_dir = "./models/spacy_lv/model-best"
no_lemma_dir = "./models/spacy_lv/model-no-lemma"

# Load the trained pipeline
nlp = spacy.load(best_model_dir)

# Write a copy to a separate directory (optional step)
nlp.to_disk(no_lemma_dir)
print("✅ 模型已保存到 ./models/spacy_lv/model-no-lemma")



  from .autonotebook import tqdm as notebook_tqdm


✅ 模型已保存到 ./models/spacy_lv/model-no-lemma


In [10]:
# ========================
# Cell 8.5: Package the model from the notebook (relative paths)
# ========================
from spacy.cli.package import package
import os
from pathlib import Path

# Make sure the output folder exists
output_dir = Path("./packages")
output_dir.mkdir(parents=True, exist_ok=True)

# Project root = current directory; switch to Path("..").resolve() if the
# notebook lives in a sub-directory of the project.
project_root = Path(".").resolve()

# Prepend the project root to PYTHONPATH. os.pathsep keeps this portable
# (":" on POSIX, ";" on Windows). Empty and duplicate entries are skipped so
# that an unset variable does not leave a trailing separator and re-running
# the cell does not grow the value.
search_path = [str(project_root)]
for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep):
    if entry and entry not in search_path:
        search_path.append(entry)
os.environ["PYTHONPATH"] = os.pathsep.join(search_path)

input_dir = Path("./models/spacy_lv/model-no-lemma")

# force=True overwrites an existing package directory from a previous run.
package(
    input_dir=input_dir,
    output_dir=output_dir,
    name="spacy_without_lemma",
    version="1.0.0",
    force=True
)


print("✅ 打包完成，可以在 packages 文件夹里找到")


/home/jesse/Projects/myenvs/spacy_lv/bin/python: No module named build


running sdist
running egg_info
creating lv_spacy_without_lemma.egg-info
writing lv_spacy_without_lemma.egg-info/PKG-INFO
writing dependency_links to lv_spacy_without_lemma.egg-info/dependency_links.txt
writing entry points to lv_spacy_without_lemma.egg-info/entry_points.txt
writing top-level names to lv_spacy_without_lemma.egg-info/top_level.txt
writing manifest file 'lv_spacy_without_lemma.egg-info/SOURCES.txt'
reading manifest file 'lv_spacy_without_lemma.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_spacy_without_lemma.egg-info/SOURCES.txt'
running check
creating lv_spacy_without_lemma-1.0.0
creating lv_spacy_without_lemma-1.0.0/lv_spacy_without_lemma
creating lv_spacy_without_lemma-1.0.0/lv_spacy_without_lemma.egg-info
creating lv_spacy_without_lemma-1.0.0/lv_spacy_without_lemma/lv_spacy_without_lemma-1.0.0
creating lv_spacy_without_lemma-1.0.0/lv_spacy_without_lemma/lv_spacy_without_lemma-1.0.0/morphologizer
creating lv_spacy_without_lemma

In [11]:
# ========================
# Cell 9: Install and test the packaged Latvian model
# ========================
import subprocess
import sys
import spacy

package_path = "./packages/lv_spacy_without_lemma-1.0.0/dist/lv_spacy_without_lemma-1.0.0.tar.gz"

# Install the model with the kernel's own interpreter (`python -m pip`) so it
# lands in the environment this notebook runs in, not whichever `pip` is first
# on PATH. check=True raises CalledProcessError on a failed install, so a
# failure is not masked by a stale, previously installed package.
subprocess.run([sys.executable, "-m", "pip", "install", package_path], check=True)

# Load the installed model
nlp = spacy.load("lv_spacy_without_lemma")

# Inspect the pipeline components
print("Pipeline组件：", nlp.pipe_names)
print("组件数量：", len(nlp.pipe_names))


Processing ./packages/lv_spacy_without_lemma-1.0.0/dist/lv_spacy_without_lemma-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_spacy_without_lemma
  Building wheel for lv_spacy_without_lemma (setup.py): started


[33m  DEPRECATION: Building 'lv_spacy_without_lemma' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_spacy_without_lemma'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_spacy_without_lemma (setup.py): finished with status 'done'
  Created wheel for lv_spacy_without_lemma: filename=lv_spacy_without_lemma-1.0.0-py3-none-any.whl size=8780808 sha256=41591a74fdbc9626eecba839313117b338a0a32ef5b63112ac058e2be479e5d6
  Stored in directory: /home/jesse/.cache/pip/wheels/3e/cc/0f/7173eab8f23f75c757630bd50b75a461e61b787b325abb8ae5
Successfully built lv_spacy_without_lemma
Installing collected packages: lv_spacy_without_lemma
  Attempting uninstall: lv_spacy_without_lemma
    Found existing installation: lv_spacy_without_lemma 1.0.0
    Uninstalling lv_spacy_without_lemma-1.0.0:
      Successfully uninstalled lv_spacy_without_lemma-1.0.0
Successfully installed lv_spacy_without_lemma-1.0.0
Pipeline组件： ['tok2vec', 'tagger', 'morphologizer', 'parser', 'senter']
组件数量： 5


In [12]:
# ========================
# Cell 10: Try the model: token table without lemmas, plus sentence splits
# ========================
import spacy
import pandas as pd

nlp = spacy.load("lv_spacy_without_lemma")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# Build the token table first: one record per token, visited sentence by sentence
rows = [
    {
        "Text": token.text,
        "POS": token.pos_,
        "Dependency": token.dep_,
        "Head": token.head.text,
    }
    for sentence in doc.sents
    for token in sentence
]
df = pd.DataFrame(rows)

# Lift the row and column-width limits so the table is shown untruncated
for option_name in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)
display(df)  # show the token table

# Then list the sentence segmentation separately
print("\n✅ 分句结果：")
for sent_no, sentence in enumerate(doc.sents, 1):
    print(f"Sentence {sent_no}: {sentence.text}")



Unnamed: 0,Text,POS,Dependency,Head
0,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,AUX,cop,galvaspilsēta
2,Latvijas,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,CCONJ,cc,viens
5,viens,NUM,conj,galvaspilsēta
6,no,ADP,case,centriem
7,galvenajiem,ADJ,amod,centriem
8,rūpniecības,NOUN,nmod,centriem
9,",",PUNCT,punct,darījumu



✅ 分句结果：
Sentence 1: Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta.
Sentence 2: Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā.
Sentence 3: Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju.
Sentence 4: Pilsētas teritorijas platība ir 307,17 km2.
Sentence 5: Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
