In [1]:
import cupy

print(cupy.show_config())


OS                           : Linux-6.14.0-29-generic-x86_64-with-glibc2.39
Python Version               : 3.12.3
CuPy Version                 : 13.6.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.3.2
SciPy Version                : None
Cython Build Version         : 3.0.12
Cython Runtime Version       : None
CUDA Root                    : /usr/local/cuda-12.8
nvcc PATH                    : /usr/local/cuda-12.8/bin/nvcc
CUDA Build Version           : 12090
CUDA Driver Version          : 12080
CUDA Runtime Version         : 12090 (linked to CuPy) / 12080 (locally installed)
CUDA Extra Include Dirs      : ['/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/nvidia/cuda_runtime/include']
cuBLAS Version               : (available)
cuFFT Version                : 11303
cuRAND Version               : 10309
cuSOLVER Version             : (11, 7, 3)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 8)
Thrust Version    

In [2]:
# Cell 1: 
import spacy 
from pathlib import Path 
import os 
# 查看 spaCy 版本和可用组件 
spacy.info()

{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {}}

In [3]:
# Cell 2:
# 创建 corpus 和 models 文件夹
Path("./corpus").mkdir(parents=True, exist_ok=True)
Path("./models").mkdir(parents=True, exist_ok=True)


In [4]:
# Cell 3:
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus


[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (15055 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2080 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2396 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [5]:
#Cell 4: 
!python -m spacy init config config.cfg --lang lv --pipeline tok2vec,tagger,morphologizer,parser --optimize efficiency


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
# Cell 5: 
cfg_text = Path("config.cfg").read_text(encoding="utf-8")
cfg_text = cfg_text.replace("train = null", "train = ./corpus/lv_lvtb-ud-train.spacy")
cfg_text = cfg_text.replace("dev = null", "dev = ./corpus/lv_lvtb-ud-dev.spacy")
Path("config.cfg").write_text(cfg_text, encoding="utf-8")
print("✅ 已经修改 config.cfg 的数据路径")


✅ 已经修改 config.cfg 的数据路径


In [7]:
# Cell 6: 使用 GPU 训练 
!python -m spacy train config.cfg --output ./models --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_F  SCORE 
---  ------  ------------  -----------  -------------  -----------  -------  -------  ---------  -------  -------  -------  ------
  0       0          0.00        87.49          87.46       278.06    14.64    25.48      15.06     7.89     6.41     0.08    0.14
  0     200       2720.41     13465.03       12743.87     24709.62    46.00    74.19      51.85    47.61    29.28    80.65    0.50
  0     400       5448.86     12816.42       11640.31     23172.35    60.89    83.48      68.28    59.32    43.56    41.29    0.63
  0     600       7603.59     12742.24       10986.01     25559.40    68.95    87.58      76.15    63.37  

In [10]:
# Cell 6.5: 手动把 lookup.json 挂进模型
import shutil
from pathlib import Path

# 确保目录存在
Path("models/model-best/lookups").mkdir(parents=True, exist_ok=True)
Path("models/model-last/lookups").mkdir(parents=True, exist_ok=True)

# 拷贝 lookup.json 到两个模型里
shutil.copy("lookup.json", "models/model-best/lookups/lemma_lookup.json")
shutil.copy("lookup.json", "models/model-last/lookups/lemma_lookup.json")

print("✅ 已经把 lookup.json 挂到 model-best 和 model-last")


✅ 已经把 lookup.json 挂到 model-best 和 model-last


In [11]:
# Cell 7: 
!python -m spacy evaluate ./models/model-best ./corpus/lv_lvtb-ud-test.spacy


[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      99.53 
TAG      86.95 
POS      94.85 
MORPH    90.84 
UAS      81.23 
LAS      75.64 
SENT P   100.00
SENT R   100.00
SENT F   100.00
SPEED    14303 

[1m

               P       R       F
ExtPos     90.91   86.33   88.56
Case       92.22   91.58   91.90
Gender     93.73   92.99   93.36
Number     94.09   92.90   93.49
Person     95.30   93.98   94.63
PronType   98.42   97.86   98.14
Evident    94.35   93.03   93.69
Mood       93.73   92.05   92.88
Polarity   94.09   92.59   93.33
Tense      91.23   89.06   90.13
VerbForm   92.75   91.25   91.99
Voice      93.83   91.34   92.57
Definite   90.44   87.94   89.17
Degree     92.67   90.84   91.75
Poss       97.85   99.45   98.64
NumType    98.85   77.06   86.60
Reflex     93.39   92.69   93.04
Aspect     89.07   85.09   87.03
Foreign    79.85   56.32   66.05
Typo        0.00    0.00    0.00
Abbr       96.22   81.28   88.12

[1m

       

In [20]:
# Cell 8: 打包模型
from pathlib import Path

# 确保 packages 文件夹存在
Path("./packages").mkdir(parents=True, exist_ok=True)

# 打包模型，注意包名不要和路径重复
!python -m spacy package ./models/model-best ./packages --name latvian_model --version 1.0.0 --force


  parser = self.make_parser(ctx)
  self.parse_args(ctx, args)
[38;5;4mℹ Building package artifacts: sdist[0m
[38;5;2m✔ Including 1 package requirement(s) from meta and config[0m
spacy>=3.8.7,<3.9.0
[38;5;2m✔ Loaded meta.json from file[0m
models/model-best/meta.json
[38;5;2m✔ Generated README.md from meta.json[0m
[38;5;2m✔ Successfully created package directory 'lv_latvian_model-1.0.0'[0m
packages/lv_latvian_model-1.0.0
[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for sdist...[0m
running egg_info
creating lv_latvian_model.egg-info
writing lv_latvian_model.egg-info/PKG-INFO
writing dependency_links to lv_latvian_model.egg-info/dependency_links.txt
writing entry points to lv_latvian_model.egg-info/entry_points.txt
writing requirements to lv_latvian_model.egg-info/requires.txt
writing top-level names to lv_latvian_model.egg-info/top_level.txt
writing manifest 

In [None]:
# Cell 9: 安装并测试打包好的拉脱维亚语模型
import subprocess
import spacy

# 安装打包好的模型
package_path = "./packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz"

# 使用 subprocess 调用 pip 安装（Notebook 内直接运行也可以）
subprocess.run(["pip", "install", package_path])



Processing ./packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: lv_latvian_model
  Building wheel for lv_latvian_model (pyproject.toml): started
  Building wheel for lv_latvian_model (pyproject.toml): finished with status 'done'
  Created wheel for lv_latvian_model: filename=lv_latvian_model-1.0.0-py3-none-any.whl size=9049687 sha256=a0a1052f2727ac346c3e7e71e3d3f442223bc9aad7b95dc1b477f34e6fcfe5f4
  Stored in directory: /home/jesse/.cache/pip/wheels/7a/9e/71/1e57930844b68e7ad0ecd8336bccc8e0cf32783841865bc649
Successfully built lv_latvian_model
Installing collected packages: lv_latvian_model
  Attempting uni

CompletedProcess(args=['pip', 'install', './packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz'], returncode=0)

In [None]:
# 加载模型
# 安装后，包名是 lv_latvian_model
nlp = spacy.load("lv_latvian_model")

# 测试文本
# 你可以用 wiki 或者任何拉脱维亚语文本，这里举个示例
text = """
Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
"""

doc = nlp(text)

# 输出分析结果
print("TOKEN\tLEMMA\tPOS\tTAG\tDEP\tHEAD")
for token in doc:
    print(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.tag_}\t{token.dep_}\t{token.head.text}")

# 句子划分
print("\n句子划分:")
for sent in doc.sents:
    print(sent.text)


TOKEN	LEMMA	POS	TAG	DEP	HEAD

		NOUN	ncmsd2	nmod	Rīga
Rīga		PROPN	npfsn4	nsubj	galvaspilsēta
ir		AUX	vcnipii30an	cop	galvaspilsēta
Latvijas		PROPN	npfsg4	nmod	galvaspilsēta
galvaspilsēta		NOUN	ncfsn4	ROOT	galvaspilsēta
un		CCONJ	cc	cc	viens
viens		NUM	mcsmsn	conj	galvaspilsēta
no		ADP	sppd	case	centriem
galvenajiem		ADJ	armpdyp	amod	centriem
rūpniecības		NOUN	ncfsg4	nmod	centriem
,		PUNCT	zc	punct	darījumu
darījumu		NOUN	ncmpg1	conj	rūpniecības
,		PUNCT	zc	punct	kultūras
kultūras		NOUN	ncfsg4	conj	rūpniecības
,		PUNCT	zc	punct	sporta
sporta		NOUN	ncmsg1	conj	rūpniecības
un		CCONJ	cc	cc	finanšu
finanšu		NOUN	ncfdg5	conj	rūpniecības
centriem		NOUN	ncmpd1	nmod	viens
Baltijas		PROPN	npfsg4	nmod	valstīs
valstīs		NOUN	ncfpl6	nmod	centriem
,		PUNCT	zc	punct	pilsēta
kā		PART	q	cc	pilsēta
arī		CCONJ	cc	fixed	kā
nozīmīga		ADJ	afmsgnp	amod	pilsēta
ostas		NOUN	ncfsg4	nmod	pilsēta
pilsēta		NOUN	ncfsn4	conj	galvaspilsēta
.		PUNCT	zs	punct	iedzīvotājiem
Ar		ADP	sppd	case	iedzīvotājiem
605		NUM	xn	num