In [2]:
import cupy

print(cupy.show_config())


OS                           : Linux-6.14.0-29-generic-x86_64-with-glibc2.39
Python Version               : 3.12.3
CuPy Version                 : 13.6.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.3.2
SciPy Version                : None
Cython Build Version         : 3.0.12
Cython Runtime Version       : None
CUDA Root                    : /usr/local/cuda-12.8
nvcc PATH                    : /usr/local/cuda-12.8/bin/nvcc
CUDA Build Version           : 12090
CUDA Driver Version          : 12080
CUDA Runtime Version         : 12090 (linked to CuPy) / 12080 (locally installed)
CUDA Extra Include Dirs      : ['/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/nvidia/cuda_runtime/include']
cuBLAS Version               : (available)
cuFFT Version                : 11303
cuRAND Version               : 10309
cuSOLVER Version             : (11, 7, 3)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 8)
Thrust Version    

In [3]:
# Cell 1: 基础检查
import spacy
from pathlib import Path
import os

# 查看 spaCy 版本和可用组件
spacy.info()


{'spacy_version': '3.8.7',
 'location': '/home/jesse/Projects/myenvs/spacy_lv/lib/python3.12/site-packages/spacy',
 'platform': 'Linux-6.14.0-29-generic-x86_64-with-glibc2.39',
 'python_version': '3.12.3',
 'pipelines': {'lv_latvian_model': '1.0.0'}}

In [4]:
# Cell 2: 创建 corpus 和 models 文件夹
Path("./corpus").mkdir(parents=True, exist_ok=True)
Path("./models").mkdir(parents=True, exist_ok=True)


In [5]:
# Cell 3: 转换 conllu 到 spaCy 格式
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus


[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (15055 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2080 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2396 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [6]:
# Cell 4: 初始化 config
!python -m spacy init config config.cfg --lang lv --pipeline tok2vec,tagger,morphologizer,parser,senter --optimize efficiency


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
# Cell 5: 修改 config 数据路径
cfg_text = Path("config.cfg").read_text(encoding="utf-8")
cfg_text = cfg_text.replace("train = null", "train = ./corpus/lv_lvtb-ud-train.spacy")
cfg_text = cfg_text.replace("dev = null", "dev = ./corpus/lv_lvtb-ud-dev.spacy")
Path("config.cfg").write_text(cfg_text, encoding="utf-8")
print("✅ 已经修改 config.cfg 的数据路径")



✅ 已经修改 config.cfg 的数据路径


In [9]:
# Cell 6: 使用 GPU 训练（没有 GPU 就去掉 --gpu-id 0）
!python -m spacy train config.cfg --output ./models --paths.train ./corpus/lv_lvtb-ud-train.spacy --paths.dev ./corpus/lv_lvtb-ud-dev.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  ------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0          0.00        87.49          87.46       278.06        49.00    14.64    25.48      15.06     7.89     6.41     0.04     0.67     0.08    0.10
  0     200       2633.92     13476.89       12727.74     24564.91      1502.43    45.97    73.97      51.92    49.83    30.72    58.85    76.25    66.43    0.54
  0     400       5551.72     12816.16       11668.76     23360.60       156.25    60.77    83.42      

In [10]:
# Cell 7: 模型评估
!python -m spacy evaluate ./models/model-best ./corpus/lv_lvtb-ud-test.spacy


[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      99.53
TAG      86.63
POS      94.78
MORPH    90.65
UAS      80.84
LAS      75.42
SENT P   99.75
SENT R   99.87
SENT F   99.81
SPEED    13945

[1m

               P       R       F
ExtPos     89.15   82.73   85.82
Case       92.07   91.64   91.86
Gender     93.59   93.04   93.32
Number     93.83   92.99   93.41
Person     95.26   94.02   94.64
PronType   98.39   97.86   98.12
Evident    94.74   93.06   93.89
Mood       93.82   92.27   93.04
Polarity   94.41   92.89   93.64
Tense      91.34   89.11   90.21
VerbForm   93.04   91.52   92.28
Voice      93.88   91.69   92.77
Definite   90.77   87.87   89.30
Degree     93.09   90.49   91.77
Poss       99.45   99.45   99.45
NumType    99.03   76.61   86.39
Reflex     95.02   92.37   93.68
Aspect     89.14   85.71   87.39
Foreign    80.37   45.26   57.91
Typo        0.00    0.00    0.00
Abbr       92.11   79.91   85.57

[1m

                 

In [25]:
# Cell 8: 加入 Stanza lemmatizer 并保存
import spacy
import latvian_stanza_model  # ✅ 注册自定义 factory

# 加载训练好的模型
nlp = spacy.load("./models/model-best")

# 添加 Stanza Lemmatizer
nlp.add_pipe("stanza_lemmatizer", last=True)

# 保存新模型
nlp.to_disk("./models/model-with-stanza-lemma")
print("✅ 模型已保存到 ./models/model-with-stanza-lemma")


✅ 模型已保存到 ./models/model-with-stanza-lemma


In [27]:
# Cell 8.5: Notebook 打包修正版
from spacy.cli.package import package
import os
from pathlib import Path

# 创建 packages 文件夹
Path("./packages").mkdir(parents=True, exist_ok=True)

# 确保 Python 能找到自定义 factory
project_root = "/home/jesse/Projects/myprojs/Spacy_Latvian"
os.environ["PYTHONPATH"] = f"{project_root}:{os.environ.get('PYTHONPATH','')}"

# 转换为 Path 对象
input_dir = Path("./models/model-with-stanza-lemma")
output_dir = Path("./packages")

# 调用 spacy.cli.package
package(
    input_dir=input_dir,
    output_dir=output_dir,
    name="latvian_model",
    version="1.0.0",
    force=True
)

print("✅ 打包完成，可以在 packages 文件夹里找到")


[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for sdist...[0m
running egg_info
creating lv_latvian_model.egg-info
writing lv_latvian_model.egg-info/PKG-INFO
writing dependency_links to lv_latvian_model.egg-info/dependency_links.txt
writing entry points to lv_latvian_model.egg-info/entry_points.txt
writing top-level names to lv_latvian_model.egg-info/top_level.txt
writing manifest file 'lv_latvian_model.egg-info/SOURCES.txt'
reading manifest file 'lv_latvian_model.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_latvian_model.egg-info/SOURCES.txt'
[1m* Building sdist...[0m




running sdist
running egg_info
writing lv_latvian_model.egg-info/PKG-INFO
writing dependency_links to lv_latvian_model.egg-info/dependency_links.txt
writing entry points to lv_latvian_model.egg-info/entry_points.txt
writing top-level names to lv_latvian_model.egg-info/top_level.txt
reading manifest file 'lv_latvian_model.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_latvian_model.egg-info/SOURCES.txt'
running check
creating lv_latvian_model-1.0.0
creating lv_latvian_model-1.0.0/lv_latvian_model
creating lv_latvian_model-1.0.0/lv_latvian_model.egg-info
creating lv_latvian_model-1.0.0/lv_latvian_model/lv_latvian_model-1.0.0
creating lv_latvian_model-1.0.0/lv_latvian_model/lv_latvian_model-1.0.0/morphologizer
creating lv_latvian_model-1.0.0/lv_latvian_model/lv_latvian_model-1.0.0/parser
creating lv_latvian_model-1.0.0/lv_latvian_model/lv_latvian_model-1.0.0/senter
creating lv_latvian_model-1.0.0/lv_latvian_model/lv_latvian_model-1.0.0/tagger
creat

In [28]:
# Cell 9: 安装并测试打包好的拉脱维亚语模型
import subprocess
import spacy

# 安装打包好的模型
package_path = "./packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz"

# 使用 subprocess 调用 pip 安装（Notebook 内直接运行也可以）
subprocess.run(["pip", "install", package_path])



Processing ./packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: lv_latvian_model
  Building wheel for lv_latvian_model (pyproject.toml): started
  Building wheel for lv_latvian_model (pyproject.toml): finished with status 'done'
  Created wheel for lv_latvian_model: filename=lv_latvian_model-1.0.0-py3-none-any.whl size=8773269 sha256=0cd66d637ec3b611379408ce6a1084d89d6e293bcb80b0a90502fdc3804d63ff
  Stored in directory: /home/jesse/.cache/pip/wheels/7a/9e/71/1e57930844b68e7ad0ecd8336bccc8e0cf32783841865bc649
Successfully built lv_latvian_model
Installing collected packages: lv_latvian_model
  Attempting uni

CompletedProcess(args=['pip', 'install', './packages/lv_latvian_model-1.0.0/dist/lv_latvian_model-1.0.0.tar.gz'], returncode=0)

In [30]:
import spacy

# 加载模型
nlp = spacy.load("lv_latvian_model")

# 查看pipeline组件
print("Pipeline组件：", nlp.pipe_names)
print("组件数量：", len(nlp.pipe_names))


Pipeline组件： ['tok2vec', 'tagger', 'morphologizer', 'parser', 'senter', 'stanza_lemmatizer']
组件数量： 6


In [40]:
# 加载模型
# 安装后，包名是 lv_latvian_model
nlp = spacy.load("lv_latvian_model")

# 测试文本
# 你可以用 wiki 或者任何拉脱维亚语文本，这里举个示例
text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# 输出分析结果
print("TOKEN\tLEMMA\tPOS\tTAG\tDEP\tHEAD")
for token in doc:
    print(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.tag_}\t{token.dep_}\t{token.head.text}")

# 句子划分
print("\n句子划分:")
for sent in doc.sents:
    print(sent.text)


TOKEN	LEMMA	POS	TAG	DEP	HEAD
Rīga	Rīga	PROPN	npfsn4	nsubj	galvaspilsēta
ir	būt	AUX	vcnipii30an	cop	galvaspilsēta
Latvijas	Latvija	PROPN	npfsg4	nmod	galvaspilsēta
galvaspilsēta	galvaspilsēta	NOUN	ncfsn4	ROOT	galvaspilsēta
un	un	CCONJ	cc	cc	viens
viens	viens	NUM	mcsmsn	conj	galvaspilsēta
no	no	ADP	sppd	case	rūpniecības
galvenajiem	galvenais	ADJ	armpdyp	amod	rūpniecības
rūpniecības	rūpniecība	NOUN	ncfsg4	nmod	centriem
,	,	PUNCT	zc	punct	darījumu
darījumu	darījums	NOUN	ncmsa1	conj	rūpniecības
,	,	PUNCT	zc	punct	kultūras
kultūras	kultūra	NOUN	ncfsg4	conj	rūpniecības
,	,	PUNCT	zc	punct	sporta
sporta	sports	NOUN	ncmsg1	conj	rūpniecības
un	un	CCONJ	cc	cc	finanšu
finanšu	finanses	NOUN	ncfdg5	conj	rūpniecības
centriem	centrs	NOUN	ncmpd1	nmod	viens
Baltijas	Baltija	PROPN	npfsg4	nmod	valstīs
valstīs	valsts	NOUN	ncfpl6	nmod	centriem
,	,	PUNCT	zc	punct	pilsēta
kā	kā	PART	q	cc	pilsēta
arī	arī	CCONJ	cc	fixed	kā
nozīmīga	nozīmīgs	ADJ	affsnnp	amod	pilsēta
ostas	osta	NOUN	ncfsg4	nmod	pilsēta
pilsēta	pils

In [39]:
import spacy
import pandas as pd

# 加载你打包好的拉脱维亚语模型
nlp = spacy.load("lv_latvian_model")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā."""

# 处理文本
doc = nlp(text)

# 构建表格数据
rows = []
for sent in doc.sents:
    for token in sent:
        rows.append({
            # "Sentence": sent.text.strip(),
            "Text": token.text,
            "Lemma": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Head": token.head.text
        })

# 转成 DataFrame
df = pd.DataFrame(rows)

# 显示表格
pd.set_option("display.max_rows", None)  # 显示所有行
pd.set_option("display.max_colwidth", None)  # 列内容不截断
df


Unnamed: 0,Text,Lemma,POS,Dependency,Head
0,Rīga,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,būt,AUX,cop,galvaspilsēta
2,Latvijas,Latvija,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,un,CCONJ,cc,viens
5,viens,viens,NUM,conj,galvaspilsēta
6,no,no,ADP,case,rūpniecības
7,galvenajiem,galvenais,ADJ,amod,rūpniecības
8,rūpniecības,rūpniecība,NOUN,nmod,centriem
9,",",",",PUNCT,punct,darījumu


In [42]:
# Cell: Pipeline 全部组件测试（可读性表格输出）
import spacy
import pandas as pd

# 假设你的模型已经加载好了
# nlp = spacy.load("lv_latvian_model-1.0.0")
text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē."""

doc = nlp(text)

# 收集每个 token 的信息
data = []
for token in doc:
    sent_text = token.sent.text  # 当前 token 所在句子
    data.append({
        "Text": token.text,
        "Lemma": token.lemma_,
        "POS": token.pos_,
        "Morph": token.morph,
        "Dependency": token.dep_,
        "Head": token.head.text,
        "Sentence": sent_text
    })

# 转成 DataFrame，方便阅读
df = pd.DataFrame(data)

# 输出前 50 行示例，可视化效果好
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 100)
print(df)


             Text          Lemma    POS  \
0            Rīga           Rīga  PROPN   
1              ir            būt    AUX   
2        Latvijas        Latvija  PROPN   
3   galvaspilsēta  galvaspilsēta   NOUN   
4              un             un  CCONJ   
5           viens          viens    NUM   
6              no             no    ADP   
7     galvenajiem      galvenais    ADJ   
8     rūpniecības     rūpniecība   NOUN   
9               ,              ,  PUNCT   
10       darījumu       darījums   NOUN   
11              ,              ,  PUNCT   
12       kultūras        kultūra   NOUN   
13              ,              ,  PUNCT   
14         sporta         sports   NOUN   
15             un             un  CCONJ   
16        finanšu       finanses   NOUN   
17       centriem         centrs   NOUN   
18       Baltijas        Baltija  PROPN   
19        valstīs         valsts   NOUN   
20              ,              ,  PUNCT   
21             kā             kā   PART   
22         