### Due to GitHub's file size limit (maximum 100MB per file), the larger RoBERTa-based models are not included in this repository.

Specifically, files under ./models/* and ./packages/* are excluded from version control.

### 由于 GitHub 对单个文件大小的限制（最大 100MB），基于 RoBERTa 的大规模模型未包含在此仓库中。

具体而言，./models/* 与 ./packages/* 路径下的文件未纳入版本控制。

In [1]:
# =========================
# Cell 0：基础导入 & 路径设置
# =========================
from pathlib import Path
import os
import spacy
from spacy.lookups import Lookups
from spacy.tokens import DocBin
from spacy.cli.package import package

# 项目根目录
project_root = Path(".").resolve()

# 目录结构
corpus_dir = project_root / "corpus"
models_dir = project_root / "models"
model_name = "lv_roberta_base"
trained_model_path = models_dir / model_name / "model-best"
final_model_path = models_dir / model_name / "model-lv_roberta"
lookups_path = project_root / "lookups_lv"
package_output_dir = project_root / "packages"
config_path = project_root / "config" / "config_roberta_base.cfg"

# 创建目录
for p in [corpus_dir, models_dir, models_dir / model_name, package_output_dir, lookups_path, project_root / "config"]:
    p.mkdir(parents=True, exist_ok=True)

print("✅ 目录结构准备完成")



✅ 目录结构准备完成


In [2]:
# ===============================
# Cell 1：Conllu 转换为 spaCy 格式
# ===============================
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10


[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m


In [3]:
# =============================
# Cell 2：初始化 config
# =============================
!python -m spacy init config ./config/config_roberta_base.cfg \
    --lang lv \
    --pipeline transformer,tagger,morphologizer,parser,senter \
    --optimize efficiency \
    --gpu


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: GPU
- Transformer: bert-base-multilingual-uncased
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_roberta_base.cfg
You can now add your data and train your pipeline:
python -m spacy train config_roberta_base.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [4]:
# ==========================
# Cell 3：修改 config
# ==========================

cfg_text = config_path.read_text(encoding="utf-8")

# 替换训练/验证集路径
cfg_text = cfg_text.replace("train = null", f"train = {corpus_dir}/lv_lvtb-ud-train.spacy")
cfg_text = cfg_text.replace("dev = null", f"dev = {corpus_dir}/lv_lvtb-ud-dev.spacy")

# 替换 transformer 模型
cfg_text = cfg_text.replace("bert-base-multilingual-uncased", "xlm-roberta-base")

# 开启混合精度
cfg_text = cfg_text.replace("mixed_precision = false", "mixed_precision = true")

config_path.write_text(cfg_text, encoding="utf-8")
print("✅ config 已更新训练/验证路径、transformer 模型，并启用混合精度")


✅ config 已更新训练/验证路径、transformer 模型，并启用混合精度


In [5]:
# ================================
# Cell 4：生成 lemma lookup table
# ================================

# 加载训练集
docbin = DocBin().from_disk(corpus_dir / "lv_lvtb-ud-train.spacy")
lemma_dict = {}

for doc in docbin.get_docs(spacy.blank("lv").vocab):
    for token in doc:
        if token.lemma_:
            lemma_dict[token.text.lower()] = token.lemma_

lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_dict)
lookups.to_disk(lookups_path)
print(f"✅ 已生成 lemma lookup table, 路径: {lookups_path}")


  from .autonotebook import tqdm as notebook_tqdm


✅ 已生成 lemma lookup table, 路径: /home/jesse/Projects/myprojs/spaCy_Pipeline/LV_RoBERTa_base/lookups_lv


In [6]:
# =============================
# Cell 5：训练 RoBERTa_base 模型
# =============================
!python -m spacy train ./config/config_roberta_base.cfg \
    --output ./models/lv_roberta_base\
    --paths.train ./corpus/lv_lvtb-ud-train.spacy \
    --paths.dev ./corpus/lv_lvtb-ud-dev.spacy \
    --gpu-id 0


[38;5;4mℹ Saving to output directory: models/lv_roberta_base[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
tokenizer_config.json: 100%|██████████████████| 25.0/25.0 [00:00<00:00, 303kB/s]
config.json: 100%|█████████████████████████████| 615/615 [00:00<00:00, 12.1MB/s]
sentencepiece.bpe.model: 100%|█████████████| 5.07M/5.07M [00:04<00:00, 1.14MB/s]
tokenizer.json: 100%|██████████████████████| 9.10M/9.10M [00:01<00:00, 4.65MB/s]
model.safetensors: 100%|███████████████████| 1.12G/1.12G [01:09<00:00, 16.0MB/s]
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  -------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  

In [7]:
# ========================
# Cell 6: 模型评估
# ========================
!python -m spacy evaluate ./models/lv_roberta_base//model-best ./corpus/lv_lvtb-ud-test.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      92.90
POS      97.94
MORPH    96.18
UAS      91.55
LAS      88.21
SENT P   98.37
SENT R   98.08
SENT F   98.22
SPEED    19781

[1m

                P        R        F
ExtPos      93.18    88.49    90.77
Case        98.29    97.49    97.89
Gender      97.64    96.70    97.17
Number      98.38    97.46    97.92
Person      99.12    98.80    98.96
PronType    98.54    98.09    98.31
Evident     99.24    99.00    99.12
Mood        98.79    98.47    98.63
Polarity    99.46    99.38    99.42
Tense       97.91    97.76    97.83
VerbForm    98.93    98.87    98.90
Voice       99.21    98.98    99.09
Definite    97.96    98.15    98.05
Degree      98.78    98.97    98.88
Poss       100.00   100.00   100.00
NumType     99.42    76.91    86.73
Reflex      99.57    99.25    99.41
Aspect      98.12    98.12    98.12
Foreign     98.10    81.58    89.08
Typo        50.00     1.92     3.70
Abbr        97.87    84.02    90.42

[1m

          

In [8]:
# ==========================================
# Cell 7：添加 Lemmatizer (lookup) 并保存模型
# ==========================================
import spacy
from spacy.lookups import Lookups
from pathlib import Path

trained_model_path = "./models/lv_roberta_base/model-best"
final_model_path = "./models/lv_roberta_base/model_lv_roberta"
lookups_path = "./lookups_lv"

# 1️⃣ 加载训练好的模型
nlp = spacy.load(trained_model_path)

# 2️⃣ 加载 lookups
lookups = Lookups().from_disk(lookups_path)

# 3️⃣ 添加 lemmatizer（lookup 模式）并直接传入 lookups
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode":"lookup"}, last=True)
lemmatizer.lookups = lookups  # ⚡ 注意这里直接赋值属性，而不是在 config 里传

# 4️⃣ 保存最终模型
nlp.to_disk(final_model_path)
print(f"✅ 模型已保存到 {final_model_path}，包含 lemmatizer + lookups")


✅ 模型已保存到 ./models/lv_roberta_base/model_lv_roberta，包含 lemmatizer + lookups


In [9]:
# =======================
# Cell 8：打包模型
# =======================

from spacy.cli.package import package
from pathlib import Path
import os

project_root = Path(".").resolve()
os.environ["PYTHONPATH"] = f"{project_root}:{os.environ.get('PYTHONPATH','')}"

# ⚡ 注意这里把字符串路径换成 Path 对象
package(
    input_dir=Path(final_model_path),
    output_dir=Path(package_output_dir),
    name="roberta_base",
    version="1.0.0",
    force=True
)

print(f"✅ 打包完成，发布包在 {package_output_dir} 文件夹里")

[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for sdist...[0m
running egg_info
creating lv_roberta_base.egg-info
writing lv_roberta_base.egg-info/PKG-INFO
writing dependency_links to lv_roberta_base.egg-info/dependency_links.txt
writing entry points to lv_roberta_base.egg-info/entry_points.txt
writing requirements to lv_roberta_base.egg-info/requires.txt
writing top-level names to lv_roberta_base.egg-info/top_level.txt
writing manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
reading manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
[1m* Building sdist...[0m




running sdist
running egg_info
writing lv_roberta_base.egg-info/PKG-INFO
writing dependency_links to lv_roberta_base.egg-info/dependency_links.txt
writing entry points to lv_roberta_base.egg-info/entry_points.txt
writing requirements to lv_roberta_base.egg-info/requires.txt
writing top-level names to lv_roberta_base.egg-info/top_level.txt
reading manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
running check
creating lv_roberta_base-1.0.0
creating lv_roberta_base-1.0.0/lv_roberta_base
creating lv_roberta_base-1.0.0/lv_roberta_base.egg-info
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/lemmatizer/lookups
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/morphologizer
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/parser
creating lv_roberta_base-1.0.0/lv

In [10]:
# =======================
# Cell 9：生成 wheel + sdist
# =======================
import subprocess
from pathlib import Path

package_output_dir = Path("./packages/lv_roberta_base-1.0.0")

# 进入打包目录生成 wheel 和 sdist
subprocess.run(
    ["python", "-m", "build", "--wheel", "--sdist"],
    cwd=str(package_output_dir)
)

print(f"✅ wheel + sdist 已生成，路径: {package_output_dir / 'dist'}")


[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for wheel...[0m
running egg_info
writing lv_roberta_base.egg-info/PKG-INFO
writing dependency_links to lv_roberta_base.egg-info/dependency_links.txt
writing entry points to lv_roberta_base.egg-info/entry_points.txt
writing requirements to lv_roberta_base.egg-info/requires.txt
writing top-level names to lv_roberta_base.egg-info/top_level.txt
reading manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
[1m* Building wheel...[0m




running bdist_wheel
running build
running build_py
creating build/lib/lv_roberta_base
copying lv_roberta_base/__init__.py -> build/lib/lv_roberta_base
creating build/lib/lv_roberta_base/lv_roberta_base-1.0.0
copying lv_roberta_base/lv_roberta_base-1.0.0/meta.json -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0
copying lv_roberta_base/lv_roberta_base-1.0.0/README.md -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0
copying lv_roberta_base/lv_roberta_base-1.0.0/tokenizer -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0
copying lv_roberta_base/lv_roberta_base-1.0.0/config.cfg -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0
creating build/lib/lv_roberta_base/lv_roberta_base-1.0.0/morphologizer
copying lv_roberta_base/lv_roberta_base-1.0.0/morphologizer/cfg -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0/morphologizer
copying lv_roberta_base/lv_roberta_base-1.0.0/morphologizer/model -> build/lib/lv_roberta_base/lv_roberta_base-1.0.0/morphologizer
creating build/lib/lv_roberta_ba



adding 'lv_roberta_base/lv_roberta_base-1.0.0/tagger/model'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/transformer/cfg'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/transformer/model'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/vocab/key2row'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/vocab/lookups.bin'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/vocab/strings.json'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/vocab/vectors'
adding 'lv_roberta_base/lv_roberta_base-1.0.0/vocab/vectors.cfg'
adding 'lv_roberta_base-1.0.0.dist-info/METADATA'
adding 'lv_roberta_base-1.0.0.dist-info/WHEEL'
adding 'lv_roberta_base-1.0.0.dist-info/entry_points.txt'
adding 'lv_roberta_base-1.0.0.dist-info/top_level.txt'
adding 'lv_roberta_base-1.0.0.dist-info/RECORD'
removing build/bdist.linux-x86_64/wheel
[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for sdist...[0m
running egg_i



running sdist
running egg_info
writing lv_roberta_base.egg-info/PKG-INFO
writing dependency_links to lv_roberta_base.egg-info/dependency_links.txt
writing entry points to lv_roberta_base.egg-info/entry_points.txt
writing requirements to lv_roberta_base.egg-info/requires.txt
writing top-level names to lv_roberta_base.egg-info/top_level.txt
reading manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_roberta_base.egg-info/SOURCES.txt'
running check
creating lv_roberta_base-1.0.0
creating lv_roberta_base-1.0.0/lv_roberta_base
creating lv_roberta_base-1.0.0/lv_roberta_base.egg-info
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/lemmatizer/lookups
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/morphologizer
creating lv_roberta_base-1.0.0/lv_roberta_base/lv_roberta_base-1.0.0/parser
creating lv_roberta_base-1.0.0/lv

In [11]:
# ==========================
# Cell 10A: 通过wheel文件安装
# ==========================
import subprocess
import spacy
import pandas as pd


# ⚡ 安装 wheel
subprocess.run([
    "pip", 
    "install", 
    "./packages/lv_roberta_base-1.0.0/dist/lv_roberta_base-1.0.0-py3-none-any.whl"
])

# 加载模型
nlp_xlmr = spacy.load("lv_roberta_base")

print("lv_roberta_base Pipeline组件:", nlp_xlmr.pipe_names)

Processing ./packages/lv_roberta_base-1.0.0/dist/lv_roberta_base-1.0.0-py3-none-any.whl
Installing collected packages: lv-roberta-base
Successfully installed lv-roberta-base-1.0.0
lv_roberta_base Pipeline组件: ['transformer', 'tagger', 'morphologizer', 'parser', 'senter', 'lemmatizer']


In [12]:
# =============================
# Cell 10B: 通过'tar.gz'文件安装
# =============================
import subprocess
import spacy
import pandas as pd

# 安装 '.tar.gz'
subprocess.run(["pip", "install", "./packages/lv_roberta_base-1.0.0/dist/lv_roberta_base-1.0.0.tar.gz"])
nlp_xlmr = spacy.load("lv_roberta_base")

print("lv_roberta_base Pipeline组件:", nlp_xlmr.pipe_names)

Processing ./packages/lv_roberta_base-1.0.0/dist/lv_roberta_base-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_roberta_base
  Building wheel for lv_roberta_base (setup.py): started


[33m  DEPRECATION: Building 'lv_roberta_base' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_roberta_base'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_roberta_base (setup.py): finished with status 'done'
  Created wheel for lv_roberta_base: filename=lv_roberta_base-1.0.0-py3-none-any.whl size=858349422 sha256=05abd247e6d0dded6808d68288acfcf28c4cab8f56300f80e3f522e40806f822
  Stored in directory: /home/jesse/.cache/pip/wheels/e7/b4/65/841da05f79e17598264d0f597ec923cdb73a318a6773aea609
Successfully built lv_roberta_base
Installing collected packages: lv_roberta_base
  Attempting uninstall: lv_roberta_base
    Found existing installation: lv_roberta_base 1.0.0
    Uninstalling lv_roberta_base-1.0.0:
      Successfully uninstalled lv_roberta_base-1.0.0
Successfully installed lv_roberta_base-1.0.0
lv_roberta_base Pipeline组件: ['transformer', 'tagger', 'morphologizer', 'parser', 'senter', 'lemmatizer']


In [13]:
# ========================================
# Cell 11: 测试模型，显示 Lemma，并显示分句
# ========================================
import spacy
import pandas as pd

nlp = spacy.load("lv_roberta_base")

text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm. Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā. Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji. Tomēr par šī vārda izcelsmi precīzu ziņu nav. Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu. Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi. Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’. Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā. Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168. m. ē.) izveidotajā Austrumeiropas kartē Baltijas jūra nosaukta par "Sarmatu jūru" (MARE SARMATICVM). Citos avotos minēts nosaukums OCEANUS SARMATICUS. Sarmati bija sena klejotāju tauta, kura runājusi indoirāņu valodā un līdz 4. gadsimtam dzīvoja Austrumeiropā. Tacits to dēvēja par "Svēbu jūru" (Mare Suebicum). Vēsturē pazīstami arī citu tautu dotie Baltijas jūras nosaukumi: "Varjagu jūra", "Barbaru jūra",[2] "Vendu jūra". Tā kādā 946. gada dokumentā to dēvēja par "Rūgu jūru" (no rūgu vārda cēlies tagadējās Rīgenes salas nosaukums),[3] bet Nestora hronikas ievadā Baltijas jūra nosaukta par Varjagu jūru.[4][5] Štumpfa (Stumpf) Eiropas kartē,[6] kas izdota Cīrihē, 1548. gadā (pārkopēta no vecākas Sebastiana Minstera veidotas kartes), Baltijas jūra tiek saukta par "Vācu jūru" (Das Deutsche Meer). Arī vecākā 1450. gadā Fra Mauro zīmētajā pasaules kartē redzams nosaukums "Ģermāņu jūra" vai pat okeāns, ko parasti piedēvē Ziemeļjūrai, tomēr minētajā kartē tā nepārprotami ir arī Baltijas jūra.[7] Latvijas piekrastes iedzīvotāji mēdza atklāto jūru dēvēt par "Dižjūru", bet Rīgas līci — par "Mazo jūru" (Mazjūru). Viduslaikos Rīgas līci latīniski dēvēja par "Līvu jūru" (Mare Livonicum). Senākajās Eiropas kartēs par jūrām tika saukti arī lielākie Baltijas jūras līči."""

doc = nlp(text)

# 先生成 token 表格
rows = []
for sent in doc.sents:
    for token in sent:
        rows.append({
            "Text": token.text,
            "Lemma": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Head": token.head.text
        })

df = pd.DataFrame(rows)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
display(df)  # 显示 token 表格

# 再单独显示分句
print("\n✅ 分句结果：")
for i, sent in enumerate(doc.sents, 1):
    print(f"Sentence {i}: {sent.text}")

Unnamed: 0,Text,Lemma,POS,Dependency,Head
0,Baltijas,Baltijas,PROPN,nmod,jūras
1,jūras,jūra,NOUN,nmod,nosaukums
2,nosaukums,nosaukums,NOUN,nsubj,devis
3,ir,būt,AUX,aux,devis
4,devis,dot,VERB,ROOT,devis
5,nosaukumu,nosaukums,NOUN,obj,devis
6,baltu,balts,NOUN,nmod,valodām
7,valodām,valoda,NOUN,nmod,nosaukumu
8,un,un,CCONJ,cc,valstīm
9,Baltijas,Baltijas,PROPN,nmod,valstīm



✅ 分句结果：
Sentence 1: Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Sentence 2: Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā.
Sentence 3: Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji.
Sentence 4: Tomēr par šī vārda izcelsmi precīzu ziņu nav.
Sentence 5: Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu.
Sentence 6: Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi.
Sentence 7: Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’.
Sentence 8: Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā.
Sentence 9: Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168. m. ē.) izveidotaj

In [14]:
import spacy
import numpy as np

# Load the pipeline
# 加载模型流水线
nlp = spacy.load("lv_roberta_base")

# Example text
# 示例文本
text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā."""

# Process text
# 处理文本
doc = nlp(text)

# ------------------------
# Tokenization / 分词
# ------------------------
print("Tokens / 分词结果:")
print([token.text for token in doc])

# ------------------------
# Lemmatization / 词形还原
# ------------------------
print("Lemmas / 词形还原结果:")
print([token.lemma_ for token in doc])

# ------------------------
# Part-of-Speech Tagging / 词性标注
# ------------------------
print("POS tags / 词性标注:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# ------------------------
# Morphological Features / 形态特征
# ------------------------
print("Morphological features / 形态特征:")
for token in doc:
    print(f"{token.text}: {token.morph}")

# ------------------------
# Dependency Parsing / 依存句法分析
# ------------------------
print("Dependency parsing / 依存句法分析:")
for token in doc:
    print(f"{token.text} <--{token.dep_}-- {token.head.text}")

# ------------------------
# Sentence Segmentation / 分句
# ------------------------
print("Sentences / 分句结果:")
for sent in doc.sents:
    print(sent.text)

# ------------------------
# 查看流水线组件
# ------------------------
print("Pipeline components / 流水线组件:")
print(nlp.pipe_names)

# Transformer vectors
# Transformer 向量表示
vectors = np.vstack([token.vector for token in doc])
print("Token vectors shape / Token 向量维度:", vectors.shape)

Tokens / 分词结果:
['Baltijas', 'jūras', 'nosaukums', 'ir', 'devis', 'nosaukumu', 'baltu', 'valodām', 'un', 'Baltijas', 'valstīm', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietoja', 'vācu', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimtā', '.']
Lemmas / 词形还原结果:
['Baltijas', 'jūra', 'nosaukums', 'būt', 'dot', 'nosaukums', 'balts', 'valoda', 'un', 'Baltijas', 'valsts', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietot', 'vāci', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimts', '.']
POS tags / 词性标注:
Baltijas: PROPN (npfsg4)
jūras: NOUN (ncfsg4)
nosaukums: NOUN (ncmsn1)
ir: AUX (vcnipii30an)
devis: VERB (vmnpdmsnasnpn)
nosaukumu: NOUN (ncmsa1)
baltu: NOUN (ncmpg1)
valodām: NOUN (ncfpd4)
un: CCONJ (cc)
Baltijas: PROPN (npfsg4)
valstīm: NOUN (ncfpd6)
.: PUNCT (zs)

: PUNCT (zs)
Terminu: NOUN (ncmsa1)
": PUNCT (zq)
Baltijas: PROPN (npfsg4)
jūra: NOUN (ncfsn4)
": PUNCT (zq)
(: PU