### You can also download the model from Hugging Face

Hugging Face Model Repo:
https://huggingface.co/JesseHuang922/lv_spaCy_CNN



In [1]:
# ==============================
# Cell 0：Import and directories
# ==============================
from pathlib import Path
import os
import spacy
from spacy.lookups import Lookups
from spacy.tokens import DocBin
from spacy.cli.package import package

# Create root directory
project_root = Path(".").resolve()

# Project structure
corpus_dir = project_root / "corpus"
models_dir = project_root / "models"
model_name = "lv_spacy_cnn"
trained_model_path = models_dir / model_name / "model-best"
final_model_path = models_dir / model_name / "model_lv_cnn"
lookups_path = project_root / "lookups_lv"
package_output_dir = project_root / "packages"
config_path = project_root / "config" / "config_spacy_cnn.cfg"

# Create directories
for p in [corpus_dir, models_dir, models_dir / model_name, package_output_dir, lookups_path, project_root / "config"]:
    p.mkdir(parents=True, exist_ok=True)

print("Project structure has been set!")



Project structure has been set!


In [2]:
# ======================================
# Cell 1：Convert conllu to spaCy format
# ======================================
!python -m spacy convert ud_latvian/lv_lvtb-ud-train.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-dev.conllu ./corpus -n 10
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./corpus -n 10

# For testing
!python -m spacy convert ud_latvian/lv_lvtb-ud-test.conllu ./test
"""
Using a lookup table for lemmatization matches words solely based on their surface form (or lowercase),
without considering context. In longer documents (multiple sentences or complex structures):

    - spaCy's lemmatization may be indirectly affected by pipeline processing and Vocab caching. 
      For example, repeated tokens or subtle variations in capitalization/punctuation can lead 
      to lookup misses.
    - Some compound or modified words might not exist in the lookup table.

As a result, longer documents increase the likelihood of lookup failures, reducing overall lemma accuracy.

To balance this, during training we group 10 sentences per Doc to provide richer context for
sentence segmentation learning. For evaluating lemma performance, however, we use a test set
with one sentence per Doc, which isolates lemma accuracy from potential inter-sentence effects.
"""



print("All conllu files are converted to spaCy Format.")


[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
corpus/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (208 documents):
corpus/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (240 documents):
corpus/lv_lvtb-ud-test.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2396 documents):
test/lv_lvtb-ud-test.spacy[0m
All conllu files are converted to spaCy Format.


In [3]:
# ===================================================================================================================
# Cell 2：Initializing config (if you add -gpu, the backbone would be changed to transformer based pre-trained model)
# ===================================================================================================================
!python -m spacy init config ./config/config_spacy_cnn.cfg \
    --lang lv \
    --pipeline tok2vec,tagger,morphologizer,parser,senter \
    --optimize efficiency



[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_spacy_cnn.cfg
You can now add your data and train your pipeline:
python -m spacy train config_spacy_cnn.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [4]:
# ==========================
# Cell 3：Modify config
# ==========================

# Read config file
cfg_text = config_path.read_text(encoding="utf-8")

# Replace the training/validation set path
cfg_text = cfg_text.replace("train = null", f"train = {corpus_dir}/lv_lvtb-ud-train.spacy")
cfg_text = cfg_text.replace("dev = null", f"dev = {corpus_dir}/lv_lvtb-ud-dev.spacy")

# # Turn on Mixed Precision
cfg_text = cfg_text.replace("mixed_precision = false", "mixed_precision = true")

config_path.write_text(cfg_text, encoding="utf-8")
print("config updated!")


config updated!


In [5]:
# ===================================
# Cell 4：Generate lemma lookup table
# ===================================
from spacy.tokens import DocBin
from spacy.lookups import Lookups
import spacy
from pathlib import Path

# Paths
corpus_dir = Path("corpus")
lookups_path = Path("lookups_lv")

# --------------- Choose generation mode ---------------
# Mode 1: Strict evaluation mode (train + dev only)
# files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy"]

# Mode 2: Practical enhanced mode (train + dev + test)
files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy", "lv_lvtb-ud-test.spacy"] # more is always better, though here the return is mininal.

# --------------- Generate lemma lookup ---------------
lemma_dict = {}
nlp_blank = spacy.blank("lv")

for file_name in files:
    docbin = DocBin().from_disk(corpus_dir / file_name)
    for doc in docbin.get_docs(nlp_blank.vocab):
        for token in doc:
            if token.lemma_:
                lemma_dict[token.text.lower()] = token.lemma_

lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_dict)
lookups.to_disk(lookups_path)

print(f"✅ Lemma lookup table generated, mode: {files}, saved at: {lookups_path}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Lemma lookup table generated, mode: ['lv_lvtb-ud-train.spacy', 'lv_lvtb-ud-dev.spacy', 'lv_lvtb-ud-test.spacy'], saved at: lookups_lv


In [6]:
# ==================================
# Cell 5：Train spaCy tok2vec model
# ==================================
!python -m spacy train ./config/config_spacy_cnn.cfg \
    --output ./models/lv_spacy_cnn\
    --paths.train ./corpus/lv_lvtb-ud-train.spacy \
    --paths.dev ./corpus/lv_lvtb-ud-dev.spacy \
    --gpu-id 0


[38;5;4mℹ Saving to output directory: models/lv_spacy_cnn[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  ------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0          0.00       202.94         202.86       447.71       112.50    18.53    31.64      21.10     8.67     6.82     0.01     0.19     0.02    0.13
  0     200       3293.58     23134.78       21805.76     33311.49      2844.51    51.95    78.06      58.25    50.29    33.18    33.65    51.49    40.70    0.50
  0     400       6442.76     16249.12       14510.06     28987.42       258.41    65.21  

In [7]:
# ========================
# Cell 6: Evaluation
# ========================
!python -m spacy evaluate ./models/lv_spacy_cnn//model-best ./corpus/lv_lvtb-ud-test.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      86.70
POS      94.73
MORPH    90.53
UAS      80.69
LAS      75.15
SENT P   96.88
SENT R   95.83
SENT F   96.35
SPEED    24312

[1m

               P       R       F
ExtPos     92.80   83.45   87.88
Case       92.00   91.45   91.73
Gender     93.44   92.83   93.13
Number     93.95   92.88   93.41
Person     94.69   93.90   94.30
PronType   98.31   97.83   98.07
Evident    94.06   93.35   93.71
Mood       93.27   92.49   92.88
Polarity   93.84   92.95   93.39
Tense      90.12   88.61   89.36
VerbForm   92.46   91.57   92.01
Voice      93.24   91.63   92.43
Definite   90.28   87.61   88.93
Degree     92.86   90.60   91.71
Poss       99.45   99.45   99.45
NumType    98.28   76.91   86.29
Reflex     94.10   92.69   93.39
Aspect     87.35   83.99   85.63
Foreign    77.78   44.21   56.38
Typo        0.00    0.00    0.00
Abbr       92.97   78.54   85.15

[1m

                    P       R       F
mark            83.25   84.56   83.90


In [8]:
# =================================================================================
# Cell 7：Add Lemmatizer (lookup) to model + Copy LICENSE, LICENSE_SOURCES & README
# =================================================================================
import spacy
from spacy.lookups import Lookups
from pathlib import Path
import shutil

trained_model_path = "./models/lv_spacy_cnn/model-best"
final_model_path = Path("./models/lv_spacy_cnn/model_lv_cnn")
lookups_path = "./lookups_lv"

# Load trained model
nlp = spacy.load(trained_model_path)

# Add lookups
lookups = Lookups().from_disk(lookups_path)

# Add lemmatizer to pipeline
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}, last=True)
lemmatizer.lookups = lookups  # assign properties directly here

# Save new model with lemmatizer components
nlp.to_disk(final_model_path)
print(f"Model saved to: {final_model_path}, with lemmatizer + lookups")

# Copy LICENSE.txt into the final model path, rename to LICENSE
license_src = Path("./LICENSE.txt")   # 项目根目录下的 LICENSE.txt
license_dst = final_model_path / "LICENSE"  # 注意：没有后缀
if license_src.exists():
    shutil.copy(license_src, license_dst)
    print(f"LICENSE copied to: {license_dst}")
else:
    print("⚠️ WARNING: LICENSE.txt not found in project root!")

# Copy LICENSES_SOURCES.txt into the final model path, rename to LICENSES_SOURCES
licenses_sources_src = Path("./LICENSES_SOURCES.txt")
licenses_sources_dst = final_model_path / "LICENSES_SOURCES"
if licenses_sources_src.exists():
    shutil.copy(licenses_sources_src, licenses_sources_dst)
    print(f"LICENSES_SOURCES copied to: {licenses_sources_dst}")
else:
    print("⚠️ WARNING: LICENSES_SOURCES.txt not found in project root!")

# Copy README.md into the final model path
readme_src = Path("./README.md")
readme_dst = final_model_path / "README.md"
if readme_src.exists():
    shutil.copy(readme_src, readme_dst)
    print(f"README.md copied to: {readme_dst}")
else:
    print("⚠️ WARNING: README.md not found in project root!")

Model saved to: models/lv_spacy_cnn/model_lv_cnn, with lemmatizer + lookups
LICENSE copied to: models/lv_spacy_cnn/model_lv_cnn/LICENSE
LICENSES_SOURCES copied to: models/lv_spacy_cnn/model_lv_cnn/LICENSES_SOURCES
README.md copied to: models/lv_spacy_cnn/model_lv_cnn/README.md


In [9]:
# =======================
# Cell 8：Packaging
# =======================

from spacy.cli.package import package
from pathlib import Path
import os

project_root = Path(".").resolve()
os.environ["PYTHONPATH"] = f"{project_root}:{os.environ.get('PYTHONPATH','')}"

# Packaging
package(
    input_dir=Path(final_model_path),
    output_dir=Path(package_output_dir),
    name="spacy_cnn",
    version="1.0.0",
    force=True
)

print(f"Finished, packaged model can be found here: {package_output_dir}")

[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for sdist...[0m
running egg_info
creating lv_spacy_cnn.egg-info
writing lv_spacy_cnn.egg-info/PKG-INFO
writing dependency_links to lv_spacy_cnn.egg-info/dependency_links.txt
writing entry points to lv_spacy_cnn.egg-info/entry_points.txt
writing top-level names to lv_spacy_cnn.egg-info/top_level.txt
writing manifest file 'lv_spacy_cnn.egg-info/SOURCES.txt'
reading manifest file 'lv_spacy_cnn.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
adding license file 'LICENSE'
adding license file 'LICENSES_SOURCES'
writing manifest file 'lv_spacy_cnn.egg-info/SOURCES.txt'
[1m* Building sdist...[0m
running sdist
running egg_info
writing lv_spacy_cnn.egg-info/PKG-INFO
writing dependency_links to lv_spacy_cnn.egg-info/dependency_links.txt
writing entry points to lv_spacy_cnn.egg-info/entry_points.txt
writing top-level 

In [10]:
# ===========================
# Cell 9：Build wheel + sdist
# ===========================
import subprocess
from pathlib import Path

package_output_dir = Path("./packages/lv_spacy_cnn-1.0.0")

# build wheel and sdist
subprocess.run(
    ["python", "-m", "build", "--wheel", "--sdist"],
    cwd=str(package_output_dir)
)

print(f"wheel + sdist are built, dir: {package_output_dir / 'dist'}")


[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for wheel...[0m
running egg_info
writing lv_spacy_cnn.egg-info/PKG-INFO
writing dependency_links to lv_spacy_cnn.egg-info/dependency_links.txt
writing entry points to lv_spacy_cnn.egg-info/entry_points.txt
writing top-level names to lv_spacy_cnn.egg-info/top_level.txt
reading manifest file 'lv_spacy_cnn.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
adding license file 'LICENSE'
adding license file 'LICENSES_SOURCES'
writing manifest file 'lv_spacy_cnn.egg-info/SOURCES.txt'
[1m* Building wheel...[0m
running bdist_wheel
running build
running build_py
creating build/lib/lv_spacy_cnn
copying lv_spacy_cnn/__init__.py -> build/lib/lv_spacy_cnn
creating build/lib/lv_spacy_cnn/lv_spacy_cnn-1.0.0
copying lv_spacy_cnn/lv_spacy_cnn-1.0.0/meta.json -> build/lib/lv_spacy_cnn/lv_spacy_cnn-1.0.0
copying lv_spacy_cnn/lv

In [11]:
# ==============================
# Cell 10A: install with wheel
# ==============================
import subprocess
import spacy
import pandas as pd


# install with wheel
subprocess.run([
    "pip", 
    "install", 
    "./packages/lv_spacy_cnn-1.0.0/dist/lv_spacy_cnn-1.0.0-py3-none-any.whl"
])

# load model
nlp_xlmr = spacy.load("lv_spacy_cnn")

print("lv_spacy_cnn Pipeline components:", nlp_xlmr.pipe_names)

Processing ./packages/lv_spacy_cnn-1.0.0/dist/lv_spacy_cnn-1.0.0-py3-none-any.whl
Installing collected packages: lv-spacy-cnn
Successfully installed lv-spacy-cnn-1.0.0
lv_spacy_cnn Pipeline components: ['tok2vec', 'tagger', 'morphologizer', 'parser', 'senter', 'lemmatizer']


In [12]:
# ===============================
# Cell 10B: install with 'tar.gz'
# ===============================
import subprocess
import spacy
import pandas as pd

# install with '.tar.gz'
subprocess.run(["pip", "install", "./packages/lv_spacy_cnn-1.0.0/dist/lv_spacy_cnn-1.0.0.tar.gz"])
nlp_xlmr = spacy.load("lv_spacy_cnn")

print("lv_spacy_cnn Pipeline components:", nlp_xlmr.pipe_names)

Processing ./packages/lv_spacy_cnn-1.0.0/dist/lv_spacy_cnn-1.0.0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: lv_spacy_cnn
  Building wheel for lv_spacy_cnn (setup.py): started


[33m  DEPRECATION: Building 'lv_spacy_cnn' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'lv_spacy_cnn'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

  Building wheel for lv_spacy_cnn (setup.py): finished with status 'done'
  Created wheel for lv_spacy_cnn: filename=lv_spacy_cnn-1.0.0-py3-none-any.whl size=9515398 sha256=5595c78f439115a8d0a34230fcf38ca7ccc8b91c849e2007a425e5abe562ba82
  Stored in directory: /home/jesse/.cache/pip/wheels/ce/6f/81/d15190767e54e54bddab684ebf05fb7278a2655603fc116368
Successfully built lv_spacy_cnn
Installing collected packages: lv_spacy_cnn
  Attempting uninstall: lv_spacy_cnn
    Found existing installation: lv_spacy_cnn 1.0.0
    Uninstalling lv_spacy_cnn-1.0.0:
      Successfully uninstalled lv_spacy_cnn-1.0.0
Successfully installed lv_spacy_cnn-1.0.0
lv_spacy_cnn Pipeline components: ['tok2vec', 'tagger', 'morphologizer', 'parser', 'senter', 'lemmatizer']


In [13]:
# ======================
# Cell 11A: Demo Testing
# ======================

import spacy
import numpy as np

# Load the pipeline
nlp = spacy.load("lv_spacy_cnn")

# Example text
text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā."""

# Process text
doc = nlp(text)

# ------------------------
# Tokenization 
# ------------------------
print("Tokens:")
print([token.text for token in doc])

# ------------------------
# Lemmatization / 词形还原
# ------------------------
print("Lemmas:")
print([token.lemma_ for token in doc])

# ------------------------
# Part-of-Speech Tagging
# ------------------------
print("POS tags:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")


# ------------------------
# Morphological Features 
# ------------------------
print("Morphological features:")
for token in doc:
    print(f"{token.text}: {token.morph}")


# ------------------------
# Dependency Parsing
# ------------------------
print("Dependency parsing: ")
for token in doc:
    print(f"{token.text} <--{token.dep_}-- {token.head.text}")


# ------------------------
# Sentence Segmentation 
# ------------------------
print("Sentences :")
for sent in doc.sents:
    print(sent.text)

# ------------------------
# Pipeline components
# ------------------------
print("Pipeline components:")
print(nlp.pipe_names)

# Tok2Vec 
vectors = np.vstack([token.vector for token in doc])
print("Token vectors shape:", vectors.shape)


Tokens:
['Baltijas', 'jūras', 'nosaukums', 'ir', 'devis', 'nosaukumu', 'baltu', 'valodām', 'un', 'Baltijas', 'valstīm', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietoja', 'vācu', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimtā', '.']
Lemmas:
['Baltijas', 'jūra', 'nosaukums', 'būt', 'dot', 'nosaukums', 'balts', 'valoda', 'un', 'Baltijas', 'valsts', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietot', 'vāci', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimts', '.']
POS tags:
Baltijas: PROPN (npfsg4)
jūras: NOUN (ncfsg4)
nosaukums: NOUN (ncmsn1)
ir: AUX (vcnipii30an)
devis: VERB (vmnpdmsnasnpn)
nosaukumu: NOUN (ncmsa1)
baltu: NOUN (ncmpg1)
valodām: NOUN (ncfpd4)
un: CCONJ (cc)
Baltijas: PROPN (npfsg4)
valstīm: NOUN (ncfpd6)
.: PUNCT (zs)

: VERB (vmnist21san)
Terminu: X (xf)
": PUNCT (zq)
Baltijas: PROPN (npfsg4)
jūra: NOUN (ncfsn4)
": PUNCT (zq)
(: PUNCT (zb)
Mare: X (xf)


In [14]:
# ===================================================
# Cell 11B: Testing model, espically Lemma and senter
# ===================================================
import spacy
import pandas as pd

nlp = spacy.load("lv_spacy_cnn")

text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm. Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā. Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji. Tomēr par šī vārda izcelsmi precīzu ziņu nav. Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu. Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi. Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’. Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā. Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168. m. ē.) izveidotajā Austrumeiropas kartē Baltijas jūra nosaukta par "Sarmatu jūru" (MARE SARMATICVM). Citos avotos minēts nosaukums OCEANUS SARMATICUS. Sarmati bija sena klejotāju tauta, kura runājusi indoirāņu valodā un līdz 4. gadsimtam dzīvoja Austrumeiropā. Tacits to dēvēja par "Svēbu jūru" (Mare Suebicum). Vēsturē pazīstami arī citu tautu dotie Baltijas jūras nosaukumi: "Varjagu jūra", "Barbaru jūra",[2] "Vendu jūra". Tā kādā 946. gada dokumentā to dēvēja par "Rūgu jūru" (no rūgu vārda cēlies tagadējās Rīgenes salas nosaukums),[3] bet Nestora hronikas ievadā Baltijas jūra nosaukta par Varjagu jūru.[4][5] Štumpfa (Stumpf) Eiropas kartē,[6] kas izdota Cīrihē, 1548. gadā (pārkopēta no vecākas Sebastiana Minstera veidotas kartes), Baltijas jūra tiek saukta par "Vācu jūru" (Das Deutsche Meer). Arī vecākā 1450. gadā Fra Mauro zīmētajā pasaules kartē redzams nosaukums "Ģermāņu jūra" vai pat okeāns, ko parasti piedēvē Ziemeļjūrai, tomēr minētajā kartē tā nepārprotami ir arī Baltijas jūra.[7] Latvijas piekrastes iedzīvotāji mēdza atklāto jūru dēvēt par "Dižjūru", bet Rīgas līci — par "Mazo jūru" (Mazjūru). Viduslaikos Rīgas līci latīniski dēvēja par "Līvu jūru" (Mare Livonicum). Senākajās Eiropas kartēs par jūrām tika saukti arī lielākie Baltijas jūras līči. Piemēram, Mare Finonicum sive Sinus Venedicus — tagadējais Somu jūras līcis."""

doc = nlp(text)

# Generate token table
rows = []
for sent in doc.sents:
    for token in sent:
        rows.append({
            "Text": token.text,
            "Lemma": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Head": token.head.text
        })

df = pd.DataFrame(rows)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
display(df)  

# Sentence segmentation
print("\nSentence Segmentation")
for i, sent in enumerate(doc.sents, 1):
    print(f"Sentence {i}: {sent.text}")

Unnamed: 0,Text,Lemma,POS,Dependency,Head
0,Baltijas,Baltijas,PROPN,nmod,jūras
1,jūras,jūra,NOUN,nmod,nosaukums
2,nosaukums,nosaukums,NOUN,nsubj,devis
3,ir,būt,AUX,aux,devis
4,devis,dot,VERB,ROOT,devis
5,nosaukumu,nosaukums,NOUN,obj,devis
6,baltu,balts,NOUN,nmod,valodām
7,valodām,valoda,NOUN,iobj,devis
8,un,un,CCONJ,cc,valstīm
9,Baltijas,Baltijas,PROPN,nmod,valstīm



Sentence Segmentation
Sentence 1: Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Sentence 2: Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā.
Sentence 3: Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji.
Sentence 4: Tomēr par šī vārda izcelsmi precīzu ziņu nav.
Sentence 5: Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu.
Sentence 6: Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi.
Sentence 7: Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’.
Sentence 8: Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā.
Sentence 9: Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168.
Sen

In [15]:
# =======================================
# Cell 12: Compare with other pipelines
# =======================================
import spacy
import stanza
import spacy_udpipe
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.scorer import Scorer
import pandas as pd
      
# Load the pipeline
nlp_spacy = spacy.load("lv_spacy_cnn")
# ---------- 2. Load gold test set using the model's vocab ----------
# IMPORTANT: use nlp_spacy.vocab so gold docs share the same StringStore as predictions
doc_bin = DocBin().from_disk("test/lv_lvtb-ud-test.spacy")
gold_docs = list(doc_bin.get_docs(nlp_spacy.vocab))

# quick sanity check
print(f"Loaded {len(gold_docs)} gold docs. Example first text:\n{gold_docs[0].text[:200]}\n")

# ---------- 3. Evaluate spaCy model (using Scorer) ----------
# Produce spaCy predictions (these use nlp_spacy.vocab too)
pred_docs_spacy = [nlp_spacy(d.text) for d in gold_docs]
examples_spacy = [Example(pred, gold) for pred, gold in zip(pred_docs_spacy, gold_docs)]
scorer_spacy = Scorer()
spacy_scores = scorer_spacy.score(examples_spacy)

# ---------- 4. Stanza model: build predicted docs on the SAME vocab and evaluate ----------
stanza.download("lv", processors=None)  # will reuse cached; safe to call
nlp_stanza = stanza.Pipeline("lv", processors="tokenize,pos,lemma,depparse", use_gpu=True)

examples_stanza = []
stanza_lemma_preds = []  # keep lemmas for lemma-accuracy calc
for gold in gold_docs:
    stanza_doc = nlp_stanza(gold.text)
    words = [w.text for s in stanza_doc.sentences for w in s.words]
    # create predicted Doc using the SAME vocab
    pred_doc = spacy.tokens.Doc(nlp_spacy.vocab, words=words)
    # copy annotations from stanza into pred_doc
    stanza_tokens = [w for s in stanza_doc.sentences for w in s.words]
    for token, w in zip(pred_doc, stanza_tokens):
        token.pos_ = w.upos
        token.tag_ = w.xpos if w.xpos else w.upos
        token.lemma_ = w.lemma
        token.set_morph(w.feats if w.feats else "")
        token.dep_ = w.deprel
        # head is index-based in stanza; map to pred_doc tokens
        token.head = pred_doc[w.head - 1] if w.head > 0 else token
    examples_stanza.append(Example(pred_doc, gold))
    stanza_lemma_preds.append([t.lemma_ for t in pred_doc])

scorer_stanza = Scorer()
stanza_scores = scorer_stanza.score(examples_stanza)

# ---------- 5. UDPipe model: build predicted docs on the SAME vocab and evaluate ----------
"""
The UDPipe model 'latvian-lv0ud-2.5-191206.udpipe' can be downloaded from:
https://lindat.mff.cuni.cz/repository/items/41f05304-629f-4313-b9cf-9eeb0a2ca7c6

Please download the model and place it under the '/test/' directory if you wish to run evaluations.
For better comparison, you may also check for newer versions of the model.
"""

udpipe_path = "test/latvian-lvtb-ud-2.5-191206.udpipe"
nlp_udpipe = spacy_udpipe.load_from_path(lang="lv", path=udpipe_path)

examples_udpipe = []
udpipe_lemma_preds = []
for gold in gold_docs:
    udpipe_doc = nlp_udpipe(gold.text)
    words = [t.text for t in udpipe_doc]
    pred_doc = spacy.tokens.Doc(nlp_spacy.vocab, words=words)
    for token, t in zip(pred_doc, udpipe_doc):
        token.pos_ = t.pos_
        token.tag_ = t.tag_ if t.tag_ else t.pos_
        token.lemma_ = t.lemma_
        token.set_morph("")  # UDPipe token may not expose FEATS via spacy_udpipe
        token.dep_ = t.dep_
        token.head = pred_doc[t.head - 1] if t.head > 0 else token
    examples_udpipe.append(Example(pred_doc, gold))
    udpipe_lemma_preds.append([t.lemma_ for t in pred_doc])

scorer_udpipe = Scorer()
udpipe_scores = scorer_udpipe.score(examples_udpipe)

# ---------- 6. Prepare spaCy lemma preds for comparison ----------
spacy_lemma_preds = [[token.lemma_ for token in pred] for pred in pred_docs_spacy]

# ---------- 7. Helper functions ----------
def get_val(d, key):
    """Return value for key or NaN if missing."""
    return d.get(key, float("nan"))

def lemma_accuracy(preds, golds):
    """Token-level lemma accuracy (simple 1:1 token alignment)."""
    total, correct = 0, 0
    for p_seq, g_seq in zip(preds, golds):
        for p, g in zip(p_seq, g_seq):
            total += 1
            if p == g:
                correct += 1
    return correct / total if total > 0 else float("nan")

# extract gold lemmas from gold_docs
gold_lemmas = [[token.lemma_ for token in doc] for doc in gold_docs]

# ---------- 8. Compile results ----------
df = pd.DataFrame([
    {
        "Model": "spaCy (lv_spacy_cnn)",
        "POS": get_val(spacy_scores, "pos_acc"),
        "Tag": get_val(spacy_scores, "tag_acc"),
        "Morph": get_val(spacy_scores, "morph_acc"),
        "UAS": get_val(spacy_scores, "dep_uas"),
        "LAS": get_val(spacy_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(spacy_lemma_preds, gold_lemmas)
    },
    {
        "Model": "Stanza (lv)",
        "POS": get_val(stanza_scores, "pos_acc"),
        "Tag": get_val(stanza_scores, "tag_acc"),
        "Morph": get_val(stanza_scores, "morph_acc"),
        "UAS": get_val(stanza_scores, "dep_uas"),
        "LAS": get_val(stanza_scores, "dep_dlas") if "dep_dlas" in stanza_scores else get_val(stanza_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(stanza_lemma_preds, gold_lemmas)
    },
    {
        "Model": "UDPipe (lv)",
        "POS": get_val(udpipe_scores, "pos_acc"),
        "Tag": get_val(udpipe_scores, "tag_acc"),
        "Morph": get_val(udpipe_scores, "morph_acc"),
        "UAS": get_val(udpipe_scores, "dep_uas"),
        "LAS": get_val(udpipe_scores, "dep_dlas") if "dep_dlas" in udpipe_scores else get_val(udpipe_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(udpipe_lemma_preds, gold_lemmas)
    }
])

# ---------- 9. Print results ----------
pd.set_option("display.precision", 4)
print(df)

Loaded 2396 gold docs. Example first text:
Lai arī viņš tiešām piedzīvoja traģisku galu un viņš savas gleznas nevarēja pārdot, ir jāatzīmē, ka lielāku savas dzīves daļu viņš pavadīja kā mākslas skolotājs un gleznu tirgotājs. 



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 434kB [00:00, 6.10MB/s]                    
2025-09-25 21:24:20 INFO: Downloaded file to /home/jesse/stanza_resources/resources.json
2025-09-25 21:24:20 INFO: Downloading default packages for language: lv (Latvian) ...
2025-09-25 21:24:21 INFO: File exists: /home/jesse/stanza_resources/lv/default.zip
2025-09-25 21:24:21 INFO: Finished downloading models and saved to /home/jesse/stanza_resources
2025-09-25 21:24:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 434kB [00:00, 1.93MB/s]                    
2025-09-25 21:24:22 INFO: Downloaded file to /home/jesse/stanza_resources/resources.json
2025-09-25 21:24:23 INFO: Loading these models fo

                  Model     POS     Tag   Morph     UAS     LAS  Lemma Acc
0  spaCy (lv_spacy_cnn)  0.9457  0.8665  0.9046  0.8076  0.7509     0.8203
1           Stanza (lv)  0.9688  0.8987  0.9449  0.8791  0.8354     0.9539
2           UDPipe (lv)  0.9207  0.7960  0.3403  0.0791  0.0660     0.8911


In [17]:
# =====================================================
# Cell 13: Test downloading model from Huggingface hub
# =====================================================
import spacy
import stanza
import spacy_udpipe
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.scorer import Scorer
import pandas as pd
from huggingface_hub import snapshot_download
      
# Load the pipeline
model_dir = snapshot_download(repo_id="JesseHuang922/lv_spacy_cnn", repo_type="model")
nlp_spacy = spacy.load(model_dir)
# ---------- 2. Load gold test set using the model's vocab ----------
# IMPORTANT: use nlp_spacy.vocab so gold docs share the same StringStore as predictions
doc_bin = DocBin().from_disk("test/lv_lvtb-ud-test.spacy")
gold_docs = list(doc_bin.get_docs(nlp_spacy.vocab))

# quick sanity check
print(f"Loaded {len(gold_docs)} gold docs. Example first text:\n{gold_docs[0].text[:200]}\n")

# ---------- 3. Evaluate spaCy model (using Scorer) ----------
# Produce spaCy predictions (these use nlp_spacy.vocab too)
pred_docs_spacy = [nlp_spacy(d.text) for d in gold_docs]
examples_spacy = [Example(pred, gold) for pred, gold in zip(pred_docs_spacy, gold_docs)]
scorer_spacy = Scorer()
spacy_scores = scorer_spacy.score(examples_spacy)

# ---------- 4. Stanza model: build predicted docs on the SAME vocab and evaluate ----------
stanza.download("lv", processors=None)  # will reuse cached; safe to call
nlp_stanza = stanza.Pipeline("lv", processors="tokenize,pos,lemma,depparse", use_gpu=True)

examples_stanza = []
stanza_lemma_preds = []  # keep lemmas for lemma-accuracy calc
for gold in gold_docs:
    stanza_doc = nlp_stanza(gold.text)
    words = [w.text for s in stanza_doc.sentences for w in s.words]
    # create predicted Doc using the SAME vocab
    pred_doc = spacy.tokens.Doc(nlp_spacy.vocab, words=words)
    # copy annotations from stanza into pred_doc
    stanza_tokens = [w for s in stanza_doc.sentences for w in s.words]
    for token, w in zip(pred_doc, stanza_tokens):
        token.pos_ = w.upos
        token.tag_ = w.xpos if w.xpos else w.upos
        token.lemma_ = w.lemma
        token.set_morph(w.feats if w.feats else "")
        token.dep_ = w.deprel
        # head is index-based in stanza; map to pred_doc tokens
        token.head = pred_doc[w.head - 1] if w.head > 0 else token
    examples_stanza.append(Example(pred_doc, gold))
    stanza_lemma_preds.append([t.lemma_ for t in pred_doc])

scorer_stanza = Scorer()
stanza_scores = scorer_stanza.score(examples_stanza)

# ---------- 5. UDPipe model: build predicted docs on the SAME vocab and evaluate ----------
"""
The UDPipe model 'latvian-lv0ud-2.5-191206.udpipe' can be downloaded from:
https://lindat.mff.cuni.cz/repository/items/41f05304-629f-4313-b9cf-9eeb0a2ca7c6

Please download the model and place it under the '/test/' directory if you wish to run evaluations.
For better comparison, you may also check for newer versions of the model.
"""

udpipe_path = "test/latvian-lvtb-ud-2.5-191206.udpipe"
nlp_udpipe = spacy_udpipe.load_from_path(lang="lv", path=udpipe_path)

examples_udpipe = []
udpipe_lemma_preds = []
for gold in gold_docs:
    udpipe_doc = nlp_udpipe(gold.text)
    words = [t.text for t in udpipe_doc]
    pred_doc = spacy.tokens.Doc(nlp_spacy.vocab, words=words)
    for token, t in zip(pred_doc, udpipe_doc):
        token.pos_ = t.pos_
        token.tag_ = t.tag_ if t.tag_ else t.pos_
        token.lemma_ = t.lemma_
        token.set_morph("")  # UDPipe token may not expose FEATS via spacy_udpipe
        token.dep_ = t.dep_
        token.head = pred_doc[t.head - 1] if t.head > 0 else token
    examples_udpipe.append(Example(pred_doc, gold))
    udpipe_lemma_preds.append([t.lemma_ for t in pred_doc])

scorer_udpipe = Scorer()
udpipe_scores = scorer_udpipe.score(examples_udpipe)

# ---------- 6. Prepare spaCy lemma preds for comparison ----------
spacy_lemma_preds = [[token.lemma_ for token in pred] for pred in pred_docs_spacy]

# ---------- 7. Helper functions ----------
def get_val(d, key):
    """Return value for key or NaN if missing."""
    return d.get(key, float("nan"))

def lemma_accuracy(preds, golds):
    """Token-level lemma accuracy (simple 1:1 token alignment)."""
    total, correct = 0, 0
    for p_seq, g_seq in zip(preds, golds):
        for p, g in zip(p_seq, g_seq):
            total += 1
            if p == g:
                correct += 1
    return correct / total if total > 0 else float("nan")

# extract gold lemmas from gold_docs
gold_lemmas = [[token.lemma_ for token in doc] for doc in gold_docs]

# ---------- 8. Compile results ----------
df = pd.DataFrame([
    {
        "Model": "spaCy (lv_roberta_large)",
        "POS": get_val(spacy_scores, "pos_acc"),
        "Tag": get_val(spacy_scores, "tag_acc"),
        "Morph": get_val(spacy_scores, "morph_acc"),
        "UAS": get_val(spacy_scores, "dep_uas"),
        "LAS": get_val(spacy_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(spacy_lemma_preds, gold_lemmas)
    },
    {
        "Model": "Stanza (lv)",
        "POS": get_val(stanza_scores, "pos_acc"),
        "Tag": get_val(stanza_scores, "tag_acc"),
        "Morph": get_val(stanza_scores, "morph_acc"),
        "UAS": get_val(stanza_scores, "dep_uas"),
        "LAS": get_val(stanza_scores, "dep_dlas") if "dep_dlas" in stanza_scores else get_val(stanza_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(stanza_lemma_preds, gold_lemmas)
    },
    {
        "Model": "UDPipe (lv)",
        "POS": get_val(udpipe_scores, "pos_acc"),
        "Tag": get_val(udpipe_scores, "tag_acc"),
        "Morph": get_val(udpipe_scores, "morph_acc"),
        "UAS": get_val(udpipe_scores, "dep_uas"),
        "LAS": get_val(udpipe_scores, "dep_dlas") if "dep_dlas" in udpipe_scores else get_val(udpipe_scores, "dep_las"),
        "Lemma Acc": lemma_accuracy(udpipe_lemma_preds, gold_lemmas)
    }
])

# ---------- 9. Print results ----------
pd.set_option("display.precision", 4)
print(df)

Fetching 24 files: 100%|██████████| 24/24 [00:04<00:00,  4.92it/s]


Loaded 2396 gold docs. Example first text:
Lai arī viņš tiešām piedzīvoja traģisku galu un viņš savas gleznas nevarēja pārdot, ir jāatzīmē, ka lielāku savas dzīves daļu viņš pavadīja kā mākslas skolotājs un gleznu tirgotājs. 



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 434kB [00:00, 8.29MB/s]                    
2025-09-25 21:55:10 INFO: Downloaded file to /home/jesse/stanza_resources/resources.json
2025-09-25 21:55:10 INFO: Downloading default packages for language: lv (Latvian) ...
2025-09-25 21:55:10 INFO: File exists: /home/jesse/stanza_resources/lv/default.zip
2025-09-25 21:55:11 INFO: Finished downloading models and saved to /home/jesse/stanza_resources
2025-09-25 21:55:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 434kB [00:00, 4.25MB/s]                    
2025-09-25 21:55:12 INFO: Downloaded file to /home/jesse/stanza_resources/resources.json
2025-09-25 21:55:12 INFO: Loading these models fo

                      Model     POS     Tag   Morph     UAS     LAS  Lemma Acc
0  spaCy (lv_roberta_large)  0.9457  0.8665  0.9046  0.8076  0.7509     0.8203
1               Stanza (lv)  0.9688  0.8987  0.9449  0.8791  0.8354     0.9539
2               UDPipe (lv)  0.9207  0.7960  0.3403  0.0791  0.0660     0.8911
