### You can also download the model from Hugging Face

Hugging Face Model Repo:
https://huggingface.co/JesseHuang922/lv_spaCy_CNN



In [1]:
# ==============================
# Cell 1：Import and directories
# ==============================
from pathlib import Path
import os
import spacy
from spacy.lookups import Lookups
from spacy.tokens import DocBin
from spacy.cli.package import package

# Create root directory
project_root = Path(".").resolve()

# Project structure
models_dir = project_root / "models"
model_name = "lv_spacy_cnn"
trained_model_path = models_dir / model_name / "model-best"
final_model_path = models_dir / model_name / "model_lv_cnn"
lookups_path = project_root / "lookups_lv"
package_output_dir = project_root / "packages"
config_path = project_root / "config" / "config_spacy_cnn_5.cfg"

# Create directories
for p in [models_dir, models_dir / model_name, package_output_dir, lookups_path, project_root / "config"]:
    p.mkdir(parents=True, exist_ok=True)

print("Project structure has been set!")



Project structure has been set!


In [2]:
# ===================================================================================================================
# Cell 2：Initializing config (if you add -gpu, the backbone would be changed to transformer based pre-trained model)
# ===================================================================================================================
!python -m spacy init config ./config/config_spacy_cnn_5.cfg \
    --lang lv \
    --pipeline tok2vec,tagger,morphologizer,parser,senter \
    --optimize efficiency



[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_spacy_cnn_5.cfg
You can now add your data and train your pipeline:
python -m spacy train config_spacy_cnn_5.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
# ==========================
# Cell 3：Modify config
# ==========================

# corpus dir
subset = "5pct"  
corpus_dir = Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")

# Read config file
cfg_text = config_path.read_text(encoding="utf-8")

# Replace the training/validation set path
cfg_text = cfg_text.replace("train = null", f"train = {corpus_dir}/lv_lvtb-ud-train-{subset}.spacy")
cfg_text = cfg_text.replace("dev = null", f"dev = {corpus_dir}/lv_lvtb-ud-dev.spacy")

# # Turn on Mixed Precision
cfg_text = cfg_text.replace("mixed_precision = false", "mixed_precision = true")

config_path.write_text(cfg_text, encoding="utf-8")
print("config updated!")




config updated!


In [4]:
# ===================================
# Cell 4：Generate lemma lookup table
# ===================================
from spacy.tokens import DocBin
from spacy.lookups import Lookups
import spacy
from pathlib import Path

# Paths
corpus_dir= Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")
lookups_path = Path("lookups_lv")

# --------------- Choose generation mode ---------------
# Mode 1: Strict evaluation mode (train + dev only)
# files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy"]

# Mode 2: Practical enhanced mode (train + dev + test)
files = ["lv_lvtb-ud-train-5pct.spacy"]

# --------------- Generate lemma lookup ---------------
lemma_dict = {}
nlp_blank = spacy.blank("lv")

for file_name in files:
    docbin = DocBin().from_disk(corpus_dir / file_name)
    for doc in docbin.get_docs(nlp_blank.vocab):
        for token in doc:
            if token.lemma_:
                lemma_dict[token.text.lower()] = token.lemma_

lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_dict)
lookups.to_disk(lookups_path)

print(f"✅ Lemma lookup table generated, mode: {files}, saved at: {lookups_path}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Lemma lookup table generated, mode: ['lv_lvtb-ud-train-5pct.spacy'], saved at: lookups_lv


In [5]:
# ==================================
# Cell 5：Train spaCy tok2vec model
# ==================================
!python -m spacy train ./config/config_spacy_cnn_5.cfg \
    --output ./models/lv_spacy_cnn \
    --gpu-id 0


[38;5;4mℹ Saving to output directory: models/lv_spacy_cnn[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser',
'senter'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  ------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  0       0          0.00       115.36         115.29       123.06        64.00    17.08    28.73      17.82     6.77     6.67     0.02     0.28     0.03    0.12
  2     200       2887.97     18482.44       17333.68     26263.98      2158.34    47.77    72.97      54.67    48.58    29.71    67.88    72.70    70.21    0.55
  5     400       5193.94     10690.48        8610.05     18910.60       128.04    58.71  

In [6]:
# ========================
# Cell 6: Evaluation (parametric)
# ========================
model_path = f"./models/lv_spacy_cnn/model-best"
test_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

!python -m spacy evaluate {model_path} {test_path} --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      65.93
POS      81.80
MORPH    72.13
UAS      57.11
LAS      43.04
SENT P   73.03
SENT R   84.99
SENT F   78.56
SPEED    14014

[1m

               P       R       F
ExtPos     80.95   60.71   69.39
Case       71.21   69.40   70.30
Gender     77.62   75.43   76.50
Number     79.15   76.20   77.64
Person     78.71   79.35   79.03
PronType   92.22   88.79   90.47
Evident    74.09   75.04   74.56
Mood       73.39   73.21   73.30
Polarity   77.72   74.38   76.01
Tense      69.53   64.88   67.12
VerbForm   72.19   69.23   70.68
Voice      74.41   68.92   71.56
Definite   65.36   49.13   56.09
Degree     71.19   58.09   63.98
Poss       96.27   69.35   80.62
NumType    76.90   48.21   59.27
Reflex     70.67   75.21   72.87
Aspect     67.49   51.13   58.19
Foreign     6.25    0.53    0.97
Typo        0.00    0.00    0.00
Abbr       97.06   15.07   26.09

[1m

                   P       R       F
mark           55.05   58.66   56.80
fi

In [7]:
# =================================================================================
# Cell 7：Add Lemmatizer (lookup) to model
# =================================================================================
import spacy
from spacy.lookups import Lookups
from pathlib import Path

trained_model_path = "./models/lv_spacy_cnn/model-best"
final_model_path = Path("./models/lv_spacy_cnn/model_lv_cnn")
lookups_path = "./lookups_lv"

# Load trained model
nlp = spacy.load(trained_model_path)

# Add lookups
lookups = Lookups().from_disk(lookups_path)

# Add lemmatizer to pipeline
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}, last=True)
lemmatizer.lookups = lookups  # assign properties directly here

# Save new model with lemmatizer components
nlp.to_disk(final_model_path)
print(f"✅ Model saved to: {final_model_path} with lemmatizer + lookups")


✅ Model saved to: models/lv_spacy_cnn/model_lv_cnn with lemmatizer + lookups


In [8]:
# =======================
# Cell 8：Packaging
# =======================

from spacy.cli.package import package
from pathlib import Path
import os

project_root = Path(".").resolve()
os.environ["PYTHONPATH"] = f"{project_root}:{os.environ.get('PYTHONPATH','')}"

# Packaging
package(
    input_dir=Path(final_model_path),
    output_dir=Path(package_output_dir),
    name="spacy_cnn_5pct",
    version="1.0.0",
    force=True
)

print(f"Finished, packaged model can be found here: {package_output_dir}")

/home/jesse/Projects/myenvs/master_thesis/bin/python: No module named build


running sdist
running egg_info
creating lv_spacy_cnn_5pct.egg-info
writing lv_spacy_cnn_5pct.egg-info/PKG-INFO
writing dependency_links to lv_spacy_cnn_5pct.egg-info/dependency_links.txt
writing entry points to lv_spacy_cnn_5pct.egg-info/entry_points.txt
writing top-level names to lv_spacy_cnn_5pct.egg-info/top_level.txt
writing manifest file 'lv_spacy_cnn_5pct.egg-info/SOURCES.txt'
reading manifest file 'lv_spacy_cnn_5pct.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_spacy_cnn_5pct.egg-info/SOURCES.txt'
running check
creating lv_spacy_cnn_5pct-1.0.0
creating lv_spacy_cnn_5pct-1.0.0/lv_spacy_cnn_5pct
creating lv_spacy_cnn_5pct-1.0.0/lv_spacy_cnn_5pct.egg-info
creating lv_spacy_cnn_5pct-1.0.0/lv_spacy_cnn_5pct/lv_spacy_cnn_5pct-1.0.0
creating lv_spacy_cnn_5pct-1.0.0/lv_spacy_cnn_5pct/lv_spacy_cnn_5pct-1.0.0/lemmatizer/lookups
creating lv_spacy_cnn_5pct-1.0.0/lv_spacy_cnn_5pct/lv_spacy_cnn_5pct-1.0.0/morphologizer
creating lv_spacy_cnn_5pct-1.0.0

In [9]:
# ===========================
# Cell 9：Package info (sdist ready)
# ===========================
from pathlib import Path

package_output_dir = Path("./packages/lv_spacy_cnn_5pct-1.0.0")
dist_dir = package_output_dir / "dist"

print(f"✅ sdist is ready. You can install it with pip from: {dist_dir}")


✅ sdist is ready. You can install it with pip from: packages/lv_spacy_cnn_5pct-1.0.0/dist


In [10]:
# ===============================
# Cell 10B: install with 'tar.gz'
# ===============================
import subprocess
import spacy
import pandas as pd

# install with '.tar.gz'
subprocess.run(["pip", "install", "./packages/lv_spacy_cnn_5pct-1.0.0/dist/lv_spacy_cnn_5pct-1.0.0.tar.gz"])
nlp_xlmr = spacy.load("lv_spacy_cnn_5pct")

print("lv_spacy_cnn Pipeline components:", nlp_xlmr.pipe_names)

Processing ./packages/lv_spacy_cnn_5pct-1.0.0/dist/lv_spacy_cnn_5pct-1.0.0.tar.gz
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: lv_spacy_cnn_5pct
  Building wheel for lv_spacy_cnn_5pct (pyproject.toml): started
  Building wheel for lv_spacy_cnn_5pct (pyproject.toml): finished with status 'done'
  Created wheel for lv_spacy_cnn_5pct: filename=lv_spacy_cnn_5pct-1.0.0-py3-none-any.whl size=8001305 sha256=2d6939ccb439d763d333d65eb170847c63505797758eaad7d8e4f7fa1e27102e
  Stored in directory: /home/jesse/.cache/pip/wheels/5f/f0/95/872a99bf34e33d8c7cd6e350de99b534ddf389e0d4c50c6d82
Successfully built lv_spacy_cnn_5pct
Installing collected packages: lv_spacy_cnn_5pct
Success

In [11]:
# ======================
# Cell 11A: Demo Testing
# ======================

import spacy
import numpy as np

# Load the pipeline
nlp = spacy.load("lv_spacy_cnn_5pct")

# Example text
text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā."""

# Process text
doc = nlp(text)

# ------------------------
# Tokenization 
# ------------------------
print("Tokens:")
print([token.text for token in doc])

# ------------------------
# Lemmatization / 词形还原
# ------------------------
print("Lemmas:")
print([token.lemma_ for token in doc])

# ------------------------
# Part-of-Speech Tagging
# ------------------------
print("POS tags:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")


# ------------------------
# Morphological Features 
# ------------------------
print("Morphological features:")
for token in doc:
    print(f"{token.text}: {token.morph}")


# ------------------------
# Dependency Parsing
# ------------------------
print("Dependency parsing: ")
for token in doc:
    print(f"{token.text} <--{token.dep_}-- {token.head.text}")


# ------------------------
# Sentence Segmentation 
# ------------------------
print("Sentences :")
for sent in doc.sents:
    print(sent.text)

# ------------------------
# Pipeline components
# ------------------------
print("Pipeline components:")
print(nlp.pipe_names)

# Tok2Vec 
vectors = np.vstack([token.vector for token in doc])
print("Token vectors shape:", vectors.shape)


Tokens:
['Baltijas', 'jūras', 'nosaukums', 'ir', 'devis', 'nosaukumu', 'baltu', 'valodām', 'un', 'Baltijas', 'valstīm', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietoja', 'vācu', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimtā', '.']
Lemmas:
['Baltijas', 'jūra', 'nosaukums', 'būt', 'devis', 'nosaukumu', 'balts', 'valodām', 'un', 'Baltijas', 'valstīm', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietoja', 'vāci', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimtā', '.']
POS tags:
Baltijas: PROPN (npfsg4)
jūras: NOUN (ncfsg4)
nosaukums: NOUN (ncmsn1)
ir: AUX (vcnipii30an)
devis: VERB (vmnpdmsnasnpn)
nosaukumu: NOUN (ncmpg1)
baltu: ADJ (affsanp)
valodām: NOUN (ncfsa4)
un: CCONJ (cc)
Baltijas: PROPN (npfsg4)
valstīm: NOUN (ncfpd4)
.: PUNCT (zs)

: NOUN (ncmsg1)
Terminu: PROPN (npmsg1)
": PUNCT (zq)
Baltijas: PROPN (npfsg4)
jūra: NOUN (ncfsn4)
": PUNCT (zq)
(: PUNCT (zb)
Mare:

In [12]:
# ===================================================
# Cell 11B: Testing model, espically Lemma and senter
# ===================================================
import spacy
import pandas as pd

nlp = spacy.load("lv_spacy_cnn_5pct")

text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm. Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā. Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji. Tomēr par šī vārda izcelsmi precīzu ziņu nav. Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu. Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi. Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’. Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā. Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168. m. ē.) izveidotajā Austrumeiropas kartē Baltijas jūra nosaukta par "Sarmatu jūru" (MARE SARMATICVM). Citos avotos minēts nosaukums OCEANUS SARMATICUS. Sarmati bija sena klejotāju tauta, kura runājusi indoirāņu valodā un līdz 4. gadsimtam dzīvoja Austrumeiropā. Tacits to dēvēja par "Svēbu jūru" (Mare Suebicum). Vēsturē pazīstami arī citu tautu dotie Baltijas jūras nosaukumi: "Varjagu jūra", "Barbaru jūra",[2] "Vendu jūra". Tā kādā 946. gada dokumentā to dēvēja par "Rūgu jūru" (no rūgu vārda cēlies tagadējās Rīgenes salas nosaukums),[3] bet Nestora hronikas ievadā Baltijas jūra nosaukta par Varjagu jūru.[4][5] Štumpfa (Stumpf) Eiropas kartē,[6] kas izdota Cīrihē, 1548. gadā (pārkopēta no vecākas Sebastiana Minstera veidotas kartes), Baltijas jūra tiek saukta par "Vācu jūru" (Das Deutsche Meer). Arī vecākā 1450. gadā Fra Mauro zīmētajā pasaules kartē redzams nosaukums "Ģermāņu jūra" vai pat okeāns, ko parasti piedēvē Ziemeļjūrai, tomēr minētajā kartē tā nepārprotami ir arī Baltijas jūra.[7] Latvijas piekrastes iedzīvotāji mēdza atklāto jūru dēvēt par "Dižjūru", bet Rīgas līci — par "Mazo jūru" (Mazjūru). Viduslaikos Rīgas līci latīniski dēvēja par "Līvu jūru" (Mare Livonicum). Senākajās Eiropas kartēs par jūrām tika saukti arī lielākie Baltijas jūras līči. Piemēram, Mare Finonicum sive Sinus Venedicus — tagadējais Somu jūras līcis."""

doc = nlp(text)

# Generate token table
rows = []
for sent in doc.sents:
    for token in sent:
        rows.append({
            "Text": token.text,
            "Lemma": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Head": token.head.text
        })

df = pd.DataFrame(rows)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
display(df)  

# Sentence segmentation
print("\nSentence Segmentation")
for i, sent in enumerate(doc.sents, 1):
    print(f"Sentence {i}: {sent.text}")

Unnamed: 0,Text,Lemma,POS,Dependency,Head
0,Baltijas,Baltijas,PROPN,nmod,jūras
1,jūras,jūra,NOUN,nmod,nosaukums
2,nosaukums,nosaukums,NOUN,nsubj,valodām
3,ir,būt,AUX,cop,valodām
4,devis,devis,VERB,nmod,nosaukumu
5,nosaukumu,nosaukumu,NOUN,obj,baltu
6,baltu,balts,ADJ,amod,valodām
7,valodām,valodām,NOUN,ROOT,valodām
8,un,un,CCONJ,cc,valstīm
9,Baltijas,Baltijas,PROPN,nmod,valstīm



Sentence Segmentation
Sentence 1: Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Sentence 2: Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā.
Sentence 3: Viņš, kā jau vietējais hronists, pierakstījis nosaukumu, kuru lietoja iedzīvotāji.
Sentence 4: Tomēr par šī vārda izcelsmi precīzu ziņu nav.
Sentence 5: Tas varēja rasties, atsaucoties uz mītisko Ziemeļeiropā it kā esošo Baltijas salu.
Sentence 6: Pastāv arī iespēja, ka Ādams no Brēmenes veidojis šo vārdu no ģermāņu vārda belt, ar kuru tiek apzīmēti vairāki Dānijas šaurumi.
Sentence 7: Cits skaidrojums — vārds cēlies no protoindoeiropiešu valodas vārda saknes *bhel, kas nozīmē ‘balts’, ‘mirdzošs’.
Sentence 8: Šī vārda sakne saglabājusies arī vairākās mūsdienu indoeiropiešu valodās, tai skaitā latviešu valodā.
Sentence 9: Vienā no senākajām kartēm, kurā attēlota mūsdienu Latvijas teritorija — Aleksandrijas zinātnieka Klaudija Ptolemaja (ap 90.—168.
Sen

In [13]:
import spacy
from spacy.tokens import DocBin

# === 1. Set file paths ===
train_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-5pct.spacy"
test_path  = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

# === 2. Load the model ===
nlp = spacy.load("lv_spacy_cnn_5pct")

# === 3. Load the training data and build the vocabulary ===
train_vocab = set()
train_docs = DocBin().from_disk(train_path)
for doc in train_docs.get_docs(nlp.vocab):
    for token in doc:
        train_vocab.add(token.text)

print(f"✅ Training vocabulary size: {len(train_vocab)}")

# === 4. Load the test data ===
test_docs = DocBin().from_disk(test_path)
test_docs = list(test_docs.get_docs(nlp.vocab))
print(f"✅ Number of test documents: {len(test_docs)}")

# === 5. Initialize counters for all components ===
metrics = {
    "POS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "MORPH": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "LEMMA": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "UAS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "LAS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
}

# === 6. Run predictions and calculate metrics ===
for gold_doc in test_docs:
    pred_doc = nlp(gold_doc.text)
    for gold_token, pred_token in zip(gold_doc, pred_doc):
        is_iv = gold_token.text in train_vocab

        # POS accuracy
        if is_iv:
            metrics["POS"]["iv_total"] += 1
            if gold_token.pos_ == pred_token.pos_:
                metrics["POS"]["iv_correct"] += 1
        else:
            metrics["POS"]["oov_total"] += 1
            if gold_token.pos_ == pred_token.pos_:
                metrics["POS"]["oov_correct"] += 1

        # Morphology accuracy (exact match)
        gold_morph = gold_token.morph.to_dict()
        pred_morph = pred_token.morph.to_dict()
        if is_iv:
            metrics["MORPH"]["iv_total"] += 1
            if gold_morph == pred_morph:
                metrics["MORPH"]["iv_correct"] += 1
        else:
            metrics["MORPH"]["oov_total"] += 1
            if gold_morph == pred_morph:
                metrics["MORPH"]["oov_correct"] += 1

        # Lemmatization accuracy
        if is_iv:
            metrics["LEMMA"]["iv_total"] += 1
            if gold_token.lemma_ == pred_token.lemma_:
                metrics["LEMMA"]["iv_correct"] += 1
        else:
            metrics["LEMMA"]["oov_total"] += 1
            if gold_token.lemma_ == pred_token.lemma_:
                metrics["LEMMA"]["oov_correct"] += 1

        # Parsing: UAS / LAS
        if gold_token.head is not None:
            if is_iv:
                metrics["UAS"]["iv_total"] += 1
                metrics["LAS"]["iv_total"] += 1
                if gold_token.head.i == pred_token.head.i:
                    metrics["UAS"]["iv_correct"] += 1
                    if gold_token.dep_ == pred_token.dep_:
                        metrics["LAS"]["iv_correct"] += 1
            else:
                metrics["UAS"]["oov_total"] += 1
                metrics["LAS"]["oov_total"] += 1
                if gold_token.head.i == pred_token.head.i:
                    metrics["UAS"]["oov_correct"] += 1
                    if gold_token.dep_ == pred_token.dep_:
                        metrics["LAS"]["oov_correct"] += 1

# === 7. Print the results ===
print("\n🎯 OOV/IV Accuracy Results")
print("--------------------------------------------------")
for comp, m in metrics.items():
    iv_acc = m["iv_correct"] / m["iv_total"] * 100 if m["iv_total"] > 0 else 0
    oov_acc = m["oov_correct"] / m["oov_total"] * 100 if m["oov_total"] > 0 else 0
    print(f"{comp}:")
    print(f"  IV  Accuracy  = {iv_acc:.2f}%  ({m['iv_correct']}/{m['iv_total']})")
    print(f"  OOV Accuracy = {oov_acc:.2f}%  ({m['oov_correct']}/{m['oov_total']})")
    print("--------------------------------------------------")



✅ Training vocabulary size: 4581
✅ Number of test documents: 2412

🎯 OOV/IV Accuracy Results
--------------------------------------------------
POS:
  IV  Accuracy  = 91.07%  (18919/20773)
  OOV Accuracy = 60.37%  (9880/16367)
--------------------------------------------------
MORPH:
  IV  Accuracy  = 89.42%  (18575/20773)
  OOV Accuracy = 41.04%  (6717/16367)
--------------------------------------------------
LEMMA:
  IV  Accuracy  = 85.82%  (17828/20773)
  OOV Accuracy = 20.85%  (3412/16367)
--------------------------------------------------
UAS:
  IV  Accuracy  = 59.28%  (12315/20773)
  OOV Accuracy = 47.61%  (7792/16367)
--------------------------------------------------
LAS:
  IV  Accuracy  = 51.93%  (10788/20773)
  OOV Accuracy = 30.33%  (4964/16367)
--------------------------------------------------
