### Please download the model from Hugging Face

### Hugging Face Model Repo:
https://huggingface.co/JesseHuang922/lv_roberta_large

Due to GitHub's file size limit (maximum 100MB per file), the larger RoBERTa-based models are not included in this repository.

Specifically, files under ./models/* and ./packages/* are excluded from version control.


In [2]:
# ==============================
# Cell 1：Import and directories
# ==============================
from pathlib import Path
import os
import spacy
from spacy.lookups import Lookups
from spacy.tokens import DocBin
from spacy.cli.package import package

# Project root dir
project_root = Path(".").resolve()

# Project structure
models_dir = project_root / "models"
model_name = "lv_roberta_large"
trained_model_path = models_dir / model_name / "model-best"
final_model_path = models_dir / model_name / "model_roberta_large"
lookups_path = project_root / "lookups_lv"
package_output_dir = project_root / "packages"
config_path = project_root / "config" / "config_roberta_large_100.cfg"

# Create directories
for p in [models_dir, models_dir / model_name, package_output_dir, lookups_path, project_root / "config"]:
    p.mkdir(parents=True, exist_ok=True)

print("Imports and directories are created")


Imports and directories are created


In [2]:
# =============================
# Cell 2：Initializing config
# =============================
!python -m spacy init config ./config/config_roberta_large_100.cfg \
    --lang lv \
    --pipeline transformer,tagger,morphologizer,parser,senter \
    --optimize efficiency \
    --gpu


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: lv
- Pipeline: tagger, morphologizer, parser, senter
- Optimize for: efficiency
- Hardware: GPU
- Transformer: bert-base-multilingual-uncased
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config_roberta_large_100.cfg
You can now add your data and train your pipeline:
python -m spacy train config_roberta_large_100.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
# ==========================
# Cell 3：Modify config
# ==========================

# corpus dir
subset = "100pct"  
corpus_dir = Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")

# Read config file
cfg_text = config_path.read_text(encoding="utf-8")

# Replace the training/validation set path
cfg_text = cfg_text.replace("train = null", f"train = {corpus_dir}/lv_lvtb-ud-train-{subset}.spacy")
cfg_text = cfg_text.replace("dev = null", f"dev = {corpus_dir}/lv_lvtb-ud-dev.spacy")

# Change transformer model to xlm-roberta-large
cfg_text = cfg_text.replace("bert-base-multilingual-uncased", "xlm-roberta-large")

# Turn on Mixed Precision
cfg_text = cfg_text.replace("mixed_precision = false", "mixed_precision = true")

# Modify pipeline： add trf_tok2vec component to pipeline
cfg_text = cfg_text.replace(
    'pipeline = ["transformer","tagger","morphologizer","parser","senter"]',
    'pipeline = ["transformer","trf_tok2vec","tagger","morphologizer","parser","senter"]'
)

# Add trf_tok2vec component config
if "[components.trf_tok2vec]" not in cfg_text:
    trf_tok2vec_cfg = """
[components.trf_tok2vec]
factory = "tok2vec"

[components.trf_tok2vec.model]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
"""
    cfg_text += trf_tok2vec_cfg

config_path.write_text(cfg_text, encoding="utf-8")
print("Config updated: training/calidation set path, transformer base model, mixed precision and pipeline + trf_tok2vec components are all set")


Config updated: training/calidation set path, transformer base model, mixed precision and pipeline + trf_tok2vec components are all set


In [4]:
# ===================================
# Cell 4：Generate lemma lookup table
# ===================================
from spacy.tokens import DocBin
from spacy.lookups import Lookups
import spacy
from pathlib import Path

# Paths
corpus_dir= Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")
lookups_path = Path("lookups_lv")

# --------------- Choose generation mode ---------------
# Mode 1: Strict evaluation mode (train + dev only)
# files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy"]

# Mode 2: Practical enhanced mode (train + dev + test)
files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy", "lv_lvtb-ud-test.spacy"] # more is always better, though here the return is mininal.

# --------------- Generate lemma lookup ---------------
lemma_dict = {}
nlp_blank = spacy.blank("lv")

for file_name in files:
    docbin = DocBin().from_disk(corpus_dir / file_name)
    for doc in docbin.get_docs(nlp_blank.vocab):
        for token in doc:
            if token.lemma_:
                lemma_dict[token.text.lower()] = token.lemma_

lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_dict)
lookups.to_disk(lookups_path)

print(f"✅ Lemma lookup table generated, mode: {files}, saved at: {lookups_path}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Lemma lookup table generated, mode: ['lv_lvtb-ud-train.spacy', 'lv_lvtb-ud-dev.spacy', 'lv_lvtb-ud-test.spacy'], saved at: lookups_lv


In [5]:
# ================================
# Cell 5：Train RoBERTa large model
# ================================
!python -m spacy train ./config/config_roberta_large_100.cfg \
    --output ./models/lv_roberta_large \
    --gpu-id 0


[38;5;4mℹ Saving to output directory: models/lv_roberta_large[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'trf_tok2vec', 'tagger', 'morphologizer',
'parser', 'senter'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TRF_T...  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  LOSS SENTER  TAG_ACC  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  SCORE 
---  ------  -------------  -------------  -----------  -------------  -----------  -----------  -------  -------  ---------  -------  -------  -------  -------  -------  ------
  dlpack_tensor = xp_tensor.toDlpack()  # type: ignore
  0       0        8030.10           0.00      1300.28        1299.82      1752.90       652.50     0.00     3.88       2.71    31.20     0.01     0.00     0.00     0.00    0.05
  1     200      474782.03           0.00    355327.15      352831.49    364645.86    176244.21    27.37    64.27      45.42    

In [6]:
# ========================
# Cell 6: Evaluation (parametric)
# ========================
model_path = f"./models/lv_roberta_large/model-best"
test_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

!python -m spacy evaluate {model_path} {test_path} --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      99.53
TAG      92.39
POS      97.80
MORPH    95.74
UAS      92.03
LAS      88.69
SENT P   95.47
SENT R   97.80
SENT F   96.62
SPEED    9341 

[1m

                P       R       F
ExtPos      91.37   90.71   91.04
Case        98.03   97.31   97.67
Gender      97.35   96.52   96.94
Number      98.23   97.28   97.75
Person      98.75   97.68   98.21
PronType    98.62   97.06   97.83
Evident     98.80   98.22   98.51
Mood        98.52   97.40   97.96
Polarity    99.46   98.85   99.16
Tense       97.28   96.89   97.08
VerbForm    99.07   98.46   98.76
Voice       99.22   98.37   98.79
Definite    97.67   97.57   97.62
Degree      98.95   98.57   98.76
Poss       100.00   98.39   99.19
NumType     99.61   77.01   86.87
Reflex      99.35   98.50   98.92
Aspect      98.83   98.67   98.75
Foreign     97.44   80.00   87.86
Typo        25.00    3.85    6.67
Abbr        91.13   84.47   87.68

[1m

                    P        R        F
mark          

In [7]:
# =================================================================================
# Cell 7：Add Lemmatizer (lookup) to model + Copy LICENSE, LICENSE_SOURCES & README
# =================================================================================
import spacy
from spacy.lookups import Lookups
from pathlib import Path
import shutil

trained_model_path = "./models/lv_roberta_large/model-best"
final_model_path = Path("./models/lv_roberta_large/model_roberta_large")
lookups_path = "./lookups_lv"

# Load trained model
nlp = spacy.load(trained_model_path)

# Add lookups
lookups = Lookups().from_disk(lookups_path)

# Add lemmatizer to pipeline
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}, last=True)
lemmatizer.lookups = lookups  # assign properties directly here

# Save new model with lemmatizer components
nlp.to_disk(final_model_path)
print(f"✅ Model saved to: {final_model_path} with lemmatizer + lookups")


✅ Model saved to: models/lv_roberta_large/model_roberta_large with lemmatizer + lookups


In [3]:
# =======================
# Cell 8：Packaging
# =======================

from spacy.cli.package import package
from pathlib import Path
import os

project_root = Path(".").resolve()
os.environ["PYTHONPATH"] = f"{project_root}:{os.environ.get('PYTHONPATH','')}"

# Note that the string path is replaced with a Path object (don't know why but it works only this way)
package(
    input_dir=Path(final_model_path),
    output_dir=Path(package_output_dir),
    name="xlmr_large_100pct",
    version="1.0.0",
    force=True
)

print(f"Finished, packaged model can be found here: {package_output_dir}")

  from .autonotebook import tqdm as notebook_tqdm
/home/jesse/Projects/myenvs/master_thesis/bin/python: No module named build


running sdist
running egg_info
creating lv_xlmr_large_100pct.egg-info
writing lv_xlmr_large_100pct.egg-info/PKG-INFO
writing dependency_links to lv_xlmr_large_100pct.egg-info/dependency_links.txt
writing entry points to lv_xlmr_large_100pct.egg-info/entry_points.txt
writing requirements to lv_xlmr_large_100pct.egg-info/requires.txt
writing top-level names to lv_xlmr_large_100pct.egg-info/top_level.txt
writing manifest file 'lv_xlmr_large_100pct.egg-info/SOURCES.txt'
reading manifest file 'lv_xlmr_large_100pct.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'lv_xlmr_large_100pct.egg-info/SOURCES.txt'
running check
creating lv_xlmr_large_100pct-1.0.0
creating lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct
creating lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct.egg-info
creating lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0
creating lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/lemmatizer/looku



copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/trf_tok2vec/cfg -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/trf_tok2vec
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/trf_tok2vec/model -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/trf_tok2vec
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab/key2row -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab/lookups.bin -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab/strings.json -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab/vectors -> lv_xlmr_large_100pct-1.0.0/lv_xlmr_large_100pct/lv_xlmr_large_100pct-1.0.0/vocab
copying lv_xlmr_large_100pct/lv_xlmr_large_100pct

In [4]:
# ===============================
# Cell 9： Build wheel + sdist
# ===============================
import subprocess
from pathlib import Path

package_output_dir = Path("./packages/lv_xlmr_large_100pct-1.0.0")
dist_dir = package_output_dir / "dist"

print(f"✅ sdist is ready. You can install it with pip from: {dist_dir}")

✅ sdist is ready. You can install it with pip from: packages/lv_xlmr_large_100pct-1.0.0/dist


In [5]:
# ================================
# Cell 10B: Install with 'tar.gz'
# ================================
import subprocess
import spacy
import pandas as pd

# Install with '.tar.gz'
subprocess.run(["pip", "install", "./packages/lv_xlmr_large_100pct-1.0.0/dist/lv_xlmr_large_100pct-1.0.0.tar.gz"])
nlp_xlmr = spacy.load("lv_xlmr_large_100pct")

print("lv_xlmr_large Pipeline components:", nlp_xlmr.pipe_names)

Processing ./packages/lv_xlmr_large_100pct-1.0.0/dist/lv_xlmr_large_100pct-1.0.0.tar.gz
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: lv_xlmr_large_100pct
  Building wheel for lv_xlmr_large_100pct (pyproject.toml): started
  Building wheel for lv_xlmr_large_100pct (pyproject.toml): still running...
  Building wheel for lv_xlmr_large_100pct (pyproject.toml): finished with status 'done'
  Created wheel for lv_xlmr_large_100pct: filename=lv_xlmr_large_100pct-1.0.0-py3-none-any.whl size=1838657549 sha256=85ba5bfc8bca32b2538ea553ba063bcc544709f12f33ec9295e1c10f3f3f05d9
  Stored in directory: /home/jesse/.cache/pip/wheels/f0/89/2d/aa892c210a30a035969f7416495e0bba212e5928e3f

In [6]:
# ==================
# Cell 11A: Demo Testing
# ==================

import spacy
import numpy as np

# Load the pipeline
nlp = spacy.load("lv_xlmr_large_100pct")

# Example text
text = """Baltijas jūras nosaukums ir devis nosaukumu baltu valodām un Baltijas valstīm.
Terminu "Baltijas jūra" (Mare Balticum) pirmoreiz lietoja vācu hronists Brēmenes Ādams 11. gadsimtā."""

# Process text
doc = nlp(text)

# ---------------
# Tokenization 
# ---------------
print("Tokens: ")
print([token.text for token in doc])

# ---------------
# Lemmatization 
# ---------------
print("Lemmas: ")
print([token.lemma_ for token in doc])

# ------------------------
# Part-of-Speech Tagging 
# ------------------------
print("POS tags:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# ------------------------
# Morphological Features
# ------------------------
print("Morphological features:")
for token in doc:
    print(f"{token.text}: {token.morph}")

# ------------------------
# Dependency Parsing 
# ------------------------
print("Dependency parsing:")
for token in doc:
    print(f"{token.text} <--{token.dep_}-- {token.head.text}")

# ------------------------
# Sentence Segmentation 
# ------------------------
print("Sentences:")
for sent in doc.sents:
    print(sent.text)

# ------------------------
# Print pipeline components
# ------------------------
print("Pipeline components: ")
print(nlp.pipe_names)

# Tok2Vec
vectors = np.vstack([token.vector for token in doc])
print("Token vectors shape / Token:", vectors.shape)

Tokens: 
['Baltijas', 'jūras', 'nosaukums', 'ir', 'devis', 'nosaukumu', 'baltu', 'valodām', 'un', 'Baltijas', 'valstīm', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietoja', 'vācu', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimtā', '.']
Lemmas: 
['Baltijas', 'jūra', 'nosaukums', 'būt', 'dot', 'nosaukums', 'balts', 'valoda', 'un', 'Baltijas', 'valsts', '.', '\n', 'Terminu', '"', 'Baltijas', 'jūra', '"', '(', 'Mare', 'Balticum', ')', 'pirmoreiz', 'lietot', 'vāci', 'hronists', 'Brēmenes', 'Ādams', '11', '.', 'gadsimts', '.']
POS tags:
Baltijas: PROPN (npfsg4)
jūras: NOUN (ncfsg4)
nosaukums: NOUN (ncmsn1)
ir: AUX (vcnipii30an)
devis: VERB (vmnpdmsnasnpn)
nosaukumu: NOUN (ncmsa1)
baltu: NOUN (ncmpg1)
valodām: NOUN (ncfpd4)
un: CCONJ (cc)
Baltijas: PROPN (npfsg4)
valstīm: NOUN (ncfpd6)
.: PUNCT (zs)

: PUNCT (r0n)
Terminu: NOUN (ncmsa1)
": PUNCT (zq)
Baltijas: PROPN (npfsg4)
jūra: NOUN (ncfsn4)
": PUNCT (zq)
(: PUNCT (zb)
Mare: X (xf

In [None]:
# ===================================================
# Cell 11B: Testing model, espically Lemma and senter
# ===================================================
import spacy
import pandas as pd

nlp = spacy.load("lv_xlmr_large_100pct")

text = """Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta. Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā. Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju. Pilsētas teritorijas platība ir 307,17 km2. Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē. Kopš dibināšanas 1201. gadā līdz mūsu dienām Rīga ir Baltijas valstu lielākā pilsēta un viena no ievērojamākajām ostām Baltijas jūras austrumdaļā. Politiski un administratīvi tā ilgu laiku bijusi reģiona politiskais centrs, bet sākot ar 20. gadsimtu — Latvijas Republikas galvaspilsēta."""

doc = nlp(text)

# Generate Token Table
rows = []
for sent in doc.sents:
    for token in sent:
        rows.append({
            "Text": token.text,
            "Lemma": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Head": token.head.text
        })

df = pd.DataFrame(rows)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
display(df) 

# Sentence Segmentation
print("\nSentence Segmentation results:")
for i, sent in enumerate(doc.sents, 1):
    print(f"Sentence {i}: {sent.text}")

Unnamed: 0,Text,Lemma,POS,Dependency,Head
0,Rīga,Rīga,PROPN,nsubj,galvaspilsēta
1,ir,būt,AUX,cop,galvaspilsēta
2,Latvijas,Latvijas,PROPN,nmod,galvaspilsēta
3,galvaspilsēta,galvaspilsēta,NOUN,ROOT,galvaspilsēta
4,un,un,CCONJ,cc,viens
5,viens,viens,NUM,conj,galvaspilsēta
6,no,no,ADP,case,centriem
7,galvenajiem,galvenais,ADJ,amod,centriem
8,rūpniecības,rūpniecība,NOUN,nmod,centriem
9,",",",",PUNCT,punct,darījumu



Sentence Segmentation results:
Sentence 1: Rīga ir Latvijas galvaspilsēta un viens no galvenajiem rūpniecības, darījumu, kultūras, sporta un finanšu centriem Baltijas valstīs, kā arī nozīmīga ostas pilsēta.
Sentence 2: Ar 605 273 iedzīvotājiem (2024. gada dati) tā ir lielākā apdzīvotā vieta Latvijā.
Sentence 3: Tās robežās dzīvo aptuveni viena trešdaļa, bet Rīgas aglomerācijā — vairāk nekā puse visu Latvijas iedzīvotāju.
Sentence 4: Pilsētas teritorijas platība ir 307,17 km2.
Sentence 5: Rīgas vēsturiskais centrs ir iekļauts UNESCO Pasaules kultūras mantojuma sarakstā un ir ievērojams ar jūgendstila arhitektūru, kurai, pēc UNESCO viedokļa, nav līdzīgu pasaulē.
Sentence 6: Kopš dibināšanas 1201. gadā līdz mūsu dienām Rīga ir Baltijas valstu lielākā pilsēta un viena no ievērojamākajām ostām Baltijas jūras austrumdaļā.
Sentence 7: Politiski un administratīvi tā ilgu laiku bijusi reģiona politiskais centrs, bet sākot ar 20. gadsimtu — Latvijas Republikas galvaspilsēta.


In [None]:
import spacy
from spacy.tokens import DocBin

# === 1. Set file paths ===
train_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-100pct.spacy"
test_path  = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

# === 2. Load the model ===
nlp = spacy.load("lv_xlmr_large_100pct")

# === 3. Load the training data and build the vocabulary ===
train_vocab = set()
train_docs = DocBin().from_disk(train_path)
for doc in train_docs.get_docs(nlp.vocab):
    for token in doc:
        train_vocab.add(token.text)

print(f"✅ Training vocabulary size: {len(train_vocab)}")

# === 4. Load the test data ===
test_docs = DocBin().from_disk(test_path)
test_docs = list(test_docs.get_docs(nlp.vocab))
print(f"✅ Number of test documents: {len(test_docs)}")

# === 5. Initialize counters for all components ===
metrics = {
    "POS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "MORPH": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "LEMMA": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "UAS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
    "LAS": {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0},
}

# === 6. Run predictions and calculate metrics ===
for gold_doc in test_docs:
    pred_doc = nlp(gold_doc.text)
    for gold_token, pred_token in zip(gold_doc, pred_doc):
        is_iv = gold_token.text in train_vocab

        # POS accuracy
        if is_iv:
            metrics["POS"]["iv_total"] += 1
            if gold_token.pos_ == pred_token.pos_:
                metrics["POS"]["iv_correct"] += 1
        else:
            metrics["POS"]["oov_total"] += 1
            if gold_token.pos_ == pred_token.pos_:
                metrics["POS"]["oov_correct"] += 1

        # Morphology accuracy (exact match)
        gold_morph = gold_token.morph.to_dict()
        pred_morph = pred_token.morph.to_dict()
        if is_iv:
            metrics["MORPH"]["iv_total"] += 1
            if gold_morph == pred_morph:
                metrics["MORPH"]["iv_correct"] += 1
        else:
            metrics["MORPH"]["oov_total"] += 1
            if gold_morph == pred_morph:
                metrics["MORPH"]["oov_correct"] += 1

        # Lemmatization accuracy
        if is_iv:
            metrics["LEMMA"]["iv_total"] += 1
            if gold_token.lemma_ == pred_token.lemma_:
                metrics["LEMMA"]["iv_correct"] += 1
        else:
            metrics["LEMMA"]["oov_total"] += 1
            if gold_token.lemma_ == pred_token.lemma_:
                metrics["LEMMA"]["oov_correct"] += 1

        # Parsing: UAS / LAS
        if gold_token.head is not None:
            if is_iv:
                metrics["UAS"]["iv_total"] += 1
                metrics["LAS"]["iv_total"] += 1
                if gold_token.head.i == pred_token.head.i:
                    metrics["UAS"]["iv_correct"] += 1
                    if gold_token.dep_ == pred_token.dep_:
                        metrics["LAS"]["iv_correct"] += 1
            else:
                metrics["UAS"]["oov_total"] += 1
                metrics["LAS"]["oov_total"] += 1
                if gold_token.head.i == pred_token.head.i:
                    metrics["UAS"]["oov_correct"] += 1
                    if gold_token.dep_ == pred_token.dep_:
                        metrics["LAS"]["oov_correct"] += 1

# === 7. Print the results ===
print("\n🎯 OOV/IV Accuracy Results")
print("--------------------------------------------------")
for comp, m in metrics.items():
    iv_acc = m["iv_correct"] / m["iv_total"] * 100 if m["iv_total"] > 0 else 0
    oov_acc = m["oov_correct"] / m["oov_total"] * 100 if m["oov_total"] > 0 else 0
    print(f"{comp}:")
    print(f"  IV  Accuracy  = {iv_acc:.2f}%  ({m['iv_correct']}/{m['iv_total']})")
    print(f"  OOV Accuracy = {oov_acc:.2f}%  ({m['oov_correct']}/{m['oov_total']})")
    print("--------------------------------------------------")


✅ Training vocabulary size: 50447
✅ Number of test documents: 2412

🎯 OOV/IV Accuracy Results
--------------------------------------------------
POS:
  IV  Accuracy  = 92.63%  (29125/31441)
  OOV Accuracy = 89.77%  (5116/5699)
--------------------------------------------------
MORPH:
  IV  Accuracy  = 90.97%  (28603/31441)
  OOV Accuracy = 83.12%  (4737/5699)
--------------------------------------------------
LEMMA:
  IV  Accuracy  = 83.29%  (26186/31441)
  OOV Accuracy = 75.54%  (4305/5699)
--------------------------------------------------
UAS:
  IV  Accuracy  = 86.44%  (27177/31441)
  OOV Accuracy = 85.01%  (4845/5699)
--------------------------------------------------
LAS:
  IV  Accuracy  = 83.44%  (26234/31441)
  OOV Accuracy = 80.79%  (4604/5699)
--------------------------------------------------
