### Please download the model from Hugging Face

### Hugging Face Model Repo:
https://huggingface.co/JesseHuang922/lv_roberta_large

Due to GitHub's file size limit (maximum 100MB per file), the larger RoBERTa-based models are not included in this repository.

Specifically, files under ./models/* and ./packages/* are excluded from version control.


In [None]:
# ==============================
# Cell 1: Imports and directories
# ==============================
from pathlib import Path
import os
import spacy
from spacy.lookups import Lookups
from spacy.tokens import DocBin
from spacy.cli.package import package

# All project locations are derived from the directory the notebook runs in.
project_root = Path.cwd().resolve()

# Names and locations shared by the later cells.
model_name = "lv_roberta_large"
models_dir = project_root.joinpath("models")
_model_root = models_dir.joinpath(model_name)

trained_model_path = _model_root / "model-best"            # checkpoint loaded after training
final_model_path = _model_root / "model_roberta_large"     # pipeline saved with the lemmatizer added
lookups_path = project_root / "lookups_lv"                 # lemma lookup tables
package_output_dir = project_root / "packages"             # output of the packaging step
config_path = project_root / "config" / "config_roberta_large_5.cfg"

# Make sure every folder exists up front; existing folders are left untouched.
_required_dirs = (
    models_dir,
    _model_root,
    package_output_dir,
    lookups_path,
    config_path.parent,
)
for _directory in _required_dirs:
    _directory.mkdir(parents=True, exist_ok=True)

print("Imports and directories are created")


In [None]:
# =============================
# Cell 2: Initializing config
# =============================
# Runs the spaCy CLI (IPython shell magic) to generate a starter training
# config at ./config/config_roberta_large_5.cfg for language "lv" with the
# listed pipeline components, using the "efficiency" preset and the GPU flag.
# Cell 3 patches the generated file afterwards.
# NOTE(review): re-running this cell when the file already exists may need
# an explicit overwrite flag, depending on the spaCy version — confirm.
!python -m spacy init config ./config/config_roberta_large_5.cfg \
    --lang lv \
    --pipeline transformer,tagger,morphologizer,parser,senter \
    --optimize efficiency \
    --gpu


In [None]:
# ==========================
# Cell 3: Modify config
# ==========================
# Patches the generated config file in place with plain text substitutions:
# corpus paths, transformer checkpoint, mixed precision, and an additional
# "trf_tok2vec" listener component appended to the pipeline.
# Relies on `Path` and `config_path` defined in Cell 1.

# corpus dir
subset = "5pct"  # suffix selecting the training file lv_lvtb-ud-train-<subset>.spacy
corpus_dir = Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")

# Read config file
cfg_text = config_path.read_text(encoding="utf-8")

# (old text, new text) pairs, applied in this order.
replacements = [
    # Training / validation set paths
    ("train = null", f"train = {corpus_dir}/lv_lvtb-ud-train-{subset}.spacy"),
    ("dev = null", f"dev = {corpus_dir}/lv_lvtb-ud-dev.spacy"),
    # Change transformer model to xlm-roberta-large
    ("bert-base-multilingual-uncased", "xlm-roberta-large"),
    # Turn on mixed precision
    ("mixed_precision = false", "mixed_precision = true"),
    # Add the trf_tok2vec component to the pipeline
    (
        'pipeline = ["transformer","tagger","morphologizer","parser","senter"]',
        'pipeline = ["transformer","trf_tok2vec","tagger","morphologizer","parser","senter"]',
    ),
]

for old, new in replacements:
    if old in cfg_text:
        cfg_text = cfg_text.replace(old, new)
    elif new not in cfg_text:
        # str.replace() would silently do nothing here; make that visible.
        # Stays quiet when the new text is already present (cell was re-run).
        print(f"WARNING: expected text not found in config, nothing replaced: {old!r}")

# Add trf_tok2vec component config (only once, so re-running is safe)
if "[components.trf_tok2vec]" not in cfg_text:
    trf_tok2vec_cfg = """
[components.trf_tok2vec]
factory = "tok2vec"

[components.trf_tok2vec.model]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
"""
    cfg_text += trf_tok2vec_cfg

config_path.write_text(cfg_text, encoding="utf-8")
print("Config updated: training/validation set path, transformer base model, mixed precision and pipeline + trf_tok2vec components are all set")


In [None]:
# ===================================
# Cell 4: Generate lemma lookup table
# ===================================
# Collects the gold lemma of every token in the listed DocBin files into a
# surface-form -> lemma dictionary and saves it as the "lemma_lookup" table
# of a spaCy Lookups directory.
from spacy.tokens import DocBin
from spacy.lookups import Lookups
import spacy
from pathlib import Path

# Paths
corpus_dir= Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")
lookups_path = Path("lookups_lv")

# --------------- Choose generation mode ---------------
# Mode 1: Strict evaluation mode (train + dev only)
# files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy"]

# Mode 2: Practical enhanced mode (train + dev + test)
# NOTE(review): with this mode the table contains lemmas taken from the test
# split, so lemma accuracy measured on that split is not a held-out score.
# NOTE(review): every file is read from corpus_dir (the .../Corpus/train
# folder), while the evaluation cells read lv_lvtb-ud-test.spacy from
# .../Corpus/test — confirm a copy of the test split exists here.
files = ["lv_lvtb-ud-train.spacy", "lv_lvtb-ud-dev.spacy", "lv_lvtb-ud-test.spacy"]  # more data helps coverage, though the gain here is minimal.

# --------------- Generate lemma lookup ---------------
lemma_dict = {}
nlp_blank = spacy.blank("lv")  # only needed for its vocab when decoding the DocBins

for file_name in files:
    docbin = DocBin().from_disk(corpus_dir / file_name)
    for doc in docbin.get_docs(nlp_blank.vocab):
        for token in doc:
            if not token.lemma_:
                continue  # skip tokens without a gold lemma
            # Lower-cased key (unchanged behaviour: a later occurrence
            # overwrites an earlier one).
            lemma_dict[token.text.lower()] = token.lemma_
            # Exact-case key: spaCy's lookup lemmatizer queries the table with
            # the unmodified token text, so capitalised tokens (sentence
            # starts, proper nouns) would never match a lower-cased-only table.
            lemma_dict[token.text] = token.lemma_

lookups = Lookups()
lookups.add_table("lemma_lookup", lemma_dict)
lookups.to_disk(lookups_path)

print(f"‚úÖ Lemma lookup table generated, mode: {files}, saved at: {lookups_path}")

In [None]:
# ================================
# Cell 5: Train RoBERTa large model
# ================================
# Runs `spacy train` (IPython shell magic) on the config patched in Cell 3,
# writing to ./models/lv_roberta_large on GPU 0. Later cells load the
# "model-best" folder from that output directory.
!python -m spacy train ./config/config_roberta_large_5.cfg \
    --output ./models/lv_roberta_large \
    --gpu-id 0


In [None]:
# ========================
# Cell 6: Evaluation (parametric)
# ========================
model_path = f"./models/lv_roberta_large/model-best"
test_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

!python -m spacy evaluate {model_path} {test_path} --gpu-id 0

In [None]:
# =================================================================================
# Cell 7: Add Lemmatizer (lookup) to model + Copy LICENSE, LICENSE_SOURCES & README
# =================================================================================
# Loads the trained pipeline, appends a lookup-mode lemmatizer backed by the
# tables saved in ./lookups_lv, and writes the combined pipeline to disk.
# NOTE(review): the title mentions copying LICENSE / README files, but no copy
# is performed in this cell (shutil is imported and unused).
import shutil
from pathlib import Path

import spacy
from spacy.lookups import Lookups

# Inputs and output of this step, relative to the working directory.
trained_model_path = "./models/lv_roberta_large/model-best"
lookups_path = "./lookups_lv"
final_model_path = Path("./models/lv_roberta_large/model_roberta_large")

# Read the lookup tables back from disk, then load the trained pipeline.
lookups = Lookups().from_disk(lookups_path)
nlp = spacy.load(trained_model_path)

# Append the lemmatizer as the last component and hand it the tables directly.
lemmatizer_config = {"mode": "lookup"}
lemmatizer = nlp.add_pipe("lemmatizer", config=lemmatizer_config, last=True)
lemmatizer.lookups = lookups

# Persist the extended pipeline.
nlp.to_disk(final_model_path)
print(f"‚úÖ Model saved to: {final_model_path} with lemmatizer + lookups")


In [None]:
# =======================
# Cell 8: Packaging
# =======================
# Turns the saved pipeline directory into an installable Python package under
# ./packages using spaCy's packaging routine.
# Relies on `final_model_path` set by an earlier cell (Cell 1 or Cell 7).

from spacy.cli.package import package
from pathlib import Path
import os

project_root = Path(".").resolve()

# Set the output folder here rather than relying on notebook state: Cell 9
# rebinds `package_output_dir` to the packaged folder, so re-running this cell
# afterwards would otherwise package into that nested directory.
package_output_dir = project_root / "packages"

# Put the project root first on PYTHONPATH, using the platform's separator and
# without adding a duplicate entry each time the cell is re-run.
# NOTE(review): presumably needed so processes spawned during packaging can
# import project code — confirm whether this is still required.
_root_entry = str(project_root)
_existing_entries = [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != _root_entry
]
os.environ["PYTHONPATH"] = os.pathsep.join([_root_entry, *_existing_entries])

# Paths are passed as Path objects; the author found that plain strings did
# not work here.
package(
    input_dir=Path(final_model_path),
    output_dir=Path(package_output_dir),
    name="roberta_large_5pct",
    version="1.0.0",
    force=True
)

print(f"Finished, packaged model can be found here: {package_output_dir}")

In [None]:
# ===============================
# Cell 9: Build wheel + sdist
# ===============================
# Despite the title, nothing is built in this cell: it only points at the
# "dist" folder of the packaged model and reports whether an sdist archive is
# there. The archive is presumably produced by the packaging step (Cell 8).
# NOTE: this rebinds `package_output_dir`, which Cell 1 pointed at ./packages.
import subprocess
from pathlib import Path

package_output_dir = Path("./packages/lv_roberta_large_5pct-1.0.0")
dist_dir = package_output_dir / "dist"

# Only announce success when an sdist archive actually exists.
sdist_files = sorted(dist_dir.glob("*.tar.gz")) if dist_dir.is_dir() else []

if sdist_files:
    print(f"‚úÖ sdist is ready. You can install it with pip from: {dist_dir}")
else:
    print(f"WARNING: no .tar.gz archive found in {dist_dir}; run the packaging cell first.")

In [None]:
# ================================
# Cell 10B: Install with 'tar.gz'
# ================================
# Installs the packaged pipeline from its sdist archive and loads it by
# package name to confirm the installation works.
import subprocess
import sys
import spacy
import pandas as pd

# Install with '.tar.gz'.
# `sys.executable -m pip` installs into the interpreter running this kernel
# (a bare "pip" may belong to a different environment), and check=True raises
# if the installation fails instead of continuing to a confusing load error.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./packages/lv_roberta_large_5pct-1.0.0/dist/lv_roberta_large_5pct-1.0.0.tar.gz",
    ],
    check=True,
)
nlp_xlmr = spacy.load("lv_roberta_large_5pct")

print("lv_roberta_large Pipeline components:", nlp_xlmr.pipe_names)

In [None]:
# ==================
# Cell 11A: Demo Testing
# ==================
# Runs the installed pipeline on a short sample and prints every annotation
# layer it produces, one section per layer.

import spacy
import numpy as np

# Load the installed package by name
nlp = spacy.load("lv_roberta_large_5pct")

# Sample input
text = """Baltijas j≈´ras nosaukums ir devis nosaukumu baltu valodƒÅm un Baltijas valstƒ´m.
Terminu "Baltijas j≈´ra" (Mare Balticum) pirmoreiz lietoja vƒÅcu hronists Brƒìmenes ƒÄdams 11. gadsimtƒÅ."""

# Annotate the sample
doc = nlp(text)

# Tokenization and lemmatization: printed as one list each
print("Tokens: ")
print([tok.text for tok in doc])

print("Lemmas: ")
print([tok.lemma_ for tok in doc])

# POS tags, morphology and dependencies: one line per token, one section each.
# Each entry pairs the section heading with the formatter for a single token.
per_token_sections = (
    ("POS tags:", lambda t: f"{t.text}: {t.pos_} ({t.tag_})"),
    ("Morphological features:", lambda t: f"{t.text}: {t.morph}"),
    ("Dependency parsing:", lambda t: f"{t.text} <--{t.dep_}-- {t.head.text}"),
)
for heading, render in per_token_sections:
    print(heading)
    for tok in doc:
        print(render(tok))

# Sentence segmentation
print("Sentences:")
for sentence in doc.sents:
    print(sentence.text)

# Names of the components in the loaded pipeline
print("Pipeline components: ")
print(nlp.pipe_names)

# Tok2Vec: stack the per-token vectors into a single array and show its shape
vectors = np.vstack([tok.vector for tok in doc])
print("Token vectors shape / Token:", vectors.shape)

In [None]:
# ====================================================
# Cell 11B: Testing model, especially Lemma and senter
# ====================================================
# Annotates a longer paragraph, shows a per-token table (text, lemma, POS,
# dependency label, head) and lists the detected sentences.
import spacy
import pandas as pd

nlp = spacy.load("lv_roberta_large_5pct")

text = """Rƒ´ga ir Latvijas galvaspilsƒìta un viens no galvenajiem r≈´pniecƒ´bas, darƒ´jumu, kult≈´ras, sporta un finan≈°u centriem Baltijas valstƒ´s, kƒÅ arƒ´ nozƒ´mƒ´ga ostas pilsƒìta. Ar 605 273 iedzƒ´votƒÅjiem (2024. gada dati) tƒÅ ir lielƒÅkƒÅ apdzƒ´votƒÅ vieta LatvijƒÅ. TƒÅs robe≈æƒÅs dzƒ´vo aptuveni viena tre≈°daƒºa, bet Rƒ´gas aglomerƒÅcijƒÅ ‚Äî vairƒÅk nekƒÅ puse visu Latvijas iedzƒ´votƒÅju. Pilsƒìtas teritorijas platƒ´ba ir 307,17 km2. Rƒ´gas vƒìsturiskais centrs ir iekƒºauts UNESCO Pasaules kult≈´ras mantojuma sarakstƒÅ un ir ievƒìrojams ar j≈´gendstila arhitekt≈´ru, kurai, pƒìc UNESCO viedokƒºa, nav lƒ´dzƒ´gu pasaulƒì. Kop≈° dibinƒÅ≈°anas 1201. gadƒÅ lƒ´dz m≈´su dienƒÅm Rƒ´ga ir Baltijas valstu lielƒÅkƒÅ pilsƒìta un viena no ievƒìrojamƒÅkajƒÅm ostƒÅm Baltijas j≈´ras austrumdaƒºƒÅ. Politiski un administratƒ´vi tƒÅ ilgu laiku bijusi reƒ£iona politiskais centrs, bet sƒÅkot ar 20. gadsimtu ‚Äî Latvijas Republikas galvaspilsƒìta."""

doc = nlp(text)

# One table row per token, visited sentence by sentence
rows = [
    {
        "Text": tok.text,
        "Lemma": tok.lemma_,
        "POS": tok.pos_,
        "Dependency": tok.dep_,
        "Head": tok.head.text,
    }
    for sentence in doc.sents
    for tok in sentence
]
df = pd.DataFrame(rows)

# Show the full table without truncating rows or cell contents
for option_name in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)
display(df)

# Sentence segmentation, numbered from 1
print("\nSentence Segmentation results:")
for number, sentence in enumerate(doc.sents, start=1):
    print(f"Sentence {number}: {sentence.text}")

In [None]:
# Per-token accuracy on the test split, reported separately for tokens whose
# surface form occurs in the training subset (IV) and tokens whose form does
# not (OOV).
# NOTE(review): if the lemma lookup table packaged with the model was built
# from train + dev + test, the LEMMA figures here are not held-out scores.
import spacy
from spacy.tokens import Doc, DocBin

# === 1. Set file paths ===
train_path = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-5pct.spacy"
test_path  = "/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/test/lv_lvtb-ud-test.spacy"

# === 2. Load the model ===
nlp = spacy.load("lv_roberta_large_5pct")

# === 3. Load the training data and build the vocabulary ===
# Case-sensitive set of every surface form seen in the training subset.
train_docs = DocBin().from_disk(train_path)
train_vocab = {
    token.text
    for doc in train_docs.get_docs(nlp.vocab)
    for token in doc
}

print(f"‚úÖ Training vocabulary size: {len(train_vocab)}")

# === 4. Load the test data ===
test_docs = DocBin().from_disk(test_path)
test_docs = list(test_docs.get_docs(nlp.vocab))
print(f"‚úÖ Number of test documents: {len(test_docs)}")

# === 5. Initialize counters for all components ===
metrics = {
    name: {"iv_total": 0, "iv_correct": 0, "oov_total": 0, "oov_correct": 0}
    for name in ("POS", "MORPH", "LEMMA", "UAS", "LAS")
}


def _tally(name, is_iv, correct):
    """Count one token for metric *name* in its IV or OOV bucket."""
    bucket = "iv" if is_iv else "oov"
    metrics[name][f"{bucket}_total"] += 1
    if correct:
        metrics[name][f"{bucket}_correct"] += 1


def _predict_with_gold_tokens(gold_doc):
    """Run the pipeline components on a new Doc that reuses the gold tokens.

    Re-tokenizing the raw text can split it differently from the treebank;
    pairing gold and predicted tokens by position would then compare
    unrelated tokens and silently drop the tail of the longer document.
    """
    pred_doc = Doc(
        nlp.vocab,
        words=[t.text for t in gold_doc],
        spaces=[bool(t.whitespace_) for t in gold_doc],
    )
    for _name, component in nlp.pipeline:
        pred_doc = component(pred_doc)
    return pred_doc


# === 6. Run predictions and calculate metrics ===
for gold_doc in test_docs:
    pred_doc = _predict_with_gold_tokens(gold_doc)
    # Both docs share one tokenization, so position i is the same token.
    for gold_token, pred_token in zip(gold_doc, pred_doc):
        is_iv = gold_token.text in train_vocab

        # POS accuracy
        _tally("POS", is_iv, gold_token.pos_ == pred_token.pos_)

        # Morphology accuracy (exact match of the full feature dictionary)
        _tally(
            "MORPH",
            is_iv,
            gold_token.morph.to_dict() == pred_token.morph.to_dict(),
        )

        # Lemmatization accuracy
        _tally("LEMMA", is_iv, gold_token.lemma_ == pred_token.lemma_)

        # Parsing: UAS needs the right head, LAS the right head and label.
        head_correct = gold_token.head.i == pred_token.head.i
        _tally("UAS", is_iv, head_correct)
        _tally("LAS", is_iv, head_correct and gold_token.dep_ == pred_token.dep_)

# === 7. Print the results ===
print("\nüéØ OOV/IV Accuracy Results")
print("--------------------------------------------------")
for comp, m in metrics.items():
    # Report 0 rather than dividing by zero when a bucket is empty.
    iv_acc = m["iv_correct"] / m["iv_total"] * 100 if m["iv_total"] > 0 else 0
    oov_acc = m["oov_correct"] / m["oov_total"] * 100 if m["oov_total"] > 0 else 0
    print(f"{comp}:")
    print(f"  IV  Accuracy  = {iv_acc:.2f}%  ({m['iv_correct']}/{m['iv_total']})")
    print(f"  OOV Accuracy = {oov_acc:.2f}%  ({m['oov_correct']}/{m['oov_total']})")
    print("--------------------------------------------------")
