In [1]:
# ======================================
# Cell 1：Convert conllu to spaCy format
# ======================================
!python -m spacy convert /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/UD_Latvian_LVTB/lv_lvtb-ud-train.conllu ./train -n 10 
!python -m spacy convert /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/UD_Latvian_LVTB/lv_lvtb-ud-dev.conllu ./train -n 10 
!python -m spacy convert /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/UD_Latvian_LVTB/lv_lvtb-ud-test.conllu ./train -n 10

# For testing
!python -m spacy convert /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/UD_Latvian_LVTB/lv_lvtb-ud-test.conllu ./test
"""
Using a lookup table for lemmatization matches words solely based on their surface form (or lowercase),
without considering context. In longer documents (multiple sentences or complex structures):

    - spaCy's lemmatization may be indirectly affected by pipeline processing and Vocab caching. 
      For example, repeated tokens or subtle variations in capitalization/punctuation can lead 
      to lookup misses.
    - Some compound or modified words might not exist in the lookup table.

As a result, longer documents increase the likelihood of lookup failures, reducing overall lemma accuracy.

To balance this, during training we group 10 sentences per Doc to provide richer context for
sentence segmentation learning. For evaluating lemma performance, however, we use a test set
with one sentence per Doc, which isolates lemma accuracy from potential inter-sentence effects.
"""



print("All conllu files are converted to spaCy Format.")


[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1506 documents):
train/lv_lvtb-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (211 documents): train/lv_lvtb-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (242 documents):
train/lv_lvtb-ud-test.spacy[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (2412 documents):
test/lv_lvtb-ud-test.spacy[0m
All conllu files are converted to spaCy Format.


In [3]:
# ===============================================
# Cell 2: Split training data (e.g., 50%, 20%)
# ===============================================
import spacy
from spacy.tokens import DocBin
from pathlib import Path

# Directory that holds the converted training corpus and receives the subsets.
corpus_train_path = Path("/home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train")

# Input: the full converted training corpus.
train_path = corpus_train_path / "lv_lvtb-ud-train.spacy"

# Outputs: one .spacy file per subset size.
train_100_path = corpus_train_path / "lv_lvtb-ud-train-100pct.spacy"
train_50_path = corpus_train_path / "lv_lvtb-ud-train-50pct.spacy"
train_20_path = corpus_train_path / "lv_lvtb-ud-train-20pct.spacy"
train_5_path = corpus_train_path / "lv_lvtb-ud-train-5pct.spacy"


def save_leading_subset(all_docs: list, divisor: int, out_path: Path) -> list:
    """Write the first ``len(all_docs) // divisor`` docs to ``out_path``.

    Args:
        all_docs: Docs in corpus order.
        divisor: Integer the corpus length is floor-divided by
            (1 -> 100%, 2 -> 50%, 5 -> 20%, 20 -> 5%).
        out_path: Destination ``.spacy`` file.

    Returns:
        The list of docs that was written (a leading slice of ``all_docs``).

    Raises:
        ValueError: If ``divisor`` is smaller than 1.
    """
    if divisor < 1:
        raise ValueError(f"divisor must be >= 1, got {divisor}")
    subset = all_docs[: len(all_docs) // divisor]
    DocBin(docs=subset).to_disk(out_path)
    return subset


# Load corpora; a blank Latvian pipeline supplies the vocab for deserialization.
nlp_blank = spacy.blank("lv")
docbin = DocBin().from_disk(train_path)
docs = list(docbin.get_docs(nlp_blank.vocab))

# Subsets are always the leading docs (no shuffling), so each smaller subset
# is a prefix of the larger ones.
docs_100pct = save_leading_subset(docs, 1, train_100_path)
docs_50pct = save_leading_subset(docs, 2, train_50_path)
docs_20pct = save_leading_subset(docs, 5, train_20_path)
docs_5pct = save_leading_subset(docs, 20, train_5_path)

# Report the reduced subsets (the 100% copy is written but not reported).
for pct, out_path, subset in (
    (50, train_50_path, docs_50pct),
    (20, train_20_path, docs_20pct),
    (5, train_5_path, docs_5pct),
):
    print(f"✅ Saved first {pct}% of training data to: {out_path}")
    print(f"Original samples: {len(docs)} | New subset: {len(subset)}")
    print()



✅ Saved first 50% of training data to: /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-50pct.spacy
Original samples: 1506 | New subset: 753
✅ Saved first 20% of training data to: /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-20pct.spacy
Original samples: 1506 | New subset: 301
✅ Saved first 5% of training data to: /home/jesse/Projects/myprojs/Master_Thesis/Fengdi_Huang_Master_Thesis_Repo/Corpus/train/lv_lvtb-ud-train-5pct.spacy
Original samples: 1506 | New subset: 75
