### Unique word count

In [7]:
import pandas as pd
import glob
import os

# Helper functions
def read_files(pattern):
    """Read every file matching ``pattern`` into a line-level DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern handed to ``glob.glob``.

    Returns
    -------
    pandas.DataFrame
        One row per line read, with the columns ``id`` (the part of the file
        name, extension removed, that follows the last underscore) and
        ``line`` (the line with surrounding whitespace stripped). Both
        columns exist even when no file matches, so a later
        ``groupby("id")`` works on an empty result instead of raising.
    """
    rows = []
    # glob returns paths in arbitrary order; sort so row order is reproducible.
    for path in sorted(glob.glob(pattern)):
        file_id = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
        with open(path, "r", encoding="utf-8") as f:
            rows.extend({"id": file_id, "line": line.strip()} for line in f)
    return pd.DataFrame(rows, columns=["id", "line"])

def explode_words(df, text_col="line"):
    """Return ``df`` expanded to one row per whitespace-separated word.

    The values of ``text_col`` are split on whitespace and the pieces are
    placed in a new ``word`` column, one per row. Rows that yield no word
    (for example empty lines) are dropped. The original index labels are
    repeated for every word that came from the same row.
    """
    tokens_per_row = df[text_col].str.split()
    one_word_per_row = df.assign(word=tokens_per_row).explode("word")
    return one_word_per_row.dropna(subset=["word"])

# Count and print word counts.
# A "word" here is a whitespace-separated token taken as-is, so differences in
# case or attached punctuation make two tokens distinct.
gloss_df = read_files("../txt/lsl_glosses_*.txt")
sent_df  = read_files("../txt/latvian_sentences_*.txt")

# Explode each frame once and reuse the result for both the unique and the
# total counts instead of re-exploding for every statistic.
gloss_words = explode_words(gloss_df)["word"]
sent_words  = explode_words(sent_df)["word"]

gloss_unique_words = gloss_words.nunique()
sent_unique_words  = sent_words.nunique()

gloss_total_words = len(gloss_words)
sent_total_words  = len(sent_words)

print(f"Unique gloss words: {gloss_unique_words}")
print(f"Unique sentence words: {sent_unique_words}")
print(f"Total gloss words: {gloss_total_words}")
print(f"Total sentence words: {sent_total_words}")

# Compare line count per file id.
gloss_lines = gloss_df.groupby("id").size().rename("gloss_lines")
sent_lines  = sent_df.groupby("id").size().rename("sentence_lines")

# An id present on only one side becomes NaN after the concat; treat it as
# zero lines so it is reported as a mismatch rather than dropped.
line_check = (
    pd.concat([gloss_lines, sent_lines], axis=1)
      .fillna(0)
      .astype(int)
)

line_check["match"] = line_check["gloss_lines"] == line_check["sentence_lines"]

print("\nLine count comparison per ID:")
print(line_check)

# Print total line count
total_gloss_lines = len(gloss_df)
total_sentence_lines = len(sent_df)

if total_gloss_lines == total_sentence_lines:
    print(f"\n✅ Total lines: {total_gloss_lines}")
else:
    print(f"\n❌ Sentence lines: {total_sentence_lines}, gloss lines: {total_gloss_lines}")


Unique gloss words: 960
Unique sentence words: 4196
Total gloss words: 9177
Total sentence words: 8926

Line count comparison per ID:
    gloss_lines  sentence_lines  match
id                                    
1           359             359   True
2           446             446   True
3           159             159   True
4           439             439   True
5           191             191   True
6           319             319   True
7             8               8   True
8             9               9   True
9            70              70   True

✅ Total lines: 2000


### Check sentences

In [26]:
import glob
import re
from pathlib import Path

# Glob patterns for the two sets of input files (sentences and glosses).
# The paths are relative, so they resolve against the current working directory.
LV_PATTERN = "../txt/latvian_sentences_*.txt"
GLOSS_PATTERN = "../txt/lsl_glosses_*.txt"

def tokenize(text):
    """
    Lowercase ``text`` and split it into word tokens.

    A token is a run of word characters (letters, digits, underscore) and
    square brackets that contains at least one word character. A placeholder
    such as ``[NAME]`` is therefore returned as ``[name]``, while other
    punctuation and stand-alone brackets are dropped.

    Returns a list of tokens in order of appearance (empty for empty input).
    """
    # Do not wrap the pattern in \b anchors: a word boundary never exists
    # between a bracket and adjacent whitespace/punctuation, so anchoring
    # would strip the brackets from "[name]" and return just "name".
    return re.findall(r"[\[\]]*\w[\w\[\]]*", text.lower())

# Sentence and gloss files are paired by position in the two sorted listings.
lv_files = sorted(glob.glob(LV_PATTERN))
gloss_files = sorted(glob.glob(GLOSS_PATTERN))

assert len(lv_files) == len(gloss_files), "Mismatch in number of sentence and gloss files"

# Word whose number of occurrences must agree between a sentence and its gloss.
TARGET_WORD = "mums"

mismatches = []

for lv_path, gloss_path in zip(lv_files, gloss_files):
    lv_name = Path(lv_path).name
    gloss_name = Path(gloss_path).name

    # Equal file counts do not guarantee the sorted lists line up. Compare the
    # part after the last underscore of both names so a sentence file is never
    # checked against the glosses of a different id.
    lv_id = Path(lv_path).stem.rsplit("_", 1)[-1]
    gloss_id = Path(gloss_path).stem.rsplit("_", 1)[-1]
    assert lv_id == gloss_id, f"File id mismatch: {lv_name} is paired with {gloss_name}"

    with open(lv_path, "r", encoding="utf-8") as lv_f, \
         open(gloss_path, "r", encoding="utf-8") as gloss_f:

        lv_lines = lv_f.readlines()
        gloss_lines = gloss_f.readlines()

    assert len(lv_lines) == len(gloss_lines), (
        f"Line count mismatch in {lv_name} and {gloss_name}"
    )

    # Line numbers are 1-based so they match what a text editor shows.
    for idx, (lv_line, gloss_line) in enumerate(zip(lv_lines, gloss_lines), start=1):
        lv_count = tokenize(lv_line).count(TARGET_WORD)
        gloss_count = tokenize(gloss_line).count(TARGET_WORD)

        if lv_count != gloss_count:
            mismatches.append({
                "file": lv_name,
                "line_number": idx,
                "sentence": lv_line.strip(),
                "gloss": gloss_line.strip(),
                # Key names are unchanged; the values count TARGET_WORD.
                "sentence_ir_count": lv_count,
                "gloss_ir_count": gloss_count,
            })

print(f"Total mismatches found: {len(mismatches)}")

# Show at most the first ten mismatches to keep the output short.
for m in mismatches[:10]:
    print("-" * 60)
    print(f"File: {m['file']} | Line: {m['line_number']}")
    print(f"Sentence ({m['sentence_ir_count']}): {m['sentence']}")
    print(f"Gloss    ({m['gloss_ir_count']}): {m['gloss']}")

Total mismatches found: 1
------------------------------------------------------------
File: latvian_sentences_4.txt | Line: 143
Sentence (1): Pavasarī mums ir prieks dāvāt ziedus un skaistus vārdus mūsu tuvajiem cilvēkiem.
Gloss    (2): pavasaris mums ir prieks dāvināt zieds un skaists vārds mums tuvs cilvēki


In [None]:
import glob
import re
from pathlib import Path

# Glob patterns for the two sets of input files (sentences and glosses).
# The paths are relative, so they resolve against the current working directory.
LV_PATTERN = "../txt/latvian_sentences_*.txt"
GLOSS_PATTERN = "../txt/lsl_glosses_*.txt"

def tokenize(text):
    """
    Lowercase ``text`` and split it into word tokens.

    A token is a run of word characters (letters, digits, underscore) and
    square brackets that contains at least one word character. A placeholder
    such as ``[NAME]`` is therefore returned as ``[name]``, while other
    punctuation and stand-alone brackets are dropped.

    Returns a list of tokens in order of appearance (empty for empty input).
    """
    # Do not wrap the pattern in \b anchors: a word boundary never exists
    # between a bracket and adjacent whitespace/punctuation, so anchoring
    # would strip the brackets from "[name]" and return just "name".
    return re.findall(r"[\[\]]*\w[\w\[\]]*", text.lower())

# Both listings are sorted so that the i-th sentence file is compared with the
# i-th gloss file.
lv_files = sorted(glob.glob(LV_PATTERN))
gloss_files = sorted(glob.glob(GLOSS_PATTERN))

assert len(lv_files) == len(gloss_files), "Mismatch in number of sentence and gloss files"

# TARGET_FORMS: word forms counted in the sentence.
# GLOSS_FORM:   the single gloss form their combined count is compared with.
# Alternative settings kept for reference (swap in to run another check):
#   TARGET_FORMS = {"esmu", "esam", "esi"}                               GLOSS_FORM = "esmu"
#   TARGET_FORMS = {"tas", "tur", "turieni", "tā", "šī", "šis", "to"}    GLOSS_FORM = "tas"
TARGET_FORMS = {"labs", "labi", "laba"}
GLOSS_FORM = "labs"

mismatches = []

for lv_path, gloss_path in zip(lv_files, gloss_files):
    with open(lv_path, "r", encoding="utf-8") as lv_f, \
         open(gloss_path, "r", encoding="utf-8") as gloss_f:
        lv_lines = lv_f.readlines()
        gloss_lines = gloss_f.readlines()

    assert len(lv_lines) == len(gloss_lines), (
        f"Line count mismatch in {Path(lv_path).name} and {Path(gloss_path).name}"
    )

    for idx, (lv_line, gloss_line) in enumerate(zip(lv_lines, gloss_lines), start=1):
        lv_tokens = tokenize(lv_line)
        gloss_tokens = tokenize(gloss_line)

        # Every sentence token that is one of the target forms counts once.
        sentence_count = sum(1 for token in lv_tokens if token in TARGET_FORMS)
        gloss_count = sum(1 for token in gloss_tokens if token == GLOSS_FORM)

        # Only sentences that actually contain a target form are checked.
        if sentence_count == 0 or sentence_count == gloss_count:
            continue

        mismatches.append(dict(
            file=Path(lv_path).name,
            line_number=idx,
            sentence=lv_line.strip(),
            gloss=gloss_line.strip(),
            sentence_count=sentence_count,
            gloss_count=gloss_count,
        ))

print(f"Total mismatches found: {len(mismatches)}")

# Report no more than the first ten mismatches.
for m in mismatches[:10]:
    print(
        "-" * 60,
        f"File: {m['file']} | Line: {m['line_number']}",
        f"Sentence ({m['sentence_count']}): {m['sentence']}",
        f"Gloss    ({m['gloss_count']}): {m['gloss']}",
        sep="\n",
    )


Total mismatches found: 1
------------------------------------------------------------
File: latvian_sentences_1.txt | Line: 88
Sentence (1): Es jūtos labi.
Gloss    (0): es justies slikts


### Glosses that occur only once across the gloss files

In [6]:
import pandas as pd
from glob import glob

# Gather every whitespace-separated token from all gloss files
words = []
for path in glob("../txt/lsl_glosses_*.txt"):
    with open(path, "r", encoding="utf-8") as f:
        words += f.read().split()

df = pd.DataFrame(words, columns=["word"])

# Frequency table with one row per distinct token: columns "word" and "count"
word_counts = (
    df["word"]
    .value_counts()
    .rename_axis("word")
    .reset_index(name="count")
)

# Keep the tokens seen at most once and list them alphabetically
seen_once = word_counts["count"] <= 1
filtered = word_counts[seen_once].sort_values(by="word")

print(filtered["word"].to_string(index=False))
print("\nTotal words printed:", len(filtered))


    aizpildīt
      aizstāt
           ak
       apkārt
     apraksts
     apvienot
       armija
     atbalsts
  atgriezties
        atkal
       atpūta
    attīstība
      atļauja
    atšķirība
      avokado
        avīze
        burti
         cept
       cukurs
      cēlonis
       dalība
       dators
     deficīts
        dejot
     depozīts
        dievs
       drīkst
        dzeja
      dziedāt
      dziesma
       dāvana
    ekonomika
   emocionāls
   evakuācija
         eļļa
      fizisks
     francija
      godināt
   greipfrūts
      gruzija
      grāmata
        ieeja
       indīgs
    infekcija
      ingvers
  instrukcija
  interesants
    internets
      izdomāt
      izmaksa
      izmisis
  izvairīties
       jebkur
         joks
jēzus_kristus
       kafija
        kalns
        kauls
         kaut
        klase
       kleita
      klimats
     klātiene
 komunikācija
     koncerts
    konflikts
     konkrēts
     kontrole
       kopējs
       krasts
     krievija
      