### Unique word count

In [10]:
import pandas as pd
import glob
import os

# Helper functions
def read_files(pattern):
    """Read every text file matching ``pattern`` into a one-row-per-line DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the files to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``line``. ``id`` is the part of the file name after
        the last underscore (extension removed); ``line`` is the line with
        surrounding whitespace stripped. Blank lines are kept as empty strings,
        so the number of rows per ``id`` equals the number of lines in the file.
        Both columns are present even when no file matches ``pattern``.
    """
    # Imported locally under a private alias: another cell in this notebook
    # runs `from glob import glob`, which rebinds the notebook-level name
    # `glob` to the function and would make `glob.glob(...)` fail here.
    from glob import glob as _glob

    rows = []
    # Sort the matches so the row order does not depend on directory order.
    for path in sorted(_glob(pattern)):
        file_id = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                rows.append({"id": file_id, "line": line.strip()})
    # Explicit columns keep the schema stable when nothing matched, so
    # downstream `df["line"]` / `groupby("id")` do not raise KeyError.
    return pd.DataFrame(rows, columns=["id", "line"])

def explode_words(df, text_col="line"):
    """Return ``df`` expanded to one row per whitespace-separated word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to tokenize.
    text_col : str, default "line"
        Name of the column whose strings are split into words.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``word`` column. Each input row is
        repeated once per word (keeping its original index label); rows whose
        text yields no word are dropped.
    """
    tokens = df[text_col].str.split()
    per_word = df.assign(word=tokens).explode("word")
    # Rows with nothing to split end up with a missing `word`; discard them.
    return per_word.dropna(subset=["word"])

# Count and print word counts
gloss_df = read_files("../txt/lsl_glosses_*.txt")
sent_df  = read_files("../txt/latvian_sentences_*.txt")

# Explode each frame once and reuse the resulting word column for both the
# unique and the total count, instead of re-exploding for every statistic.
gloss_words = explode_words(gloss_df)["word"]
sent_words  = explode_words(sent_df)["word"]

gloss_unique_words = gloss_words.nunique()
sent_unique_words  = sent_words.nunique()

gloss_total_words = len(gloss_words)
sent_total_words  = len(sent_words)

print(f"Unique gloss words: {gloss_unique_words}")
print(f"Unique sentence words: {sent_unique_words}")
print(f"Total gloss words: {gloss_total_words}")
print(f"Total sentence words: {sent_total_words}")

# Compare line count: number of rows per file ID in each corpus
gloss_lines = gloss_df.groupby("id").size().rename("gloss_lines")
sent_lines  = sent_df.groupby("id").size().rename("sentence_lines")

# Align both counts on ID; an ID present on only one side gets 0 for the
# other, and `match` flags the IDs whose two counts agree.
line_check = (
    pd.concat([gloss_lines, sent_lines], axis=1)
      .fillna(0)
      .astype(int)
      .assign(match=lambda counts: counts["gloss_lines"] == counts["sentence_lines"])
)

print("\nLine count comparison per ID:")
print(line_check)

# Print total line count: one figure when both corpora agree, both otherwise
total_gloss_lines, total_sentence_lines = len(gloss_df), len(sent_df)

if total_gloss_lines != total_sentence_lines:
    print(f"\n❌ Sentence lines: {total_sentence_lines}, gloss lines: {total_gloss_lines}")
else:
    print(f"\n✅ Total lines: {total_gloss_lines}")


Unique gloss words: 750
Unique sentence words: 3107
Total gloss words: 6897
Total sentence words: 6734

Line count comparison per ID:
    gloss_lines  sentence_lines  match
id                                    
1           341             341   True
2           443             443   True
3           158             158   True
4           408             408   True
5           175             175   True
6           134             134   True
7             5               5   True
8             9               9   True
9             1               1   True

✅ Total lines: 1674


### Glosses that occur only once across all gloss files

In [7]:
import pandas as pd
from glob import glob

# Read all gloss files into one flat list of whitespace-separated tokens.
# The paths are sorted so the token order, and therefore the order of equally
# frequent words in the listing below, does not depend on directory order.
words = []
for path in sorted(glob("../txt/lsl_glosses_*.txt")):
    with open(path, "r", encoding="utf-8") as f:
        words.extend(f.read().split())

df = pd.DataFrame(words, columns=["word"])

# Frequency table: one row per distinct word with its number of occurrences
word_counts = df["word"].value_counts().reset_index()
word_counts.columns = ["word", "count"]

# Keep only the words that occur exactly once (a count is never below 1)
filtered = word_counts[word_counts["count"] <= 1]

total_words = len(filtered)

print(filtered["word"].to_string(index=False))

print("\nTotal words printed:", total_words)

         lauki
         sūtīt
  organizācija
        ielikt
         kakls
       pavisam
          seja
       plaušas
       vēstule
       nelaime
       ķirurgs
       atļauja
       priekša
         sauss
       brālēns
     izrakstīt
     pieraksts
       piliens
         slogs
       klimats
    pieslēgums
       paklupt
    pensionārs
      apvienot
      paradums
        cukurs
      kontrole
   piedalīties
       kaitīgs
      tīrīšana
     uzklausīt
        orgāns
     elektrība
    pieklājība
    velosipēds
         ražot
         gulēt
    nevajadzēt
         asins
        kopumā
     steigties
     iekaisums
         grīda
       trūkums
      turpināt
     aizpildīt
     vienkāršs
          kāja
          ērts
        krūtis
         rādīt
         miegs
          tāds
    vingrošana
      pieejams
          cept
          kūka
      pasniegt
        kopējs
        atpūta
      koncerts
       valdība
tīmekļa_vietne
        krasts
        dāvana
          jūra
        sk