### Unique word count

In [3]:
import pandas as pd
import glob
import os

# Helper functions
def read_files(pattern):
    """Read every text file matching ``pattern`` into a one-row-per-line frame.

    Parameters
    ----------
    pattern : str
        Glob pattern of the files to read (UTF-8 text).

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``line``. ``id`` is the last ``_``-separated token
        of the file name without its extension; ``line`` is the line with
        surrounding whitespace stripped (blank lines are kept as ``""``).
        The two columns are present even when no file matches, so callers
        can still select or group by them.
    """
    rows = []
    # glob.glob returns paths in arbitrary order; sort for a reproducible row order.
    for path in sorted(glob.glob(pattern)):
        file_id = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
        with open(path, "r", encoding="utf-8") as f:
            rows.extend({"id": file_id, "line": line.strip()} for line in f)
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["id", "line"])

def explode_words(df, text_col="line"):
    """Return ``df`` in long form with one row per whitespace-separated word.

    The text in ``text_col`` is split on whitespace and stored in a new
    ``word`` column, which is then exploded so each word gets its own row.
    Rows whose text produced no word are dropped. The original index values
    are repeated for every word that came from the same row.
    """
    tokens = df[text_col].str.split()
    per_word = df.assign(word=tokens).explode("word")
    # Empty token lists explode to a missing value; discard those rows.
    return per_word.dropna(subset=["word"])

# Load both corpora: one row per line, tagged with the id taken from the file name
gloss_df = read_files("../txt/lsl_glosses_*.txt")
sent_df  = read_files("../txt/latvian_sentences_*.txt")

# Tokenise each corpus once; the unique and the total counts both reuse it
gloss_words = explode_words(gloss_df)["word"]
sent_words  = explode_words(sent_df)["word"]

gloss_unique_words = gloss_words.nunique()
sent_unique_words  = sent_words.nunique()

gloss_total_words = len(gloss_words)
sent_total_words  = len(sent_words)

print(f"Unique gloss words: {gloss_unique_words}")
print(f"Unique sentence words: {sent_unique_words}")
print(f"Total gloss words: {gloss_total_words}")
print(f"Total sentence words: {sent_total_words}")

# Compare the number of lines per file id between the two corpora
gloss_lines = gloss_df.groupby("id").size().rename("gloss_lines")
sent_lines  = sent_df.groupby("id").size().rename("sentence_lines")

# An id present in only one corpus has no count on the other side;
# fillna(0) turns that gap into an explicit zero before the int cast.
line_check = (
    pd.concat([gloss_lines, sent_lines], axis=1)
      .fillna(0)
      .astype(int)
)

line_check["match"] = line_check["gloss_lines"] == line_check["sentence_lines"]

print("\nLine count comparison per ID:")
print(line_check)

# Compare the overall line totals. Equal totals do not by themselves prove
# that every id matches; the per-ID table printed above shows that.
total_gloss_lines = len(gloss_df)
total_sentence_lines = len(sent_df)

if total_gloss_lines == total_sentence_lines:
    print(f"\n✅ Total lines: {total_gloss_lines}")
else:
    print(f"\n❌ Sentence lines: {total_sentence_lines}, gloss lines: {total_gloss_lines}")


Unique gloss words: 618
Unique sentence words: 2435
Total gloss words: 5194
Total sentence words: 5200

Line count comparison per ID:
    gloss_lines  sentence_lines  match
id                                    
1           286             286   True
2           437             437   True
3           154             154   True
4           386             386   True
5             8               8   True
6           110             110   True
7             5               5   True

✅ Total lines: 1386


### Glosses that occur only once across the gloss files

In [4]:
import pandas as pd
from glob import glob

# Collect every whitespace-separated token from all gloss files
words = []
for gloss_path in glob("../txt/lsl_glosses_*.txt"):
    with open(gloss_path, "r", encoding="utf-8") as handle:
        words += handle.read().split()

df = pd.DataFrame({"word": words})

# Frequency table: one row per distinct token with its number of occurrences
word_counts = (
    df["word"]
    .value_counts()
    .rename_axis("word")
    .reset_index(name="count")
)

# Keep only the tokens that occur a single time
filtered = word_counts.loc[word_counts["count"] <= 1]

print(filtered.to_string(index=False))

        word  count
      kreiss      1
 svārstīties      1
    devītais      1
      krūtis      1
     paklupt      1
        zars      1
       kakls      1
        mala      1
      skaits      1
        seja      1
    gadījums      1
       sauss      1
        egle      1
       celis      1
        kāja      1
     trūkums      1
   izrakstīt      1
   republika      1
     piliens      1
proklamācija      1
     plaušas      1
   skatīties      1
   pieraksts      1
     ķirurgs      1
  asociācija      1
  pieslēgums      1
     klimats      1
        jūra      1
       glābt      1
      krasts      1
     kādreiz      1
          ne      1
      neilgs      1
    pieejams      1
        joks      1
         ...      1
    atkārtot      1
       vidus      1
     izmaksa      1
          ja      1
       konts      1
     izmisis      1
   situācija      1
zīmju_valoda      1
    ražošana      1
         900      1
      stress      1
  emocionāls      1
        reti      1
