### Unique word count

In [17]:
import pandas as pd
import glob
import os

# Helper functions
def read_files(pattern):
    """Read every text file matching ``pattern`` into one row-per-line DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the files to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``line``. ``id`` is the part of the file name
        (extension removed) after the last underscore; ``line`` is the line
        with surrounding whitespace stripped. Blank lines are kept as
        empty-string rows, so the row count equals the number of lines read.
        When nothing matches, the frame is empty but still has both columns.
    """
    # Function-scope import: the notebook later runs ``from glob import glob``,
    # which rebinds the global name ``glob`` to a function. Resolving the module
    # locally keeps this helper working regardless of cell execution order.
    import glob as _glob

    rows = []
    # glob returns paths in arbitrary order; sort for reproducible row order.
    for path in sorted(_glob.glob(pattern)):
        file_id = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
        with open(path, "r", encoding="utf-8") as f:
            rows.extend({"id": file_id, "line": line.strip()} for line in f)

    # Explicit columns so an empty result still exposes "id" and "line"
    # (downstream code indexes and groups by these names).
    return pd.DataFrame(rows, columns=["id", "line"])

def explode_words(df, text_col="line"):
    """Return ``df`` expanded to one row per whitespace-separated word.

    The text in ``text_col`` is split on whitespace and each token is placed
    in a new ``word`` column; the other columns (and the original index
    label) are repeated for every token. Rows that yield no token, such as
    empty strings or missing text, are dropped.
    """
    tokens_per_row = df[text_col].str.split()
    one_row_per_token = df.assign(word=tokens_per_row).explode("word")
    # Rows without any token come out of explode() as NaN; discard them.
    return one_row_per_token.dropna(subset=["word"])

# Load both corpora: one row per line, tagged with the id taken from the file name.
gloss_df = read_files("../txt/lsl_glosses_*.txt")
sent_df  = read_files("../txt/latvian_sentences_*.txt")

# Tokenise each corpus once and reuse the result for both the unique and the
# total counts (previously every corpus was exploded twice).
gloss_words = explode_words(gloss_df)["word"]
sent_words  = explode_words(sent_df)["word"]

gloss_unique_words = gloss_words.nunique()
sent_unique_words  = sent_words.nunique()

gloss_total_words = len(gloss_words)
sent_total_words  = len(sent_words)

print(f"Unique gloss words: {gloss_unique_words}")
print(f"Unique sentence words: {sent_unique_words}")
print(f"Total gloss words: {gloss_total_words}")
print(f"Total sentence words: {sent_total_words}")

# Compare the number of lines per file id between the two corpora.
gloss_lines = gloss_df.groupby("id").size().rename("gloss_lines")
sent_lines  = sent_df.groupby("id").size().rename("sentence_lines")

# concat aligns on id; an id present in only one corpus gets NaN, which is
# turned into 0 so it shows up as a mismatch instead of a missing value.
line_check = (
    pd.concat([gloss_lines, sent_lines], axis=1)
      .fillna(0)
      .astype(int)
)

line_check["match"] = line_check["gloss_lines"] == line_check["sentence_lines"]

print("\nLine count comparison per ID:")
print(line_check)

# Overall line totals: a single figure when they agree, both figures otherwise.
total_gloss_lines = len(gloss_df)
total_sentence_lines = len(sent_df)

if total_gloss_lines == total_sentence_lines:
    print(f"\n✅ Total lines: {total_gloss_lines}")
else:
    print(f"\n❌ Sentence lines: {total_sentence_lines}, gloss lines: {total_gloss_lines}")


Unique gloss words: 518
Unique sentence words: 2085
Total gloss words: 4331
Total sentence words: 4339

Line count comparison per ID:
    gloss_lines  sentence_lines  match
id                                    
1           260             260   True
2           367             367   True
3           149             149   True
4           376             376   True
5            37              37   True

✅ Total lines: 1189


### Count gloss occurrences across the gloss files and list those seen only once

In [20]:
import pandas as pd
from glob import glob

# Gather every whitespace-separated token from all gloss files into one flat list.
words = []
for path in glob("../txt/lsl_glosses_*.txt"):
    with open(path, "r", encoding="utf-8") as f:
        words += f.read().split()

df = pd.DataFrame(words, columns=["word"])

# Frequency table with one row per distinct word, most frequent first.
word_counts = (
    df["word"]
    .value_counts()
    .rename_axis("word")
    .reset_index(name="count")
)

# Keep only the words that occur at most once.
filtered = word_counts.loc[word_counts["count"] <= 1]

print(filtered.to_string(index=False))

        word  count
proklamācija      1
        egle      1
  asociācija      1
zīmju_valoda      1
     dažreiz      1
       gadīt      1
       sāpēt      1
       kakls      1
      sasist      1
   skatīties      1
     trūkums      1
   republika      1
        joks      1
     1000000      1
    students      1
        3000      1
   ventspils      1
        2000      1
   uzskatāms      1
       ārsts      1
    pēdējais      1
       vidus      1
 iedzīvotājs      1
    devītais      1
     kurzeme      1
 svārstīties      1
     pareizs      1
   septītais      1
       tulks      1
     vasaras      1
