### Unique word count

In [13]:
import pandas as pd
import glob
import os

# Helper functions
def read_files(pattern):
    """Read every text file matching ``pattern`` into a line-level DataFrame.

    The file id is the part of the file name (extension removed) that follows
    the last underscore, e.g. ``lsl_glosses_2.txt`` -> ``"2"``.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the files to read.

    Returns
    -------
    pandas.DataFrame
        One row per line, with columns ``id`` (file id as a string) and
        ``line`` (the line with surrounding whitespace stripped). Blank lines
        are kept as empty strings. If no file matches, the frame is empty but
        still carries both columns.
    """
    rows = []
    # glob.glob() returns matches in arbitrary order; sort so the row order is
    # reproducible across machines and filesystems.
    for path in sorted(glob.glob(pattern)):
        file_id = os.path.splitext(os.path.basename(path))[0].split("_")[-1]
        with open(path, "r", encoding="utf-8") as f:
            rows.extend({"id": file_id, "line": line.strip()} for line in f)
    # Explicit columns keep the schema stable when nothing matched (rows == []),
    # so callers can still select "id" / "line" without a KeyError.
    return pd.DataFrame(rows, columns=["id", "line"])

def explode_words(df, text_col="line"):
    """Return ``df`` expanded to one row per whitespace-separated token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to tokenize.
    text_col : str, default "line"
        Name of the column whose strings are split on whitespace.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``word`` column. Each input row is
        repeated once per token (keeping its original index label); rows that
        produce no token are dropped.
    """
    tokens = df[text_col].str.split()
    per_word = df.assign(word=tokens).explode("word")
    # Rows with no tokens explode to a missing value; discard those.
    return per_word.dropna(subset=["word"])

# Count and print word counts
gloss_df = read_files("../txt/lsl_glosses_*.txt")
sent_df  = read_files("../txt/latvian_sentences_*.txt")

# Explode each corpus once and reuse the result for both the unique and the
# total counts instead of re-tokenizing the same frame twice.
gloss_words = explode_words(gloss_df)["word"]
sent_words  = explode_words(sent_df)["word"]

gloss_unique_words = gloss_words.nunique()
sent_unique_words  = sent_words.nunique()

gloss_total_words = gloss_words.shape[0]
sent_total_words  = sent_words.shape[0]

print(f"Unique gloss words: {gloss_unique_words}")
print(f"Unique sentence words: {sent_unique_words}")
print(f"Total gloss words: {gloss_total_words}")
print(f"Total sentence words: {sent_total_words}")

# Compare line count
gloss_lines = gloss_df.groupby("id").size().rename("gloss_lines")
sent_lines  = sent_df.groupby("id").size().rename("sentence_lines")

# Align the two per-id counts side by side; an id present in only one of the
# two file sets has no count on the other side, which is reported as 0 lines.
line_check = (
    pd.concat([gloss_lines, sent_lines], axis=1)
      .fillna(0)
      .astype(int)
)

# True where the gloss file and the sentence file have the same number of lines.
line_check["match"] = line_check["gloss_lines"] == line_check["sentence_lines"]

print("\nLine count comparison per ID:")
print(line_check)


Unique gloss words: 292
Unique sentence words: 1099
Total gloss words: 2317
Total sentence words: 2318

Line count comparison per ID:
    gloss_lines  sentence_lines  match
id                                    
1           241             241   True
2           309             309   True
3            85              85   True


### Gloss occurrence counts: glosses that appear only once across the gloss files

In [3]:
import pandas as pd
from glob import glob

# Read all gloss files
words = []
# glob() returns matches in arbitrary order; sorting fixes the order in which
# tokens are collected, so the result does not depend on how the filesystem
# happens to enumerate the files.
for path in sorted(glob("../txt/lsl_glosses_*.txt")):
    with open(path, "r", encoding="utf-8") as f:
        # Whitespace-split the whole file: every token counts as one gloss.
        words.extend(f.read().split())

df = pd.DataFrame(words, columns=["word"])

# Frequency table with one row per distinct gloss.
word_counts = df["word"].value_counts().reset_index()
word_counts.columns = ["word", "count"]

# Keep only the glosses that occur a single time in the whole corpus.
filtered = word_counts[word_counts["count"] <= 1]

print(filtered.to_string(index=False))

     word  count
     lēts      1
 darbnīca      1
     reti      1
     kurš      1
    jauns      1
  lielums      1
     kopt      1
  violets      1
    melns      1
 produkts      1
    glīts      1
      īss      1
diennakts      1
       20      1
 izmantot      1
