In [2]:
import glob
import os
import json
import re

# --- Configuration ---
# Destination of the generated vocabulary file (JSON).
OUTPUT_FILE = "../models/mt5-pruned/words_list.json"
# Glob pattern selecting the plain-text files that are scanned for words.
TEXT_DATA_PATH = "../txt/*.txt"

def create_word_list(text_data_path=None, output_file=None):
    """Collect the unique lowercase words of a text corpus into a JSON file.

    Every file matching the glob pattern is read line by line as UTF-8.
    Each line is lowercased and tokenized: a run of word characters fully
    wrapped in square brackets (``[NAME]``, stored lowercased as ``[name]``)
    is kept as one token, any other run of word characters is a plain word,
    and punctuation -- including stray, unbalanced brackets -- is dropped.
    The unique tokens are sorted and written to the output file as a JSON
    array, with non-ASCII characters written literally rather than escaped.
    Progress and a summary are printed to stdout.

    Args:
        text_data_path: Glob pattern of the files to scan. Defaults to the
            module-level ``TEXT_DATA_PATH``.
        output_file: Path of the JSON file to write. Defaults to the
            module-level ``OUTPUT_FILE``. Missing parent directories are
            created.

    Raises:
        OSError: If an input file cannot be read or the output file (or its
            directory) cannot be created.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for a plain ``create_word_list()`` call.
    if text_data_path is None:
        text_data_path = TEXT_DATA_PATH
    if output_file is None:
        output_file = OUTPUT_FILE

    # Try the bracketed alternative first so "[name]" stays a single token;
    # a lone "[" or "]" matches neither alternative and is discarded instead
    # of being glued onto the neighbouring word.
    token_pattern = re.compile(r"\[\w+\]|\w+")

    unique_words = set()

    # Find all matching text files
    files = glob.glob(text_data_path)
    print(f"Scanning {len(files)} files...")

    for file_path in files:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Lowercase so that 'Māja' and 'māja' count as the same word.
                unique_words.update(token_pattern.findall(line.lower()))

    sorted_words = sorted(unique_words)

    # Make sure the destination directory exists; otherwise open() below
    # fails with FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save as JSON, keeping non-ASCII letters human-readable.
    with open(output_file, "w", encoding="utf-8") as j:
        json.dump(sorted_words, j, ensure_ascii=False, indent=4)

    print("--- Done! ---")
    print(f"Total unique words found: {len(sorted_words)}")
    print(f"List saved to: {output_file}")

# Build the word list only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    create_word_list()

Scanning 18 files...
--- Done! ---
Total unique words found: 3471
List saved to: ../models/mt5-pruned/words_list.json
