In [1]:
import os

# Climb three directory levels from the notebook's starting directory
# (presumably to the repository root, so the relative paths and the `app`
# package used by later cells resolve -- TODO confirm).
# NOTE(review): this is relative to the current directory, so re-running the
# cell without restarting the kernel climbs three more levels.
for _ in range(3):
    os.chdir("..")


In [140]:
import tkinter as tk

class WordBaseApp(tk.Tk):
    """Main window for browsing a small catalogue of words and phrases.

    The catalogue in ``self.data`` is nested as
    ``type -> category -> subcategory -> list of entries``. The main screen
    shows the categories of the selected type as collapsible buttons; choosing
    a subcategory replaces the screen with the list of its entries.
    """

    def __init__(self):
        """Create the window, load the hard-coded sample data and draw the UI."""
        super().__init__()
        self.title("Base de Palavras e Frases")
        self.geometry("600x600")

        # Key of ``self.data`` currently shown on the main screen.
        self.current_type = "words"
        # Sample catalogue: type -> category -> subcategory -> entries.
        self.data = {
            "words": {
                "Casa": {
                    "Partes": ["porta", "janela", "telhado"],
                    "Cômodos": ["sala", "quarto", "cozinha"],
                },
                "Trabalho": {
                    "Profissões": ["engenheiro", "professor"],
                },
            },
            "phrases": {
                "Saudações": {
                    "Informal": ["Oi!", "E aí?"],
                    "Formal": ["Bom dia", "Como vai?"],
                },
            },
        }

        self.build_ui()

    def build_ui(self):
        """Discard every widget in the window and rebuild the main screen."""
        for child in self.winfo_children():
            child.destroy()

        # Top bar: one button per entry type.
        selector_bar = tk.Frame(self)
        selector_bar.pack(pady=10)

        for label, type_key in (("Palavras", "words"), ("Frases", "phrases")):
            selector = tk.Button(
                selector_bar,
                text=label,
                # Bind the key at definition time so each button keeps its own type.
                command=lambda type_key=type_key: self.switch_type(type_key),
            )
            selector.pack(side="left", padx=5)

        # Area that holds the collapsible category list.
        self.content_frame = tk.Frame(self)
        self.content_frame.pack(fill="both", expand=True, pady=10)

        self.draw_categories()

    def switch_type(self, type_name):
        """Select another entry type and redraw the category list."""
        self.current_type = type_name
        self.draw_categories()

    def _category_toggle(self, container, category, subcategories, expanded):
        """Return the callback that expands or collapses one category.

        Args:
            container: Frame holding the category button and, when expanded,
                one button per subcategory.
            category: Category name forwarded to ``show_word_list``.
            subcategories: Mapping of subcategory name to its list of entries.
            expanded: ``tk.BooleanVar`` remembering whether the category is open.
        """

        def toggle():
            if not expanded.get():
                for sub_name, entries in subcategories.items():
                    sub_button = tk.Button(
                        container,
                        text=f"   {sub_name}",
                        anchor="w",
                        command=lambda sub_name=sub_name, entries=entries: self.show_word_list(
                            category=category, subcategory=sub_name, words=entries
                        ),
                    )
                    sub_button.pack(fill="x")
                expanded.set(True)
                return

            # The first child is the category button itself; remove only the
            # subcategory buttons that were added after it.
            for child in container.winfo_children()[1:]:
                child.destroy()
            expanded.set(False)

        return toggle

    def draw_categories(self):
        """Redraw the content area with one collapsed button per category."""
        for child in self.content_frame.winfo_children():
            child.destroy()

        for category, subcategories in self.data.get(self.current_type, {}).items():
            container = tk.Frame(self.content_frame)
            container.pack(fill="x", padx=10, pady=5)

            expanded = tk.BooleanVar(value=False)

            header = tk.Button(
                container,
                text=category,
                anchor="w",
                command=self._category_toggle(container, category, subcategories, expanded),
            )
            header.pack(fill="x")

    def show_word_list(self, category, subcategory, words):
        """Replace the whole window with the entries of one subcategory.

        Args:
            category: Name of the category, shown in the title.
            subcategory: Name of the subcategory, shown in the title.
            words: Iterable of strings, each rendered as one label.
        """
        for child in self.winfo_children():
            child.destroy()

        # "Back" button rebuilds the main screen.
        tk.Button(self, text="Voltar", command=self.build_ui).pack(pady=10)

        heading = f"{self.current_type.title()} - {category} > {subcategory}"
        tk.Label(self, text=heading, font=("Arial", 16)).pack(pady=5)

        entries_frame = tk.Frame(self)
        entries_frame.pack(fill="both", expand=True, pady=10)

        for entry in words:
            tk.Label(entries_frame, text=entry, anchor="w").pack(fill="x", padx=20, pady=2)

# Launch the catalogue browser when this cell/script runs as the main module.
if __name__ == "__main__":
    app = WordBaseApp()
    # Runs the Tk event loop; returns once the window is closed.
    app.mainloop()


In [141]:
from app.utils.data_loader import DataLoader
from pathlib import Path

def prep_data_match_word():
    """
    Build the lookup table used by the word search.

    Loads the entries of the words and phrases databases through ``DataLoader``
    and, for every entry, splits the final component of its "path" (without
    extension) on underscores/whitespace into lowercase tokens.

    Returns:
        list[dict]: one ``{"word": token, "path": Path}`` item per token, words
        database first, then phrases. An entry without a "path" key yields no
        items.
    """
    words_loader = DataLoader(base_path="database/extract_data_video/data/extracted_data/words/data_organize")
    phrases_loader = DataLoader(base_path="database/extract_data_video/data/extracted_data/phrases/data_organize")

    all_entries = words_loader.get_all_words() + phrases_loader.get_all_words()

    lookup = []
    for entry in all_entries:
        entry_path = Path(entry.get("path", ""))

        # "how_do_you" -> ["how", "do", "you"]
        tokens = entry_path.stem.replace("_", " ").lower().split()

        # Every token of a name points back to the same path object.
        lookup.extend({"word": token, "path": entry_path} for token in tokens)

    return lookup

def find_match_word(word, set_deta_words):
    """
    Return every lookup entry whose "word" field equals ``word``.

    ``word`` is lowercased before comparing; the stored "word" values are
    compared as they are. Entries lacking a "word" key are treated as having
    an empty word.

    Args:
        word: The word to look up.
        set_deta_words: Iterable of dict entries, e.g. the list produced by
            ``prep_data_match_word``.

    Returns:
        list: the matching entries, in their original order.
    """
    target = word.lower()
    return [entry for entry in set_deta_words if target == entry.get("word", "")]
    
    

In [133]:
# Build the lookup table once, then collect every entry whose token is "word".
set_deta_words = prep_data_match_word()
list_match_word = find_match_word("word", set_deta_words)
# Bare expression so the notebook renders the matches as the cell output.
list_match_word



[{'word': 'word',
  'path': PosixPath('database/extract_data_video/data/extracted_data/words/data_organize/educação/linguística/word')},
 {'word': 'word',
  'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/conversa_geral/fazer_se_compreender/what_does_that_word_mean_in_english')},
 {'word': 'word',
  'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/conversa_geral/fazer_se_compreender/how_do_you_pronounce_this_word')}]

In [131]:
import tkinter as tk
from pathlib import Path
import re

class TextReaderApp:
    """Text window that colours each word by whether it is in the word bank.

    Pressing "Enviar" scans the text for runs of ASCII letters, tags the ones
    found in ``known_words`` green and the rest gray, and shows the totals and
    a comprehension percentage below the text.
    """

    def __init__(self, master, known_words: list[dict]):
        """Build the widgets inside ``master``.

        Args:
            master: Parent Tk widget (typically the ``tk.Tk`` root).
            known_words: Lookup entries searched through ``find_match_word``;
                each entry is a mapping with a "word" key.
        """
        self.master = master

        self.known_words = known_words

        # Results of the most recent "Enviar" run.
        self.total_words = 0
        self.known_words_count = 0
        self.list_data_words_in_text = []

        # Widgets
        self.text_widget = tk.Text(master, wrap="word", font=("Arial", 14))
        self.text_widget.pack(expand=True, fill="both", padx=10, pady=10)

        self.send_button = tk.Button(master, text="Enviar", command=self._process_text)
        self.send_button.pack(pady=(0, 5))

        # Stored under its own attribute so it no longer overwrites the
        # reference to the "Enviar" button.
        self.save_button = tk.Button(master, text="Criar banco de dados com palavras", command=self._save_data_words)
        self.save_button.pack(pady=(0, 10))

        self.stats_label = tk.Label(master, justify="left", font=("Arial", 12), anchor="w")
        self.stats_label.pack(padx=10, pady=5, anchor="w")

        self._configure_tags()

    def _save_data_words(self):
        """Print the matches collected by the last run (no file is written)."""
        print(self.list_data_words_in_text)

    def _configure_tags(self):
        """Define the text tags used for known (green) and unknown (gray) words."""
        self.text_widget.tag_config("green", foreground="green")
        self.text_widget.tag_config("gray", foreground="gray")

    def _process_text(self):
        """Colour every word of the text and refresh the statistics.

        Each run starts from scratch: previous tags, counters and collected
        matches are discarded, so pressing the button repeatedly does not
        accumulate duplicates in ``list_data_words_in_text``.
        """
        for tag in ("green", "gray"):
            self.text_widget.tag_remove(tag, "1.0", tk.END)

        text = self.text_widget.get("1.0", tk.END)
        self.total_words = 0
        self.known_words_count = 0
        self.list_data_words_in_text = []

        self.text_widget.mark_set("insert", "1.0")

        # A "word" is a maximal run of ASCII letters; hyphens, digits and
        # accented characters act as separators.
        for match in re.finditer(r"[a-zA-Z]+", text):
            self.total_words += 1

            # Search the word bank given to the constructor, not a global.
            list_match_word = find_match_word(match.group(), self.known_words)

            if list_match_word:
                self.list_data_words_in_text.append(list_match_word)
                self.known_words_count += 1
                tag = "green"
            else:
                tag = "gray"

            # Regex offsets are character counts from the start of the text,
            # which maps onto Tk's "1.0+Nc" index arithmetic.
            self.text_widget.tag_add(tag, f"1.0+{match.start()}c", f"1.0+{match.end()}c")

        self._show_statistics()

    def _show_statistics(self):
        """Write totals and the comprehension percentage to the stats label."""
        total = self.total_words
        known = self.known_words_count
        unknown = total - known
        # Avoid dividing by zero when the text has no words.
        percent = (known / total * 100) if total else 0

        stats_text = (
            f"Total de palavras: {total}\n"
            f"Conhecidas: {known}\n"
            f"Desconhecidas: {unknown}\n"
            f"Compreensão: {percent:.1f}%"
        )
        self.stats_label.config(text=stats_text)


# Launch the text reader when this cell/script runs as the main module.
if __name__ == "__main__":
    # Rebuild the lookup table of known words (rebinds the module-level name).
    set_deta_words = prep_data_match_word()
    
    # NOTE(review): disabled earlier preprocessing, kept as found; it refers to
    # a `words_that_i_know` name that is not defined in the visible cells.
    # words_that_i_know = [word.replace("_", " ") for word in words_that_i_know]
    # words_that_i_know = " ".join(words_that_i_know).split(" ")
    # words_that_i_know.extend(["how", "many", "pieces", "you", "retrieve"])

    # words_that_i_know = set(word.lower() for word in words_that_i_know)

    root = tk.Tk()
    root.title("Leitura de Texto")
    app = TextReaderApp(root, set_deta_words)

    # Initial text (optional)
    example_text = """How many-teste pieces you retrieve from your RAG system affects the result."""
    app.text_widget.insert("1.0", example_text)

    # Runs the Tk event loop; returns once the window is closed.
    root.mainloop()


[[{'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/words/data_organize/advérbios/palavras_interrogativas_e_ligação/how')}, {'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/viagem/no_posto_de_turismo/how_do_we_get_to_the_theater')}, {'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/viagem/no_posto_de_turismo/how_many_stars_does_it_have')}, {'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/viagem/excursões/how_much_does_this_tour_cost')}, {'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/viagem/pedir_indicações/how_long_will_it_take_me_to_walk_there')}, {'word': 'how', 'path': PosixPath('database/extract_data_video/data/extracted_data/phrases/data_organize/viagem/pedir_indicações/how_can_i_get_to_the_beach_from_here')}, {'word': 'how', '