# Python Translineação – Fonética

Notebook para aplicar regras de regex em palavras, focado em fonética.

## Instalar bibliotecas (se necessário)

Remova os `#` abaixo se precisar instalar os pacotes.

In [1]:
# !pip install pandas
# !pip install regex
# !pip install odfpy

## Imports

In [13]:
import pandas as pd
import regex as re  # Use the regex module for variable-width look-behinds
from IPython.display import display

## Carregar Coluna para Lista

In [3]:
def load_column_to_list(file_path, sheet_name):
    """Return the non-missing cells of a sheet's first column as a list.

    The sheet is read without a header row, so its first row counts as data.
    Failures are reported on stdout and ``None`` is returned instead of
    raising.

    Args:
        file_path: Spreadsheet location, handed to ``pandas.read_excel``.
        sheet_name: Sheet to read.

    Returns:
        The values of column 0 with missing cells dropped, or ``None`` when
        the sheet could not be read.
    """
    try:
        sheet = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
        first_column = sheet[0]
        values = first_column.dropna().tolist()
    except FileNotFoundError:
        message = f"Error: The file at {file_path} was not found."
    except ValueError as err:
        message = f"Error: {err}"
    except Exception as err:
        message = f"An unexpected error occurred: {err}"
    else:
        return values

    # Only reached when one of the handlers above built an error message.
    print(message)
    return None

## Carregar Regras Regex para Dicionário

In [4]:
def load_regex_to_dict(file_path, sheet_name):
    """Load the regex rules of a sheet into a dictionary keyed by pattern.

    The sheet is read without a header row. Column 0 is used as the regex
    pattern, column 1 as the replacement and column 2 as the rule id; every
    kept value is converted with ``str``. A missing replacement or id cell is
    therefore stored as the string ``"nan"``.

    Rows whose pattern cell is missing are skipped. Converting them with
    ``str`` would register the literal regex ``"nan"``, which would then
    match the letters "nan" inside words.

    Failures are reported on stdout and ``None`` is returned instead of
    raising.

    Args:
        file_path: Spreadsheet location, handed to ``pandas.read_excel``.
        sheet_name: Sheet to read.

    Returns:
        ``{pattern: {"replacement": str, "id": str}}`` in sheet order, or
        ``None`` when the sheet could not be read.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
        regex_dict = {
            str(pattern): {"replacement": str(replacement), "id": str(rule_id)}
            for pattern, replacement, rule_id in zip(df[0], df[1], df[2])
            if pd.notna(pattern)  # a blank pattern cell is not a rule
        }
        return regex_dict
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

## Aplicar os Padrões às Palavras

In [None]:
def apply_patterns_to_words(words_list, regex_dict, log_file_path="../Debug/debug_fonetica_notebook.txt"):
    """Apply every regex rule, in dictionary order, to every word.

    Rules are applied cumulatively: each rule sees the output of the previous
    one. Every rule that actually changes the word is recorded and written to
    the debug log together with the before/after forms.

    Args:
        words_list: Iterable of words to transform.
        regex_dict: Mapping ``pattern -> {"replacement": ..., "id": ...}``.
            A replacement equal to the string ``"nan"`` is treated as an
            empty replacement (the match is deleted).
        log_file_path: Text file that receives the debug log. It is
            overwritten on each call; its parent directory is created when
            missing.

    Returns:
        A list with one ``(processed_word, used_ids)`` tuple per input word,
        in input order, where ``used_ids`` lists the ids of the rules that
        changed the word.
    """
    import os  # local import: only needed to prepare the log directory

    # Opening the log in "w" mode fails if its directory is missing, which
    # would abort the whole run before a single word is processed.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    results = []
    with open(log_file_path, "w", encoding="utf-8") as log_file:
        for word in words_list:
            processed_word = word
            used_ids = []
            for pattern, data in regex_dict.items():
                rule_id = data["id"]
                replacement = data["replacement"]
                if replacement == "nan":
                    replacement = ""
                try:
                    new_word = re.sub(pattern, replacement, processed_word)
                except re.error as e:
                    # An invalid rule is reported and skipped; the remaining
                    # rules still run on the current form of the word.
                    print(f"Regex error for pattern '{pattern}': {e}")
                    continue
                if new_word != processed_word:
                    used_ids.append(rule_id)
                    log_file.write(f"Origin Word: {processed_word} -> Processed Word: {new_word}\n")
                    log_file.write(f"Pattern Applied: {pattern}->{replacement} (Pattern id: {rule_id})\n\n")
                    processed_word = new_word
            results.append((processed_word, used_ids))
    return results

## Definição dos Ficheiros

In [None]:
# Spreadsheet that is read for both the word list and the regex rules.
file_path = "../ODS/Fonetica.ods"

# Sheet names inside that spreadsheet: input words, then phonetic regex rules.
folha_palavras, folha_expressoes = "Palavras", "ExpressoesFonetica"

## Carregar Dados

In [7]:
# Words come from the first column of the words sheet.
words_list = load_column_to_list(file_path=file_path, sheet_name=folha_palavras)

# Rules come from the expressions sheet of the same file. Either loader
# returns None (after printing an error) when reading fails.
regex_dict = load_regex_to_dict(file_path=file_path, sheet_name=folha_expressoes)

In [8]:
# Quick visual check of what was loaded: the words first, then the rules,
# each on its own line.
print(words_list, regex_dict, sep="\n")

['ˈalto', 'amaˈrelo', 'aˈmigo', 'anˈtigo', 'aˈssado', 'aˈzul', 'ˈbaixo', 'ˈbelo', 'ˈbom', 'boˈnito', 'ˈbranco', 'caˈpaz', 'ˈcaro', 'casˈtanho', 'ˈcheio', 'chiˈnês', 'ˈclaro', 'comˈprido', 'conˈtente', 'difeˈrente', 'diˈfícil', 'ˈdoce', 'eˈnorme', 'esˈcuro', 'esˈtranho', 'ˈfácil', 'faˈmoso', 'feˈliz', 'ˈfino', 'ˈforte', 'ˈfresco', 'ˈfrio', 'ˈgrande', 'iˈgual', 'imporˈtante', 'ˈjunto', 'ˈleve', 'ˈlimpo', 'ˈlindo', 'ˈlivre', 'ˈlongo', 'ˈmau', 'natuˈral', 'ˈnegro', 'norˈmal', 'ˈnovo', 'obriˈgado', 'ˈótimo', 'paˈssado', 'peˈqueno', 'ˈpobre', 'portuˈguês', 'poˈssível', 'preˈciso', 'ˈpreto', 'princiˈpal', 'ˈpronto', 'ˈquente', 'ˈrico', 'ˈseco', 'simˈpático', 'ˈsimples', 'soˈzinho', 'ˈtriste', 'ˈúltimo', 'ˈvelho', 'verˈmelho', 'ˈvivo', 'aˈgora', 'aˈí', 'aˈinda', 'aˈli', 'amaˈnhã', 'aˈpenas', 'aˈqui', 'aˈssim', 'aˈté', 'basˈtante', 'ˈbem', 'ˈcá', 'ˈcedo', 'demasiˈado', 'deˈpois', 'deˈpressa', 'emˈbora', 'enˈtão', 'entreˈtanto', 'finalˈmente', 'geralˈmente', 'ˈhoje', 'iˈmenso', 'ˈjá', 'ˈlá', 'ˈl

## Processar Palavras com os Padrões

In [9]:
# Build the results table: one row per word with its original form, its
# processed form and the ids of the rules that changed it.
if words_list and regex_dict:
    processed_data = apply_patterns_to_words(words_list, regex_dict)
    results_for_df = [
        {
            "Original": original,
            "Processado": processed,
            "IDs usados": ", ".join(ids_used),
        }
        for original, (processed, ids_used) in zip(words_list, processed_data)
    ]
else:
    # Either loader failed or returned nothing. Still define the results so
    # the display cell neither raises NameError nor shows a table left over
    # from a previous run.
    print("Warning: no words or no regex rules were loaded; the results table is empty.")
    processed_data = []
    results_for_df = []

df_resultados = pd.DataFrame(results_for_df, columns=["Original", "Processado", "IDs usados"])

## Resultados

In [None]:
# Display options applied only while the table is rendered; the previous
# pandas settings are restored when the block exits.
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', 0,
)
with pd.option_context(*display_options):
    display(df_resultados)

Unnamed: 0,Original,Processado,IDs usados
0,ˈalto,[ˈaɫtu],"T6, T44, T89, T90"
1,amaˈrelo,[ɐmɐˈɾ/E/lu],"T4, T6, T47, T81, T87, T89, T90"
2,aˈmigo,[ɐˈmiɣu],"T6, T47, T60, T89, T90"
3,anˈtigo,[ɐ̃ˈtiɣu],"T6, T16, T60, T89, T90"
4,aˈssado,[ɐˈsaðu],"T6, T47, T56, T67, T89, T90"
5,aˈzul,[ɐˈzuɫ],"T45, T47, T89, T90"
6,ˈbaixo,[ˈbajʃu],"T6, T28, T53, T89, T90"
7,ˈbelo,[ˈb/E/lu],"T6, T87, T89, T90"
8,ˈbom,[ˈbõ],"T20, T89, T90"
9,boˈnito,[buˈnitu],"T3, T6, T89, T90"
