In [1]:
import nltk
import os
from nltk.corpus import cmudict

# Download the CMU Pronouncing Dictionary corpus through NLTK's downloader
nltk.download('cmudict')
# Load the corpus into a module-level dict; get_phonemes() queries it with
# lower-cased words.
pronouncing_dict = cmudict.dict()

def get_phonemes(word):
    """Look up *word* in the CMU pronouncing dictionary, ignoring case.

    Returns whatever ``pronouncing_dict`` stores for the lower-cased word
    (its pronunciations), or an empty list when the word has no entry.
    """
    key = word.lower()
    try:
        return pronouncing_dict[key]
    except KeyError:
        # Unknown words simply have no pronunciations.
        return []

def find_homophones_from_files(file_word_map, phoneme_lookup=None):
    """Find homophones among the words of several files and report their sources.

    Two words are treated as homophones when they are different after
    lower-casing and share at least one pronunciation.

    Args:
        file_word_map: Mapping of file identifier -> iterable of words found
            in that file. Words are compared case-insensitively.
        phoneme_lookup: Optional callable taking a lower-cased word and
            returning an iterable of pronunciations (each an iterable of
            phonemes). Defaults to ``get_phonemes``.

    Returns:
        A sorted list of ``(word, homophone, file)`` tuples, where ``file`` is
        a file in which ``word`` occurs and ``homophone`` occurs in any of the
        files. Both directions of a pair are reported, because each row
        records where its own ``word`` was seen.
    """
    if phoneme_lookup is None:
        phoneme_lookup = get_phonemes

    # Pass 1: look up every distinct word once and index words by pronunciation.
    pronunciations_by_word = {}
    words_by_pronunciation = {}
    for words in file_word_map.values():
        for word in words:
            word_lower = word.lower()
            if word_lower in pronunciations_by_word:
                continue  # already indexed via another file or casing
            # Tuples are hashable, so each pronunciation can be a dict key.
            keys = [tuple(phonemes) for phonemes in phoneme_lookup(word_lower)]
            pronunciations_by_word[word_lower] = keys
            for key in keys:
                words_by_pronunciation.setdefault(key, set()).add(word_lower)

    # Pass 2: for each word occurrence, emit every other word that shares one
    # of its pronunciations. The set collapses repeats produced when two words
    # share more than one pronunciation.
    homophone_pairs = set()
    for file, words in file_word_map.items():
        for word in words:
            word_lower = word.lower()
            for key in pronunciations_by_word[word_lower]:
                for homophone in words_by_pronunciation[key]:
                    if homophone != word_lower:
                        homophone_pairs.add((word_lower, homophone, file))

    return sorted(homophone_pairs)

def process_files_in_folder(main_folder_path):
    """Gather the words of every ``.txt`` file one level below a folder and find homophones.

    Each immediate subfolder of *main_folder_path* is scanned for files whose
    name ends in ``.txt``. Every such file is read as UTF-8 (undecodable bytes
    are ignored) and split on whitespace; its unique tokens are stored under
    the file's path. The resulting mapping is handed to
    ``find_homophones_from_files`` and that function's result is returned.
    """
    words_by_path = {}

    for entry_name in os.listdir(main_folder_path):
        directory = os.path.join(main_folder_path, entry_name)
        if not os.path.isdir(directory):
            continue  # entries that are not folders are skipped

        for name in os.listdir(directory):
            if not name.endswith(".txt"):
                continue  # only text files are read

            txt_path = os.path.join(directory, name)
            with open(txt_path, 'r', encoding='utf-8', errors='ignore') as handle:
                contents = handle.read()
            # A set keeps a single copy of each token per file.
            words_by_path[txt_path] = set(contents.split())

    return find_homophones_from_files(words_by_path)

# Example usage: scan one dataset folder and print every homophone found.
# Alternative input folder, currently disabled:
# folder_path = "../Test_Sets/lrs3/"
folder_path = "../face_pose_transformed/lrs3/test"
# Result is the sorted list of (word, homophone, source file) tuples.
homophone_pairs = process_files_in_folder(folder_path)

# Print results, one line per (word, homophone, source file) tuple
print("\nWord -> Homophone | Source File:")
for word, homophone, word_file in homophone_pairs:
    print(f"{word} -> {homophone} | {word_file}")

[nltk_data] Downloading package cmudict to /home/jupyter-
[nltk_data]     samantha_caasi@dls-bf571/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!



Word -> Homophone | Source File:
are -> or | ../face_pose_transformed/lrs3/test/0gks6ceq4eQ_00005/0gks6ceq4eQ_00005.txt
are -> or | ../face_pose_transformed/lrs3/test/0gks6ceq4eQ_00006/0gks6ceq4eQ_00006.txt
are -> or | ../face_pose_transformed/lrs3/test/0gks6ceq4eQ_00009/0gks6ceq4eQ_00009.txt
are -> or | ../face_pose_transformed/lrs3/test/1bnzVjOJ6NM_00011/1bnzVjOJ6NM_00011.txt
are -> or | ../face_pose_transformed/lrs3/test/2SlBiFZ85d0_00006/2SlBiFZ85d0_00006.txt
are -> or | ../face_pose_transformed/lrs3/test/2UStOghblfE_00001/2UStOghblfE_00001.txt
are -> or | ../face_pose_transformed/lrs3/test/3uSQlcGCHUU_00016/3uSQlcGCHUU_00016.txt
are -> or | ../face_pose_transformed/lrs3/test/5Yj3nGv0kn8_00001/5Yj3nGv0kn8_00001.txt
are -> or | ../face_pose_transformed/lrs3/test/66koScWSHBU_00001/66koScWSHBU_00001.txt
are -> or | ../face_pose_transformed/lrs3/test/6ra1MIKlYB0_00002/6ra1MIKlYB0_00002.txt
are -> or | ../face_pose_transformed/lrs3/test/7kkRkhAXZGg_00003/7kkRkhAXZGg_00003.txt
are -> or