In [1]:
import nltk
import os
from nltk.corpus import cmudict

# Fetch the CMU Pronouncing Dictionary corpus through NLTK's downloader, then
# load it once at module level. get_phonemes() reads this dict on every call,
# so it must be populated before any lookup happens.
nltk.download('cmudict')
pronouncing_dict = cmudict.dict()

def get_phonemes(word):
    """Look up a word in the loaded CMU dictionary, ignoring case.

    The word is lowercased before the lookup. Returns whatever the
    dictionary stores for that key (its pronunciations), or an empty list
    when the word has no entry.
    """
    key = word.lower()
    no_entry = []
    return pronouncing_dict.get(key, no_entry)

def find_homophones_from_files(file_word_map, phoneme_lookup=None):
    """Find homophones across files and report the file each word came from.

    Two words count as homophones when they share at least one
    pronunciation. Words are compared case-insensitively: every word is
    lowercased before lookup and in the result.

    Args:
        file_word_map: Mapping of file identifier -> collection of words
            found in that file. Each collection is iterated twice, so pass
            a re-iterable container (set, list), not a generator.
        phoneme_lookup: Optional callable that takes a lowercased word and
            returns an iterable of pronunciations, each itself an iterable
            of phoneme symbols. Defaults to ``get_phonemes``.

    Returns:
        A sorted list of ``(word, homophone, file)`` tuples, where ``file``
        is a key of ``file_word_map`` whose words include ``word``. The
        homophone may come from any file. When both words of a pair occur
        in the input, the pair is reported in both directions, each row
        carrying the file of its first word.
    """
    if phoneme_lookup is None:
        phoneme_lookup = get_phonemes

    # Pass 1: look each distinct word up once and group words by
    # pronunciation. Pronunciations are converted to tuples so they can be
    # used as dict keys.
    pronunciations = {}
    words_by_pronunciation = {}
    for words in file_word_map.values():
        for word in words:
            word_lower = word.lower()
            if word_lower in pronunciations:
                continue
            keys = [tuple(phonemes) for phonemes in phoneme_lookup(word_lower)]
            pronunciations[word_lower] = keys
            for key in keys:
                words_by_pronunciation.setdefault(key, set()).add(word_lower)

    # Pass 2: emit one row per (word, other word with a shared
    # pronunciation, file containing word). Reverse pairs are deliberately
    # not suppressed: (a, b, file_of_a) and (b, a, file_of_b) describe
    # different files. The set removes rows produced by several shared
    # pronunciations of the same pair.
    homophone_pairs = set()
    for file, words in file_word_map.items():
        for word in words:
            word_lower = word.lower()
            for key in pronunciations[word_lower]:
                for homophone in words_by_pronunciation[key]:
                    if homophone != word_lower:
                        homophone_pairs.add((word_lower, homophone, file))

    return sorted(homophone_pairs)

def process_files_in_folder(folder_path):
    """Gather the words of each file in a folder and find homophones among them.

    Only regular files directly inside ``folder_path`` are read; other
    directory entries are skipped. Each file is decoded as UTF-8 with
    undecodable bytes dropped, split on whitespace, and reduced to its set
    of distinct tokens. The tokens, keyed by bare filename, are handed to
    ``find_homophones_from_files``, whose result is returned unchanged.
    """
    tokens_by_filename = {}

    for entry_name in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry_name)
        if not os.path.isfile(full_path):
            # Subdirectories and other non-file entries carry no text.
            continue
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
            contents = handle.read()
        # A set keeps each token once per file.
        tokens_by_filename[entry_name] = set(contents.split())

    return find_homophones_from_files(tokens_by_filename)

# Example usage: scan every file in one transcript folder for homophones.
# Alternative dataset (folder in the same directory as the script):
# folder_path = "ouluvs2"
folder_path = "../Test_Sets/lrs2/main"
homophone_pairs = process_files_in_folder(folder_path)

# Print results: one line per (word, homophone, file containing the word).
# The rows arrive already sorted from find_homophones_from_files().
print("\nWord -> Homophone | Source File:")
for word, homophone, word_file in homophone_pairs:
    print(f"{word} -> {homophone} | {word_file}")

[nltk_data] Downloading package cmudict to /home/jupyter-
[nltk_data]     samantha_caasi@dls-bf571/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!



Word -> Homophone | Source File:
are -> our | s15
are -> our | s26
are -> our | s30
are -> our | s52
are -> our | s6
are -> our | transcript_digit_phrase
ate -> eight | s52
ate -> eight | s9
buy -> by | s51
by -> buy | s9
eight -> ate | s9
for -> four | s15
for -> four | s26
for -> four | s30
for -> four | s43
for -> four | s44
for -> four | s52
for -> four | s8
for -> four | s9
four -> for | s52
four -> for | s9
our -> are | s30
our -> are | s49
to -> too | s15
to -> too | s34
to -> too | s43
to -> too | s44
to -> too | s49
to -> too | s51
to -> too | s52
to -> too | s6
to -> too | s8
to -> too | s9
to -> too | transcript_digit_phrase
too -> to | s9
