In [6]:
import os
import re

In [11]:
# 1. Define our common CPS phone set (lowercase)
cps_set = {
    'a', 'aa', 'i', 'ii', 'u', 'uu', 'e', 'ee',
    'k', 'kh', 'g', 'gh', 'c', 'ch', 'j', 'jh',
    'tx', 'txh', 'dx', 'dxh', 't', 'th', 'd', 'dh',
    'p', 'ph', 'b', 'bh', 'm', 'y', 'r', 'l', 'w',
    'sh', 's', 'h', 'kq', 'gq', 'z', 'jhq', 'dxq',
    'dxhq', 'f', 'ss', 'dd', 'dh2', 'dz', 'ai', 'hh', 'awu'
}


In [8]:
# 1. Load IPA → ARPAbet mapping (invert the ARPAbet→IPA map)
ipa2arp = {}
with open("tools/ipa_arpabet_map.txt", encoding="utf-8") as mfile:
    for line in mfile:
        line = line.strip()
        if not line or "\t" not in line:
            continue
        arpabet_sym, ipa_esc = line.split("\t", 1)
        # Decode unicode escapes to get the actual IPA string
        ipa_chars = ipa_esc.encode("utf-8").decode("unicode_escape")
        # Optionally strip stress digits from ARPAbet symbol:
        base_arp = re.sub(r"\d+$", "", arpabet_sym)
        # Map each IPA sequence to its base ARPAbet symbol
        ipa2arp[ipa_chars] = base_arp

def ipa_to_arp(tokens):
    """Convert list of IPA tokens to ARPAbet symbols via direct lookup."""
    return [ipa2arp.get(tok, "UNK") for tok in tokens]


In [9]:
# 2. Process raw broad IPA files for Arabic, Persian, Urdu
lang_codes = {"ara": "arabic", "fas": "persian", "urd": "urdu"}
os.makedirs("data/gold", exist_ok=True)

for code, lang in lang_codes.items():
    in_path = f"data/raw/{code}_arab_broad.tsv"
    out_path = f"data/gold/{lang}_manual.tsv"
    
    with open(in_path, encoding="utf-8") as fin, open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            word, ipa_seq = line.split("\t", 1)
            tokens = ipa_seq.split()
            arpabets = ipa_to_arp(tokens)
            fout.write(f"{word}\t{' '.join(arpabets)}\n")

print("Gold-standard ARPAbet lexicons generated in data/gold/")


Gold-standard ARPAbet lexicons generated in data/gold/


In [13]:
# 2. Load IPA → ARPAbet mapping (invert ARPAbet->IPA)
ipa2arp = {}
with open("tools/ipa_arpabet_map.txt", encoding="utf-8") as mfile:
    for line in mfile:
        line = line.strip()
        if not line or "\t" not in line:
            continue
        arpabet_sym, ipa_esc = line.split("\t", 1)
        base_arp = re.sub(r"\d+$", "", arpabet_sym).lower()
        ipa_seq = ipa_esc.encode("utf-8").decode("unicode_escape")
        ipa2arp[ipa_seq] = base_arp

# 3. Extended fallback mapping for uncovered IPA symbols
fallback_map = {
    # Arabic extras
    'a': 'aa', 'aː': 'aa', 'bː': 'b', 'dˤ': 'dh2', 'd̪ˤ': 'dh2',
    'd͡ʒ': 'jh', 'e': 'eh', 'eː': 'ey', 'l': 'l', 'o': 'ow', 'oː': 'ow',
    'q': 'q', 'r': 'r', 'sˤ': 'ss', 'tˀ': 't', 'tˤ': 't', 'uː': 'uw',
    'æː': 'ae', 'ðˤ': 'dh2', 'ō': 'ow', 'ɐ': 'ax', 'ɣ': 'gq',
    'ɮˤ': 'l', 'ʁ': 'r', 'ʕ': 'ai', 'χ': 'kh', '‿': '',

    # Persian extras
    ',': '', 'dː': 'd', 'd̪': 'd', 'i̯': 'y', 'kʰ': 'kh', 'kʲ': 'k',
    'sː': 's', 'tʰ': 'th', 'tː': 't', 't̪': 't', 't͡ʃ': 'ch', 't͡ʃʰ': 'ch',
    '~': '', 'ɒː': 'ao', 'ɔː': 'ao', 'ɡʱ': 'g', 'ɡʷ': 'g', 'ɢ': 'q',
    'ɵ': 'uh', 'ʃʰ': 'sh', 'ʃː': 'sh', 'ʊ̯': 'w', 'β': 'b',

    # Urdu extras
    'bʱ': 'bh', 'bː': 'b', 'bᵊ': 'b', 'd̪ʱ': 'dh', 'd̪ː': 'd',
    'd̪ːʰ': 'dh', 'd̪̪': 'd', 'd̪ᵊ': 'd', 'd͡z': 'z', 'd͡ʒʱ': 'jh',
    'd͡ʒː': 'jh', 'd͡ʒᵊ': 'jh', 'eʱ': 'eh', 'jː': 'y', 'jᵊ': 'y',
    'kː': 'k', 'lː': 'l', 'l̪': 'l', 'mʱ': 'm', 'mː': 'm', 'mᵊ': 'm',
    'nʱ': 'n', 'nː': 'n', 'nᵊ': 'n', 'pʰ': 'ph', 'pː': 'p', 'qː': 'q',
    'rː': 'r', 't̪ʰ': 'th', 't̪ː': 't', 't̪̤': 't', 't͡ʃʰᵊ': 'ch',
    't͡ʃːʰ': 'ch', 'xʷ': 'kh', 'xː': 'kh', 'zː': 'z', 'z̥': 'z',
    'ä': 'ae', 'õ': 'ow', 'õː': 'ow', 'ē': 'ey', 'ĩː': 'iy', 'ũː': 'uw',
    'ɑ̃ː': 'aa', 'ɑ̃ːᵑ': 'aa', 'ɔ̃ː': 'ao', 'ə̃': 'ax', 'ə̯': 'ax',
    'əᵊ': 'ax', 'ɛː': 'eh', 'ɛ̃ː': 'eh', 'ẽː': 'eh', '◌̃': '',
    'ɖ': 'dx', 'ɖʱ': 'dxh', 'ɖː': 'dx', 'ɦ': 'h', 'ɪ̃': 'ih',
    'ɪ̯': 'y', 'ɲ': 'n', 'ɳ': 'n', 'ɽ': 'r', 'ɽʱ': 'r', 'ɾᵊ': 'r',
    'ʈ': 'tx', 'ʈʰ': 'txh', 'ʈʱ': 'txh', 'ʈː': 'tx', 'ʊ̃': 'uh',
    'ʋ': 'v', 'ʋː': 'v', 'ʋᵊ': 'v'
}

# Merge fallback into ipa2arp (without overwriting existing)
for ipa_seq, arp_sym in fallback_map.items():
    ipa2arp.setdefault(ipa_seq, arp_sym)

# 4. Conversion function
def ipa_to_arpabets(tokens, unknowns):
    arpabets = []
    for tok in tokens:
        arp = ipa2arp.get(tok)
        if not arp:
            unknowns.add(tok)
            arp = 'unk'
        arpabets.append(arp)
    return arpabets

# 5. Process files
lang_codes = {'ara': 'arabic', 'fas': 'persian', 'urd': 'urdu'}
os.makedirs("data/gold", exist_ok=True)

for code, lang in lang_codes.items():
    in_path = f"data/raw/{code}_arab_broad.tsv"
    out_path = f"data/gold/{lang}_manual.tsv"
    unknowns = set()
    
    with open(in_path, encoding="utf-8") as fin, open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            word, ipa_seq = line.strip().split("\t", 1)
            tokens = ipa_seq.split()
            arpabets = ipa_to_arpabets(tokens, unknowns)
            fout.write(f"{word}\t{' '.join(arpabets)}\n")
    
    if unknowns:
        print(f"[{lang}] still unknown IPA tokens: {sorted(unknowns)}")
    # Check any ARPAbet not in CPS
    unexpected = set()
    with open(out_path) as fcheck:
        for lw in fcheck:
            _, seq = lw.strip().split("\t")
            for arp in seq.split():
                if arp not in cps_set:
                    unexpected.add(arp)
    if unexpected:
        print(f"[{lang}] ARPABET symbols outside CPS set: {sorted(unexpected)}")

print("Mapping complete; gold lexicons in data/gold/")


[arabic] still unknown IPA tokens: ['i', 'x', 'ħ', 'ɑ', 'ɒ', 'ʔ', '‿']
[arabic] ARPABET symbols outside CPS set: ['ae', 'ah', 'ax', 'eh', 'ey', 'ih', 'iy', 'n', 'ow', 'q', 'uh', 'unk', 'uw', 'v', 'zh']
[persian] still unknown IPA tokens: [',', 'i', 'x', '~', 'ɒ', 'ɹ', 'ɾ', 'ʔ']
[persian] ARPABET symbols outside CPS set: ['ae', 'ah', 'ao', 'eh', 'ey', 'ih', 'iy', 'n', 'ow', 'q', 'uh', 'unk', 'uw', 'v', 'zh']
[urdu] still unknown IPA tokens: ['i', 'x', '~', 'ɑ', 'ɒ', 'ɡː', 'ɾ', 'ʔ', 'ʰ', 'ʱ', '◌̃']
[urdu] ARPABET symbols outside CPS set: ['ae', 'ah', 'ao', 'ax', 'eh', 'ey', 'ih', 'iy', 'n', 'ng', 'ow', 'q', 'uh', 'unk', 'uw', 'v', 'zh']
Mapping complete; gold lexicons in data/gold/
