In [85]:
import pandas as pd
from nltk.corpus import cmudict
import numpy as np

cmu = cmudict.dict()

misspelling_pronunciations =  pd.read_excel('data/mispelling_pronunciations.xlsx', engine = 'openpyxl')
arpabet_table = pd.read_excel('../../words/ARPAbet/ARPAbet.xlsx', engine = 'openpyxl')
elp = pd.read_csv('../../words/elp/elp_clean.csv')

orth = elp.word.tolist() + misspellings.misspell.tolist()




In [121]:
# fix some missing words in cmudict
cmu['aah'] = [['AA1']]
cmu['abase'] = [['AH0', 'B', 'EY1', 'S']]

In [120]:
cmu['amaze']

[['AH0', 'M', 'EY1', 'Z']]

In [102]:
misspelling_pronunciations_ = {}

for i, row in misspelling_pronunciations.iterrows():
    misspelling_pronunciations_[row.Misspellings] = row.miss_preds_corrected

We use ARPAbet two-letter to ARPAbet one-letter for the cmu transcriptions of our real words.

In [71]:
arpabet = {}

def schwa(x):
    assert isinstance(x, list), "Provide a list for x please"
    assert isinstance(x[0], str), "x should be a list of strings!"
    
    return ['~' if e == 'AH0' else e for e in x]

for i, row in arpabet_table.iterrows():
    if isinstance(row['two-letter'], str):
        arpabet[row['two-letter']] = row['one-letter']
    else:
        print(row['two-letter'], row['one-letter'], '...pair skipped')
        print('Find on row:', i)
        print('Full content of that row:\n', row)
    
    arpabet['~'] = '~'

nan nan ...pair skipped
Find on row: 51
Full content of that row:
 one-letter      NaN
two-letter      NaN
IPA             NaN
klattese        NaN
wpa               Y
Example        spew
Notes           NaN
Sound class     NaN
Name: 51, dtype: object


We will use IPA transcriptions for the transcribed misspelling data.

In [93]:
ipa = {}

for i, row in arpabet_table.iterrows():
    if isinstance(row['two-letter'], str):
        ipa[row['IPA']] = row['one-letter']
    else:
        print(row['IPA'], row['one-letter'], '...pair skipped')
        print('Find on row:', i)
        print('Full content of that row:\n', row)
    
    # some cleanup, here for diphthongs. For these vowels we use a single character, some of which are not true IPA
    # ipa simplex version:
    ipa['o'] = 'o'
    ipa['e'] = 'e'
    
    # non ipa simpex version
    ipa['A'] = 'A'
    ipa['W'] = 'W'
    ipa['O'] = 'O'
    
    # also we want to use ~ for schwa (which isn't the ARPA convention)
    ipa['ə'] = '~'
    
ipa

nan nan ...pair skipped
Find on row: 51
Full content of that row:
 one-letter      NaN
two-letter      NaN
IPA             NaN
klattese        NaN
wpa               Y
Example        spew
Notes           NaN
Sound class     NaN
Name: 51, dtype: object


{'ɑ': 'a',
 'o': 'o',
 'e': 'e',
 'A': 'A',
 'W': 'W',
 'O': 'O',
 'ə': '~',
 'æ': '@',
 'ʌ': 'A',
 'ɔ': 'c',
 'aʊ': 'W',
 'ɚ': nan,
 'aɪ': 'Y',
 'ɛ': 'E',
 'ɝ': 'R',
 'eɪ': 'e',
 'ɪ': 'I',
 'ɨ': 'X',
 'i': 'i',
 'oʊ': 'o',
 'ɔɪ': 'O',
 'ʊ': 'U',
 'u': 'u',
 'ʉ': nan,
 'b': 'b',
 'tʃ': 'C',
 'd': 'd',
 'ð': 'D',
 'ɾ': 'F',
 'l̩': 'l',
 'm̩': 'm',
 'n̩': 'n',
 'f': 'f',
 'ɡ': 'g',
 'h': 'h',
 'dʒ': 'J',
 'k': 'k',
 'l': 'l',
 'm': 'm',
 'n': 'n',
 'ŋ': 'G',
 'ɾ̃': nan,
 'p': 'p',
 'ʔ': 'Q',
 'ɹ': 'r',
 's': 's',
 'ʃ': 'S',
 't': 't',
 'θ': 'T',
 'v': 'v',
 'w': 'w',
 'ʍ': 'H',
 'j': 'y',
 'z': 'z',
 'ʒ': 'Z',
 '_': '_'}

In [88]:
def ipa_to_arpabet_one(x):

    return ''.join([ipa[c] for c in x])
    
    

In [89]:
def remove_digit(x):
    
    assert isinstance(x, list), "Something other than a list provided as x"
    assert isinstance(x[0], str), "x should be a list of strings, but it isn't"
    
    def rm(s):
        return ''.join([c for c in s if not c.isdigit()])
    
    
    
    return [rm(e) for e in schwa(x)]

In [131]:
'plaʊt'.replace('aʊ', 'W')

'plWt'

Clean the IPA phoneme codings by removing the vowel length symbol and the four diphthongs.


In [163]:

ipa_corrections = {
    'aʊ': 'W',
    'aɪ': 'A',
    'eɪ': 'e',
    'oʊ': 'o',
    'ɔɪ': 'O',
    'ː': ''}

def clean_ipa(x):
    for s in ipa_corrections.keys():
        if s in x:
            x = x.replace(s, ipa_corrections[s])
    return x



In [164]:
clean_ipa('ghsoʊ')

'ghso'

In [90]:
def arpa_one(x):
    assert isinstance(x[0], str), "x should be a list of strings, but it isn't"
    return ''.join([arpabet[e] for e in x])

Test ARPAbet two-letter to ARPAbet one-letter

In [91]:
arpa_one(remove_digit(cmu['ketchup'][0]))

'kEC~p'

Test IPA to one-letter ARPAbet

In [94]:
ipa_to_arpabet_one('kokæfəni')

'kok@f~ni'

In [114]:
arpa_one(remove_digit(cmu['a'][0]))

'~'

In [122]:
with open('data/all_orth_phon.csv', 'w') as f:
    for word in orth:
        if word in misspelling_pronunciations_.keys():
            phon = ipa_to_arpabet_one(misspelling_pronunciations_[word])
            misspelling = True
        else:
            phon = arpa_one(remove_digit(cmu[word][0]))
            misspelling = False
        f.write('{}, {}, {}\n'.format(word, phon, misspelling))
f.close()
        

KeyError: 'abasement'

Inspect missing words (from CMU dictionary)

In [123]:
missing = []
for word in orth:
    if word in misspelling_pronunciations_.keys() or word in cmu.keys():
        pass
    else:
        missing.append(word)
