In [34]:
from itertools import product
import numpy as np
import string

# Maps each phone symbol to a tuple of short feature tags. The tags of two
# adjacent phones are later combined pairwise (Cartesian product) to form the
# feature bigrams of a word.
#
# NOTE(review): the keys look like ARPAbet symbols (as used by CMUdict) and the
# tags look like abbreviated articulatory features (voicing / place / manner for
# consonants; height / backness / rounding for vowels) -- confirm with the author.
#
# Observations from the table itself:
#   - entries tagged 'nas', 'apr' or 'lat' carry no 'vls'/'vcd' tag;
#   - CH and JH carry both 'stp' and 'frc';
#   - EY, AY, OW, AW and OY list more than four tags, i.e. several
#     height/backness/rounding tags at once (presumably diphthongs);
#   - 'pla' (SH, ZH) and 'pal' (Y) are distinct tags -- NOTE(review): confirm
#     this is intentional and not a typo;
#   - '^' and '$' are the word-start / word-end markers that are added around
#     every pronunciation before bigrams are built.
phone_feature_map = {
    'M': ('blb', 'nas'),
    'P': ('vls', 'blb', 'stp'),
    'B': ('vcd', 'blb', 'stp'),
    'F': ('vls', 'lbd', 'frc'),
    'V': ('vcd', 'lbd', 'frc'),
    'TH': ('vls', 'dnt', 'frc'),
    'DH': ('vcd', 'dnt', 'frc'),
    'N': ('alv', 'nas'),
    'T': ('vls', 'alv', 'stp'),
    'D': ('vcd', 'alv', 'stp'),
    'S': ('vls', 'alv', 'frc'),
    'Z': ('vcd', 'alv', 'frc'),
    'R': ('alv', 'apr'),
    'L': ('alv', 'lat'),
    'SH': ('vls', 'pla', 'frc'),
    'ZH': ('vcd', 'pla', 'frc'),
    'Y': ('pal', 'apr'),
    'NG': ('vel', 'nas'),
    'K': ('vls', 'vel', 'stp'),
    'G': ('vcd', 'vel', 'stp'),
    'W': ('lbv', 'apr'),
    'HH': ('glt', 'apr'),
    'CH': ('vls', 'alv', 'stp', 'frc'),
    'JH': ('vcd', 'alv', 'stp', 'frc'),
    # Vowel entries: every one ends with the 'vwl' tag.
    'AO': ('lmd', 'bck', 'rnd', 'vwl'),
    'AA': ('low', 'bck', 'unr', 'vwl'),
    'IY': ('hgh', 'fnt', 'unr', 'vwl'),
    'UW': ('hgh', 'bck', 'rnd', 'vwl'),
    'EH': ('lmd', 'fnt', 'unr', 'vwl'),
    'IH': ('smh', 'fnt', 'unr', 'vwl'),
    'UH': ('smh', 'bck', 'rnd', 'vwl'),
    'AH': ('mid', 'cnt', 'unr', 'vwl'),
    'AE': ('low', 'fnt', 'unr', 'vwl'),
    'EY': ('lmd', 'smh', 'fnt', 'unr', 'vwl'),
    'AY': ('low', 'smh', 'fnt', 'cnt', 'unr', 'vwl'),
    'OW': ('umd', 'smh', 'bck', 'rnd', 'vwl'),
    'AW': ('low', 'smh', 'bck', 'cnt', 'unr', 'rnd', 'vwl'),
    'OY': ('lmd', 'smh', 'bck', 'fnt', 'rnd', 'unr', 'vwl'),
    'ER': ('umd', 'cnt', 'rzd', 'vwl'),
    # Word-boundary pseudo-phones.
    '^': ('beg',),
    '$': ('end',)
}

In [35]:
# Load the pronunciation dictionary as a list of lines with surrounding
# whitespace (including the trailing newline) stripped from each one.
with open('cmudict-0.7b-with-vitz-nonce - NR -- UPDATED - Copy.txt', 'r') as f:
    df = [raw_line.strip() for raw_line in f]

In [36]:
def keep_letters(x):
    """Return ``x`` with everything except ASCII letters removed.

    Characters are kept in their original order; digits, punctuation and
    whitespace are dropped. An input without any letters yields ``""``.
    """
    letters_only = (ch for ch in x if ch in string.ascii_letters)
    return "".join(letters_only)

In [40]:
# Build a mapping from lower-cased word to its list of phone symbols.
# Each usable line has the form "<WORD><two spaces><PHONE> <PHONE> ...".
pronounce_dic = {}
for line in df:
    # Any line containing '#' is ignored.
    if '#' in line:
        continue
    data_row = line.split('  ')
    # Blank lines (or lines without the two-space separator) have no
    # pronunciation field; skip them instead of failing with IndexError.
    if len(data_row) < 2:
        continue
    word = data_row[0].lower()
    # Reduce every symbol to its letters (dropping digits and other non-letter
    # characters) and discard symbols that end up empty.
    phonemes = [sym for sym in map(keep_letters, data_row[1].split(' ')) if sym]
    pronounce_dic[word] = phonemes

In [61]:
# For every word, collect the feature bigrams of its pronunciation: the
# pronunciation is wrapped in the '^' / '$' boundary markers, and for each pair
# of adjacent phones every (left feature, right feature) combination is added.
feature_bigrams = {}
for word, phones in pronounce_dic.items():
    padded = ['^', *phones, '$']
    pairs = []
    for left_phone, right_phone in zip(padded, padded[1:]):
        pairs += product(phone_feature_map[left_phone], phone_feature_map[right_phone])
    feature_bigrams[word] = pairs

In [62]:
# Flatten the feature tuples of all phones into one list (duplicates included,
# in the order the phones appear in phone_feature_map).
sound_features = [
    feature
    for features in phone_feature_map.values()
    for feature in features
]

In [63]:
# Deduplicate the feature tags. The result is sorted because set iteration
# order for strings can change from one interpreter session to the next
# (hash randomisation), which would make the index assigned to each feature
# unstable between runs.
sound_features = sorted(set(sound_features))

In [64]:
# Map every feature tag to its position in sound_features; that position is
# used as the feature's row index in sound_matrix.
sound_dic = {feature: row for row, feature in enumerate(sound_features)}

In [65]:
# Number of components in each random feature vector (second axis of
# sound_matrix) and the length of the index permutations E1 / E2.
dim = 300

In [66]:
# One random vector per feature tag (rows), drawn from a normal distribution
# with mean 0 and standard deviation 1/sqrt(dim), stored as float32.
# Note: no random seed is set in the visible cells, so values differ per run.
sound_matrix = np.random.normal(
    0,
    1 / np.sqrt(dim),
    size=(len(sound_dic), dim),
).astype(np.float32)

In [67]:
def conv_circ(signal, ker):
    """Circular convolution of two equal-length 1-D sequences via the FFT.

    Both inputs are transformed with ``np.fft.fft``, multiplied element-wise,
    transformed back with ``np.fft.ifft``, and the real part of the result is
    returned.
    """
    signal_spectrum = np.fft.fft(signal)
    kernel_spectrum = np.fft.fft(ker)
    convolved = np.fft.ifft(signal_spectrum * kernel_spectrum)
    return convolved.real

In [68]:
# Two independent random permutations of the indices 0 .. dim-1. They are used
# to reorder the components of the left / right feature vector of a bigram
# before the two vectors are convolved.
nums = np.array(range(dim))
E1, E2 = np.random.permutation(nums), np.random.permutation(nums)

In [69]:
# Build one vector per word: the sum, over all of the word's feature bigrams,
# of the circular convolution of the E1-permuted vector of the first feature
# with the E2-permuted vector of the second feature.
#
# The convolution depends only on the (first, second) feature pair, so each
# distinct pair is computed once and reused; the accumulated sums are the same
# as when recomputing it for every occurrence.
pair_vectors = {}
word_matrix = []
for word in feature_bigrams:
    # Use `dim` (not a literal 300) so the accumulator always matches the
    # length of the feature vectors.
    phon_vec = np.zeros(dim, dtype=np.float32)
    for pair in feature_bigrams[word]:
        bound = pair_vectors.get(pair)
        if bound is None:
            sound_one = sound_matrix[sound_dic[pair[0]]][E1]
            sound_two = sound_matrix[sound_dic[pair[1]]][E2]
            bound = conv_circ(sound_one, sound_two)
            pair_vectors[pair] = bound
        phon_vec += bound
    word_matrix.append(phon_vec)

In [78]:
# Stack the per-word vectors into a single float32 array with one row per word,
# in the order the vectors were appended.
# Note: this rebinds word_matrix from a list to an ndarray.
word_matrix = np.array(word_matrix, dtype=np.float32)

In [70]:
# word_list holds the words in feature_bigrams order; word_dic maps each word
# to its position in that list (the row built for it in word_matrix).
word_list = list(feature_bigrams)
word_dic = {word: row for row, word in enumerate(word_list)}

In [71]:
def sel_second(x):
    """Return the element at index 1 of ``x`` (used as a sort key)."""
    second = x[1]
    return second

In [74]:
def normalize_vec(x):
    """Scale ``x`` to unit Euclidean length.

    If the sum of squares is not positive (e.g. an all-zero vector), an
    unscaled copy of ``x`` is returned instead. The input is never modified.
    """
    squared_norm = np.sum(x**2)
    if not squared_norm > 0:
        return x.copy()
    return x / np.sqrt(squared_norm)

In [75]:
def normalize_mat(x):
    """Scale every row of the 2-D array ``x`` to unit Euclidean length.

    Rows whose norm is zero are returned unchanged (all zeros) rather than
    being divided by zero, matching how ``normalize_vec`` treats a zero
    vector. The input array is not modified.

    Returns an array of the same shape as ``x``.
    """
    # keepdims=True gives shape (rows, 1) so the division broadcasts per row.
    mag = np.sqrt(np.sum(x**2, axis=1, keepdims=True))
    # Dividing a zero row by 1 leaves it as zeros instead of producing NaN.
    mag[mag == 0] = 1
    return x / mag

In [76]:
def find_neighbors(word, word_matrix, word_dic, n=20):
    """Return the ``n`` words most similar to ``word``.

    Similarity is the dot product between the length-normalised vector of
    ``word`` and every length-normalised row of ``word_matrix``.

    Parameters:
        word: key of the query word in ``word_dic``.
        word_matrix: 2-D array with one vector per word.
        word_dic: mapping from word to its row index in ``word_matrix``.
        n: number of results to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by descending
        similarity, truncated to ``n`` entries.

    Raises:
        KeyError: if ``word`` is not a key of ``word_dic``.
    """
    query = normalize_vec(word_matrix[word_dic[word]])
    sim = query @ np.transpose(normalize_mat(word_matrix))
    # Look each score up by the row index stored in word_dic instead of
    # assuming that dict iteration order matches row order.
    sim_list = [(candidate, sim[row]) for candidate, row in word_dic.items()]
    sim_list.sort(key=sel_second, reverse=True)
    return sim_list[:n]

In [83]:
# Example query: the 20 entries most similar to 'football'. In the recorded
# output the first hit is the query word itself with similarity 1.0.
find_neighbors('football', word_matrix, word_dic, n=20)

[('football', 1.0),
 ("football's", 0.9297924),
 ('footballs', 0.9297924),
 ('footfall', 0.77157676),
 ('foot', 0.7509226),
 ('foote', 0.7509226),
 ('footwall', 0.7390779),
 ('full', 0.7340092),
 ('vuitton', 0.730436),
 ('fulmore', 0.714161),
 ('fulp', 0.7121511),
 ('spitball', 0.7110773),
 ('foothold', 0.70784193),
 ('spoonful', 0.7062919),
 ('foal', 0.7054054),
 ('foale', 0.7054054),
 ('fohl', 0.7054054),
 ('pfohl', 0.7054054),
 ('photocall', 0.70331013),
 ('footman', 0.6972866)]