In [91]:
import panphon
import panphon.distance
from fastdtw import fastdtw
import numpy as np
from scipy.spatial.distance import euclidean

In [92]:
TARGET = 'ɔliŋkɑɹdsʔɑɹðəweɪvəvðifjutʃɹ'
TARGET_BY_WORDS = [
    ('Calling', 'ɔliŋ'),
    ('cards', 'kɑɹdsʔ'),
    ('are', 'ɑɹ'),
    ('the', 'ðə'),
    ('wave', 'weɪv'),
    ('of', 'əv'),
    ('the', 'ði'),
    ('future', 'fjutʃɹ'),
]
# SPEECH = 'ɔɪliŋtɑtsəvðəweɪfəmðifjuttʌ'
SPEECH = 'aɪɹniŋkɑtsʌðəweɪvʌvðɛijoʊttʃʌ'

In [93]:
# Create a panphon feature table
ft = panphon.FeatureTable()

# Compute similarity between two phonemes
def phoneme_similarity(phoneme1, phoneme2):
    return panphon.distance.Distance().weighted_feature_edit_distance(phoneme1, phoneme2)

# Convert a phoneme to a numerical feature vector
def phoneme_to_vector(phoneme):
    vectors = ft.word_to_vector_list(phoneme, numeric=True)
    if vectors:
        return np.array(vectors[0])  # Take the first vector if multiple exist
    else:
        return None  # Invalid phoneme

# Convert sequences of phonemes to sequences of vectors
def sequence_to_vectors(seq):
    return [phoneme_to_vector(p) for p in seq if phoneme_to_vector(p) is not None]


In [94]:
def needleman_wunsch(seq1, seq2, similarity_func, gap_penalty=-1):
    n, m = len(seq1), len(seq2)
    dp = np.zeros((n + 1, m + 1))
    
    # Initialize DP table
    for i in range(n + 1):
        dp[i][0] = i * gap_penalty
    for j in range(m + 1):
        dp[0][j] = j * gap_penalty
    
    # Fill DP table
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match = dp[i - 1][j - 1] + similarity_func(seq1[i - 1], seq2[j - 1])
            delete = dp[i - 1][j] + gap_penalty
            insert = dp[i][j - 1] + gap_penalty
            dp[i][j] = max(match, delete, insert)
    
    # Traceback to get alignment
    i, j = n, m
    aligned_seq1, aligned_seq2 = [], []
    
    while i > 0 or j > 0:
        current = dp[i][j]
        if i > 0 and j > 0 and current == dp[i - 1][j - 1] + similarity_func(seq1[i - 1], seq2[j - 1]):
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and current == dp[i - 1][j] + gap_penalty:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append('-')
            i -= 1
        else:
            aligned_seq1.append('-')
            aligned_seq2.append(seq2[j - 1])
            j -= 1
    
    return ''.join(reversed(aligned_seq1)), ''.join(reversed(aligned_seq2))


In [95]:
needleman_wunsch('phat', 'bat', phoneme_similarity)

('phat', 'bat-')

In [96]:
def fastdtw_phoneme_alignment(seq1, seq2):
    # Convert phoneme sequences to feature vector sequences
    seq1_vectors = sequence_to_vectors(seq1)
    seq2_vectors = sequence_to_vectors(seq2)
    
    if not seq1_vectors or not seq2_vectors:
        raise ValueError("One or both sequences could not be converted to feature vectors.")
    
    # Use FastDTW with Euclidean distance on the vectors
    distance, path = fastdtw(seq1_vectors, seq2_vectors, dist=euclidean)
    
    # Align the original phoneme sequences based on the path
    aligned_seq1 = []
    aligned_seq2 = []
    for i, j in path:
        aligned_seq1.append(seq1[i] if i < len(seq1) else '-')
        aligned_seq2.append(seq2[j] if j < len(seq2) else '-')
    
    return ''.join(aligned_seq1), ''.join(aligned_seq2)

In [97]:
fastdtw_phoneme_alignment(TARGET, SPEECH)

('ɔɔɔliŋkɑɹdsʔɑɹðəweɪvəvðiifjuuttʃɹ', 'aɪɹniŋkɑɑtsʌʌʌðəweɪvʌvðɛijjoʊttʃʌ')

In [98]:
for pair in zip(*fastdtw_phoneme_alignment(TARGET, SPEECH)):
    print(pair)

('ɔ', 'a')
('ɔ', 'ɪ')
('ɔ', 'ɹ')
('l', 'n')
('i', 'i')
('ŋ', 'ŋ')
('k', 'k')
('ɑ', 'ɑ')
('ɹ', 'ɑ')
('d', 't')
('s', 's')
('ʔ', 'ʌ')
('ɑ', 'ʌ')
('ɹ', 'ʌ')
('ð', 'ð')
('ə', 'ə')
('w', 'w')
('e', 'e')
('ɪ', 'ɪ')
('v', 'v')
('ə', 'ʌ')
('v', 'v')
('ð', 'ð')
('i', 'ɛ')
('i', 'i')
('f', 'j')
('j', 'j')
('u', 'o')
('u', 'ʊ')
('t', 't')
('t', 't')
('ʃ', 'ʃ')
('ɹ', 'ʌ')


In [99]:
pair_by_words = []
pairs = iter(zip(*fastdtw_phoneme_alignment(TARGET, SPEECH)))
cur_pair = next(pairs)
start = []
for word, phons in TARGET_BY_WORDS:
    phons = list(phons)
    ps = start
    while len(phons) > 0:
        t, s = cur_pair
        if t != phons[0]:
            phons.pop(0)
        ps.append(cur_pair)
        try:
            cur_pair = next(pairs)
        except StopIteration:
            break
    pair_by_words.append((word, ps[:-1]))
    start = [ps[-1]]


In [100]:
pair_by_words

[('Calling',
  [('ɔ', 'a'), ('ɔ', 'ɪ'), ('ɔ', 'ɹ'), ('l', 'n'), ('i', 'i'), ('ŋ', 'ŋ')]),
 ('cards',
  [('k', 'k'), ('ɑ', 'ɑ'), ('ɹ', 'ɑ'), ('d', 't'), ('s', 's'), ('ʔ', 'ʌ')]),
 ('are', [('ɑ', 'ʌ'), ('ɹ', 'ʌ')]),
 ('the', [('ð', 'ð'), ('ə', 'ə')]),
 ('wave', [('w', 'w'), ('e', 'e'), ('ɪ', 'ɪ'), ('v', 'v')]),
 ('of', [('ə', 'ʌ'), ('v', 'v')]),
 ('the', [('ð', 'ð'), ('i', 'ɛ'), ('i', 'i')]),
 ('future',
  [('f', 'j'),
   ('j', 'j'),
   ('u', 'o'),
   ('u', 'ʊ'),
   ('t', 't'),
   ('t', 't'),
   ('ʃ', 'ʃ')])]

In [101]:
average_score = 0
for word, pairs in pair_by_words:
    cer = sum(1 for t, s in pairs if t != s) / len(pairs)
    seq1, seq2 = map(lambda x: ''.join(x), zip(*pairs))
    print(word, seq1, seq2, (1 - cer / 2))
    average_score += (1 - cer / 2)
average_score /= len(pair_by_words)
print('Average score:', average_score)

Calling ɔɔɔliŋ aɪɹniŋ 0.6666666666666667
cards kɑɹdsʔ kɑɑtsʌ 0.75
are ɑɹ ʌʌ 0.5
the ðə ðə 1.0
wave weɪv weɪv 1.0
of əv ʌv 0.75
the ðii ðɛi 0.8333333333333334
future fjuuttʃ jjoʊttʃ 0.7857142857142857
Average score: 0.7857142857142857


In [102]:
average_score = 0
for word, pairs in pair_by_words:
    seq1, seq2 = map(lambda x: ''.join(x), zip(*pairs))
    norm_score = (22 - panphon.distance.Distance().weighted_feature_edit_distance(seq1, seq2)) / 22
    print(word, seq1, seq2, norm_score**2)
    average_score += norm_score**2
average_score /= len(pair_by_words)
print('Average score:', average_score)

Calling ɔɔɔliŋ aɪɹniŋ 0.6418194731404959
cards kɑɹdsʔ kɑɑtsʌ 0.6192616864669422
are ɑɹ ʌʌ 0.7957450929752067
the ðə ðə 1.0
wave weɪv weɪv 1.0
of əv ʌv 0.9886686466942147
the ðii ðɛi 0.966199638429752
future fjuuttʃ jjoʊttʃ 0.6834404054752066
Average score: 0.8368918678977274


In [103]:
sound_descriptions = {
  "a": {
    "phonemicSpelling": "ah",
    "description": "An open front unrounded vowel. Open your mouth wide, position the tongue low and towards the front, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'father'."
  },
  "b": {
    "phonemicSpelling": "buh",
    "description": "A voiced bilabial stop. Press both lips together, then release while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'bat'."
  },
  "d": {
    "phonemicSpelling": "duh",
    "description": "A voiced alveolar stop. Place the tongue against the alveolar ridge, stop airflow, then release while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'dog'."
  },
  "e": {
    "phonemicSpelling": "ay",
    "description": "A close-mid front unrounded vowel. Keep the tongue mid-high and towards the front, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'say' (in non-rhotic accents)."
  },
  "f": {
    "phonemicSpelling": "fuh",
    "description": "A voiceless labiodental fricative. Place the upper teeth against the lower lip and push air through without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'fish'."
  },
  "h": {
    "phonemicSpelling": "huh",
    "description": "A voiceless glottal fricative. Push air through the open vocal cords without vibrating them.",
    "exampleWord": "This is the initial sound in 'hat'."
  },
  "i": {
    "phonemicSpelling": "ee",
    "description": "A close front unrounded vowel. Raise the tongue high and towards the front, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'see'."
  },
  "j": {
    "phonemicSpelling": "yuh",
    "description": "A voiced palatal approximant. Place the tongue close to the hard palate without touching, and vibrate the vocal cords.",
    "exampleWord": "This is the initial sound in 'yes'."
  },
  "k": {
    "phonemicSpelling": "kuh",
    "description": "A voiceless velar stop. Place the back of the tongue against the soft palate, stop airflow, then release without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'cat'."
  },
  "l": {
    "phonemicSpelling": "luh",
    "description": "A voiced alveolar lateral approximant. Place the tongue against the alveolar ridge, allowing air to pass along the sides while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'leaf'."
  },
  "m": {
    "phonemicSpelling": "muh",
    "description": "A voiced bilabial nasal. Press both lips together, and let air pass through the nose while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'man'."
  },
  "n": {
    "phonemicSpelling": "nuh",
    "description": "A voiced alveolar nasal. Place the tongue against the alveolar ridge, and let air pass through the nose while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'net'."
  },
  "o": {
    "phonemicSpelling": "oh",
    "description": "A close-mid back rounded vowel. Round the lips, keep the tongue mid-high and towards the back, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'go' (in non-rhotic accents)."
  },
  "p": {
    "phonemicSpelling": "puh",
    "description": "A voiceless bilabial stop. Press both lips together, then release without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'pat'."
  },
  "s": {
    "phonemicSpelling": "sss",
    "description": "A voiceless alveolar fricative. Place the tongue near the alveolar ridge, and push air through without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'sit'."
  },
  "t": {
    "phonemicSpelling": "tuh",
    "description": "A voiceless alveolar stop. Place the tongue against the alveolar ridge, stop airflow, then release without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'top'."
  },
  "u": {
    "phonemicSpelling": "oo",
    "description": "A close back rounded vowel. Round the lips, keep the tongue high and towards the back, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'blue'."
  },
  "v": {
    "phonemicSpelling": "vuh",
    "description": "A voiced labiodental fricative. Place the upper teeth against the lower lip, push air through, and vibrate the vocal cords.",
    "exampleWord": "This is the initial sound in 'van'."
  },
  "w": {
    "phonemicSpelling": "wuh",
    "description": "A voiced labio-velar approximant. Round the lips and raise the back of the tongue towards the soft palate while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'win'."
  },
  "z": {
    "phonemicSpelling": "zzz",
    "description": "A voiced alveolar fricative. Place the tongue near the alveolar ridge, push air through, and vibrate the vocal cords.",
    "exampleWord": "This is the initial sound in 'zebra'."
  },
  "\u00e6": {
    "phonemicSpelling": "ah",
    "description": "A near-open front unrounded vowel. Open your mouth widely and position the tongue low and towards the front.",
    "exampleWord": "This is the vowel sound in 'cat'."
  },
  "\u00f0": {
    "phonemicSpelling": "th",
    "description": "A voiced dental fricative. Place the tongue between the teeth, push air through, and vibrate the vocal cords.",
    "exampleWord": "This is the initial sound in 'this'."
  },
  "\u014b": {
    "phonemicSpelling": "ng",
    "description": "A voiced velar nasal. Place the back of the tongue against the soft palate, and let air pass through the nose while vibrating the vocal cords.",
    "exampleWord": "This is the final sound in 'sing'."
  },
  "\u0251": {
    "phonemicSpelling": "ah",
    "description": "An open back unrounded vowel. Open your mouth wide, position the tongue low and towards the back, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in 'spa' (in non-rhotic accents)."
  },
  "\u0254": {
    "phonemicSpelling": "aw",
    "description": "An open-mid back rounded vowel. Round the lips and lower the tongue towards the back.",
    "exampleWord": "This is the vowel sound in 'thought' (in non-rhotic accents)."
  },
  "\u0259": {
    "phonemicSpelling": "uh",
    "description": "A mid-central unrounded vowel. Keep the tongue relaxed and central, and vibrate the vocal cords.",
    "exampleWord": "This is the vowel sound in the first syllable of 'about'."
  },
  "\u025b": {
    "phonemicSpelling": "eh",
    "description": "An open-mid front unrounded vowel. Lower the tongue slightly towards the front.",
    "exampleWord": "This is the vowel sound in 'bed'."
  },
  "\u0261": {
    "phonemicSpelling": "guh",
    "description": "A voiced velar stop. Place the back of the tongue against the soft palate, stop airflow, then release while vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'go'."
  },
  "\u026a": {
    "phonemicSpelling": "ih",
    "description": "A near-close front unrounded vowel. Raise the tongue high and towards the front, but not as close as /i/.",
    "exampleWord": "This is the vowel sound in 'sit'."
  },
  "\u0279": {
    "phonemicSpelling": "ruh",
    "description": "A voiced alveolar approximant. Curl the tongue towards the alveolar ridge without touching, and vibrate the vocal cords.",
    "exampleWord": "This is the initial sound in 'red'."
  },
  "\u027e": {
    "phonemicSpelling": "flap",
    "description": "A voiced alveolar tap. Quickly tap the tongue against the alveolar ridge.",
    "exampleWord": "This is the middle sound in 'butter' (in American English)."
  },
  "\u0283": {
    "phonemicSpelling": "sh",
    "description": "A voiceless postalveolar fricative. Place the tongue near the roof of the mouth, just behind the alveolar ridge, and push air through.",
    "exampleWord": "This is the initial sound in 'shoe'."
  },
  "\u028a": {
    "phonemicSpelling": "uh",
    "description": "A near-close back rounded vowel. Round the lips and raise the tongue towards the back.",
    "exampleWord": "This is the vowel sound in 'put'."
  },
  "\u028c": {
    "phonemicSpelling": "uh",
    "description": "An open-mid back unrounded vowel. Lower the tongue towards the back and open the mouth slightly.",
    "exampleWord": "This is the vowel sound in 'cup'."
  },
  "\u0292": {
    "phonemicSpelling": "zh",
    "description": "A voiced postalveolar fricative. Place the tongue near the roof of the mouth, just behind the alveolar ridge, and push air through while vibrating the vocal cords.",
    "exampleWord": "This is the middle sound in 'measure'."
  },
  "\u0294": {
    "phonemicSpelling": "glottal stop",
    "description": "A voiceless glottal stop. Close the vocal cords briefly, then release to produce a stop sound.",
    "exampleWord": "This is the catch in the middle of 'uh-oh'."
  },
  "\u03b8": {
    "phonemicSpelling": "th",
    "description": "A voiceless dental fricative. Place the tongue between the teeth and push air through without vibrating the vocal cords.",
    "exampleWord": "This is the initial sound in 'think'."
  }
}


In [104]:
for word, pairs in pair_by_words:    
    wrongest_pair = pairs[0]
    wrongest_pair_dist = panphon.distance.Distance().weighted_feature_edit_distance(wrongest_pair[0], wrongest_pair[1])
    for p in pairs:
        dist = panphon.distance.Distance().weighted_feature_edit_distance(p[0], p[1])
        if dist > wrongest_pair_dist:
            wrongest_pair = p
            wrongest_pair_dist = dist
    print(wrongest_pair)
    if wrongest_pair_dist == 0:
        print('Your pronunciation of "' + word + '" is perfect!')
    else:
        target, speech = wrongest_pair
        t, s = sound_descriptions[target], sound_descriptions[speech]
        print(f"""You pronounced the "{t['phonemicSpelling']}" sound in {word} as the "{s['phonemicSpelling']}" sound.""")
        print(f"""It is supposed to be {t['description'][0].lower() + t['description'][1:]}""")
        print(t['exampleWord'])


('ɔ', 'ɹ')
You pronounced the "aw" sound in Calling as the "ruh" sound.
It is supposed to be an open-mid back rounded vowel. Round the lips and lower the tongue towards the back.
This is the vowel sound in 'thought' (in non-rhotic accents).
('ɹ', 'ɑ')
You pronounced the "ruh" sound in cards as the "ah" sound.
It is supposed to be a voiced alveolar approximant. Curl the tongue towards the alveolar ridge without touching, and vibrate the vocal cords.
This is the initial sound in 'red'.
('ɹ', 'ʌ')
You pronounced the "ruh" sound in are as the "uh" sound.
It is supposed to be a voiced alveolar approximant. Curl the tongue towards the alveolar ridge without touching, and vibrate the vocal cords.
This is the initial sound in 'red'.
('ð', 'ð')
Your pronunciation of "the" is perfect!
('w', 'w')
Your pronunciation of "wave" is perfect!
('ə', 'ʌ')
You pronounced the "uh" sound in of as the "uh" sound.
It is supposed to be a mid-central unrounded vowel. Keep the tongue relaxed and central, and vib