### Syllabification

In [1]:
# Phone inventories used by the rest of the notebook.
Vs = set(['ɝ', 'o', 'ʊ', 'ɑ', 'ɔ', 'ə', 'ɜ', 'æ', 'ɚ', 'u', 'e', 'ɒ', 'ɛ', 'i', 'a', 'ɪ', 'ʌ'])
# Both the IPA 'ɡ' and the ASCII 'g' are listed, so either spelling counts as a consonant.
Cs = set(['p', 't', 'k', 'b', 'd', 'ɡ','g', 'f', 'θ', 's', 'ʃ', 'h', 'v',
      'ð', 'z', 'ʒ', 'ʧ', 'ʤ', 'l', 'm', 'n', 'r', 'w', 'j','ŋ'])

diphtongs = set(['aʊ','aɪ', 'eɪ', 'oʊ','ɔɪ'])
symbols = set(['ː','ˈ', 'ˌ'])


def _word_initial_onset(pron):
    """Return the word-initial consonant cluster of one IPA pronunciation.

    The cluster is everything before the first vowel or the first word-internal
    stress mark, whichever comes first. Returns None when the pronunciation
    holds no vowel at all, so callers can skip such entries instead of crashing.
    """
    pron = pron.strip()
    # delete the stress mark that opens (or closes) the word
    pron = pron.strip('ˈ')
    pron = pron.strip('ˌ')
    # Use one symbol per affricate, drop length marks, spell 'ɝ' as vowel + r.
    pron = pron.replace("tʃ", "ʧ").replace("ː", '').replace("ɝ", "ɜr").replace("dʒ", "ʤ")
    if not any(char in Vs for char in pron):
        return None
    for idx, char in enumerate(pron):
        # Scan for the cut point directly. Combining two str.find() results with
        # min() yields -1 as soon as one of the two stress marks is absent.
        if char in Vs or char in ('ˈ', 'ˌ'):
            return pron[:idx]
    return None


# get all onsets
all_onsets = set()
# The CMUdict IPA file needs to be downloaded to this path first.
with open('cmudict-ipa/cmudict-0.7b-ipa.txt','r', encoding='utf-8') as f:
    for line in f:
        # The last tab-separated field holds the pronunciations; keep the first variant.
        onset = _word_initial_onset(line.split('\t')[-1].split(", ")[0])
        # Vowel-initial words give '' and vowel-less lines give None: neither is an onset.
        if onset:
            all_onsets.add(onset)

In [2]:
def syllabify(phoneme, vowels=None, diphthongs=None, onsets=None):
    """Split one IPA-transcribed word into syllables.

    Each vowel (or diphthong) is a nucleus. The consonants between two nuclei
    are divided as follows:
      * if the cluster contains a stress mark, the syllable boundary is placed
        right before the first such mark;
      * otherwise the longest suffix of the cluster found in ``onsets`` becomes
        the onset of the next syllable (onset maximization) and the remainder
        is the coda of the current one;
      * if no suffix is a known onset, the whole cluster is kept as coda.

    Parameters
    ----------
    phoneme : str
        Transcription of a single word, optionally containing 'ˈ' / 'ˌ'.
    vowels, diphthongs, onsets : collection of str, optional
        Phone inventories. When omitted, the notebook-level ``Vs``,
        ``diphtongs`` and ``all_onsets`` are used.

    Returns
    -------
    list of str
        The syllables in order; their concatenation equals ``phoneme``.
        A word without any vowel yields an empty list.
    """
    # Fall back to the notebook-level inventories only when none were supplied.
    vowels = Vs if vowels is None else vowels
    diphthongs = diphtongs if diphthongs is None else diphthongs
    onsets = all_onsets if onsets is None else onsets

    vowel_pos = [idx for idx, char in enumerate(phoneme) if char in vowels]
    syllables = []
    # Everything before the first vowel is the onset of the first syllable.
    onset = phoneme[:vowel_pos[0]] if vowel_pos else ""
    i = 0
    while i < len(vowel_pos):
        start = vowel_pos[i]
        # Two adjacent vowel symbols forming a known diphthong share one nucleus.
        if (i + 1 < len(vowel_pos)
                and vowel_pos[i + 1] == start + 1
                and phoneme[start:start + 2] in diphthongs):
            i += 1
        end = vowel_pos[i]
        syllable = onset + phoneme[start:end + 1]

        # Last nucleus: all trailing material is its coda.
        if i + 1 == len(vowel_pos):
            syllables.append(syllable + phoneme[end + 1:])
            break

        cluster = phoneme[end + 1:vowel_pos[i + 1]]
        # Primary stress is looked for first, then secondary stress.
        stress = next((mark for mark in ('ˈ', 'ˌ') if mark in cluster), None)
        if stress is not None:
            # partition() cuts at the first mark and cannot fail on a repeated
            # mark, unlike unpacking the result of split().
            coda, _, rest = cluster.partition(stress)
            onset = stress + rest
        else:
            # onset maximization: try the longest suffix first
            for j in range(len(cluster)):
                if cluster[j:] in onsets:
                    coda, onset = cluster[:j], cluster[j:]
                    break
            else:
                # No legal onset (or an empty cluster): everything stays in the coda.
                coda, onset = cluster, ""
        syllables.append(syllable + coda)
        i += 1
    return syllables

        


In [3]:
syllabify("ˈnɔrðərn")

['ˈnɔr', 'ðərn']

### Consonantal Phonetic Coverage Table

In [4]:
# Consonants that get a row in the coverage table, in row order.
target = ['p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'ʧ', 'ʤ', 'm', 'n', 'ŋ', 'w', 'j', 'l', 'r']
# Set view of the same consonants for membership tests.
target_set = set(target)
# Consonant -> row position.
target2index = dict(zip(target, range(len(target))))

In [5]:
import pandas as pd
from collections import Counter
# Column-name infix for each word class / stress condition, in table order.
_STAT_LABELS = ("mono", "multi_stressed", "multi_unstressed")


def _stat_columns():
    """Return the coverage-table column names in display order."""
    layout = (("initial", ("v", "c")),
              ("final", ("v", "c")),
              ("medial", ("v_c", "c_v", "v_v", "c_c")))
    columns = []
    for position, contexts in layout:
        for context in contexts:
            for label in _STAT_LABELS:
                # Monosyllables are never tallied for the V_V context.
                if context == "v_v" and label == "mono":
                    continue
                columns.append("_".join((position, label, context)))
    return columns


def _tally_word(phones, labels, counts, count_v_v):
    """Add the consonant contexts of one stress-free word to ``counts``.

    ``labels[i]`` is the column infix (one of ``_STAT_LABELS``) that applies to
    ``phones[i]``. ``count_v_v`` switches the V_V medial context on or off.
    """
    last = len(phones) - 1
    # A word of fewer than two phones has no neighbouring phone to classify.
    if last < 1:
        return
    for i, phone in enumerate(phones):
        if phone not in target_set:
            continue
        label = labels[i]
        if i == 0:
            # word initial: #_V or #_C
            if phones[1] in Vs:
                counts["_".join(("initial", label, "v"))][phone] += 1
            elif phones[1] in Cs:
                counts["_".join(("initial", label, "c"))][phone] += 1
        elif i == last:
            # word final: V_# or C_#
            if phones[-2] in Vs:
                counts["_".join(("final", label, "v"))][phone] += 1
            elif phones[-2] in Cs:
                counts["_".join(("final", label, "c"))][phone] += 1
        else:
            # word medial: classify both neighbours
            prev_v, prev_c = phones[i - 1] in Vs, phones[i - 1] in Cs
            next_v, next_c = phones[i + 1] in Vs, phones[i + 1] in Cs
            if prev_v and next_c:
                counts["_".join(("medial", label, "v_c"))][phone] += 1
            if prev_c and next_v:
                counts["_".join(("medial", label, "c_v"))][phone] += 1
            if prev_c and next_c:
                counts["_".join(("medial", label, "c_c"))][phone] += 1
            if count_v_v and prev_v and next_v:
                counts["_".join(("medial", label, "v_v"))][phone] += 1


def get_stat(data):
    """Build the consonantal phonetic coverage table for a syllabified text.

    Parameters
    ----------
    data : iterable of list of str
        One list of syllables per word (the output of ``syllabify``). A word
        with exactly one syllable is "mono"; any other word is "multi". In a
        multi-syllable word, a phone is "stressed" when its syllable contains
        the primary stress mark 'ˈ'; secondary stress counts as unstressed.

    Returns
    -------
    pandas.DataFrame
        One row per consonant in ``target``, one column per
        position / word class / neighbouring-context combination. Each cell is
        the number of occurrences. Cells for combinations listed as not allowed
        are NaN, which makes those columns float; the V_V columns stay integer.

    Notes
    -----
    Reads the notebook-level ``target``, ``target_set``, ``Vs`` and ``Cs``.
    Also raises pandas' ``display.max_columns`` option to 500 so the wide
    table is rendered in full.
    """
    columns = _stat_columns()
    counts = {column: Counter() for column in columns}

    for syls in data:
        if len(syls) == 1:
            # monosyllabic
            phones = syls[0].strip('ˈ').strip('ˌ')
            _tally_word(phones, ["mono"] * len(phones), counts, count_v_v=False)
        else:
            # multisyllabic: concatenate the syllables, remembering per phone
            # whether its syllable carried primary stress
            phones, labels = "", []
            for syl in syls:
                label = "multi_stressed" if 'ˈ' in syl else "multi_unstressed"
                syl = syl.strip('ˈ').strip('ˌ')
                phones += syl
                labels.extend([label] * len(syl))
            _tally_word(phones, labels, counts, count_v_v=True)

    pd.set_option('display.max_columns', 500)
    df = pd.DataFrame(
        {column: [counts[column].get(el, 0) for el in target] for column in columns},
        index=target,
        columns=columns,
    )

    # not allowed: (position, context, consonants that cannot occur there)
    not_allowed = [
        ("initial", "v", ['ʒ', 'ŋ']),
        ("initial", "c", ['ʒ', 'h', 'ŋ', 'w', 'j', 'l', 'r']),
        ("final", "v", ['h', 'w', 'j']),
        ("final", "c", ['ʒ', 'h', 'ŋ', 'w', 'j', 'r']),
        ("medial", "v_c", ['ʒ', 'h', 'w', 'j']),
        ("medial", "c_v", ['ŋ']),
        ("medial", "c_c", ['ʒ', 'h', 'ŋ', 'w', 'j', 'r']),
    ]
    masks = [(rows, ["_".join((position, label, context)) for label in _STAT_LABELS])
             for position, context, rows in not_allowed]
    # Cast explicitly before writing NaN. Relying on pandas to upcast an integer
    # column on assignment is deprecated behaviour.
    df = df.astype({column: float for _, cols in masks for column in cols})
    for rows, cols in masks:
        df.loc[rows, cols] = float("nan")

    return df

In [6]:
def passage_stats(path):
    """Return the coverage table of one phonemic transcript file.

    The file at ``path`` is read as UTF-8, split on whitespace into words,
    and each word is syllabified before the contexts are tallied by get_stat.
    """
    with open(path, 'r', encoding='utf-8') as f:
        words = f.read().split()
    return get_stat([syllabify(word) for word in words])


# One table per reading passage.
Caterpillar = passage_stats('texts/Caterpillar_ph.txt')
Grandfather = passage_stats('texts/Grandfather_ph.txt')
Rainbow = passage_stats('texts/Rainbow_ph.txt')



In [7]:
Caterpillar

Unnamed: 0,initial_mono_v,initial_multi_stressed_v,initial_multi_unstressed_v,initial_mono_c,initial_multi_stressed_c,initial_multi_unstressed_c,final_mono_v,final_multi_stressed_v,final_multi_unstressed_v,final_mono_c,final_multi_stressed_c,final_multi_unstressed_c,medial_mono_v_c,medial_multi_stressed_v_c,medial_multi_unstressed_v_c,medial_mono_c_v,medial_multi_stressed_c_v,medial_multi_unstressed_c_v,medial_multi_stressed_v_v,medial_multi_unstressed_v_v,medial_mono_c_c,medial_multi_stressed_c_c,medial_multi_unstressed_c_c
p,3.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,7,2.0,0.0,0.0
b,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2,0.0,0.0,0.0
t,12.0,2.0,0.0,4.0,0.0,0.0,10.0,0.0,0.0,9.0,0.0,4.0,0.0,0.0,2.0,1.0,1.0,6.0,0,7,0.0,1.0,0.0
d,2.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,9.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2,0.0,0.0,0.0
k,5.0,6.0,0.0,1.0,1.0,0.0,3.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,1.0,0,1,3.0,0.0,0.0
g,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,0.0,0.0
f,5.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,0.0
v,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,0.0,0.0
θ,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
ð,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0


In [8]:
Rainbow

Unnamed: 0,initial_mono_v,initial_multi_stressed_v,initial_multi_unstressed_v,initial_mono_c,initial_multi_stressed_c,initial_multi_unstressed_c,final_mono_v,final_multi_stressed_v,final_multi_unstressed_v,final_mono_c,final_multi_stressed_c,final_multi_unstressed_c,medial_mono_v_c,medial_multi_stressed_v_c,medial_multi_unstressed_v_c,medial_mono_c_v,medial_multi_stressed_c_v,medial_multi_unstressed_c_v,medial_multi_stressed_v_v,medial_multi_unstressed_v_v,medial_mono_c_c,medial_multi_stressed_c_c,medial_multi_unstressed_c_c
p,4.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,1.0,4,3,0.0,2.0,2.0
b,14.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,11.0,2,0,0.0,0.0,0.0
t,11.0,1.0,0.0,1.0,0.0,0.0,19.0,3.0,1.0,3.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,0,4,1.0,0.0,2.0
d,0.0,1.0,2.0,2.0,0.0,0.0,7.0,0.0,2.0,13.0,4.0,3.0,4.0,0.0,0.0,0.0,0.0,1.0,1,2,3.0,1.0,2.0
k,1.0,5.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,4.0,1.0,0.0,0.0,1,7,0.0,2.0,0.0
g,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
f,9.0,3.0,2.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1,2,0.0,0.0,0.0
v,0.0,2.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3,0.0,1.0,0.0
θ,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,0.0,0.0
ð,50.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0.0,0.0


In [9]:
Grandfather

Unnamed: 0,initial_mono_v,initial_multi_stressed_v,initial_multi_unstressed_v,initial_mono_c,initial_multi_stressed_c,initial_multi_unstressed_c,final_mono_v,final_multi_stressed_v,final_multi_unstressed_v,final_mono_c,final_multi_stressed_c,final_multi_unstressed_c,medial_mono_v_c,medial_multi_stressed_v_c,medial_multi_unstressed_v_c,medial_mono_c_v,medial_multi_stressed_c_v,medial_multi_unstressed_c_v,medial_multi_stressed_v_v,medial_multi_unstressed_v_v,medial_mono_c_c,medial_multi_stressed_c_c,medial_multi_unstressed_c_c
p,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1,1,0.0,0.0,0.0
b,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.0,0.0,0.0
t,5.0,0.0,0.0,1.0,1.0,0.0,3.0,1.0,0.0,5.0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,2.0,0,1,0.0,2.0,0.0
d,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,2.0,0.0
k,1.0,0.0,0.0,2.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,0.0,1.0,0.0,0,0,1.0,0.0,0.0
g,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,0.0,1.0
f,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0,2,0.0,0.0,0.0
v,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,3,0.0,0.0,0.0
θ,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
ð,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.0,0.0,0.0


In [10]:
# Coverage reached by the three passages together.
# Placing the tables side by side and transposing gives one row per
# (passage, context column); the context name ends up in the index and is
# repeated once per passage.
df = pd.concat([Caterpillar, Rainbow, Grandfather], axis=1).T
df = df.reset_index()
# Group the rows that share a context name: a (context, consonant) cell counts
# as covered when any passage has a positive count. The number of covered
# cells is divided by 466, the same value the next cell stores in `allowed`.
df.groupby('index').agg(lambda x: (x>0).any()).sum().sum()/466

0.5042918454935622

In [11]:
# Denominator of the coverage ratios computed in the following cells.
# NOTE(review): get_stat blanks 87 of the 24 x 23 = 552 table cells, which
# leaves 465 non-NaN cells. Confirm where 466 comes from.
allowed = 466

In [12]:
(Caterpillar>0).sum().sum()/allowed

0.2553648068669528

In [13]:
(Grandfather>0).sum().sum()/allowed

0.2939914163090129

In [14]:
(Rainbow>0).sum().sum()/allowed

0.37124463519313305

### Calculate statistics from IWSLT phonemes

In [15]:
import pickle
#read iwslt phonemes from pickle
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load an iwslt_phonemes.pkl that comes from a trusted source.
# The next cell calls .split() on the result, so presumably it is one
# whitespace-separated string of transcribed words. Verify against the producer.
with open("iwslt_phonemes.pkl",'rb') as f:
    res_phonemes = pickle.load(f)

In [16]:
# Syllabify every whitespace-separated IWSLT word and tally its consonant contexts.
data = [syllabify(phoneme) for phoneme in res_phonemes.split()]
iwslt = get_stat(data)

In [17]:
(iwslt>0).sum().sum()/466

0.9463519313304721

In [18]:
# Turn the IWSLT counts into relative frequencies: each cell is divided by the
# grand total of the table, so the cells sum to 1 (checked in the next cell).
iwslt_norm=iwslt/iwslt.sum().sum()

In [19]:
iwslt_norm.sum().sum()

1.0

In [20]:
iwslt_norm.sum().sum()

1.0

In [21]:
((Caterpillar>0).values*iwslt_norm).sum().sum()

0.730321622422474

In [22]:
((Rainbow>0).values*iwslt_norm).sum().sum()

0.8634506462351713

In [23]:
((Grandfather>0).values*iwslt_norm).sum().sum()

0.7294469418174363

### FITI

In [24]:
FITI_orig = """1. An American action is about to achieve an administration that will embrace organization with authority.
2. The armed attack along the problem zone can establish spread of control.
3. The athletic baseball network can then decide the business in the camp quite firmly.
4. The Chinese approach to bring the assembly growths has hardly aged.
5. The other difficult colleague will perhaps confront my behavior and enjoy it.
6. He did explain, with little enthusiasm, about the effect of the text that we know can transmit information.
7. Did he suggest the result of the changed disease was largely specific to this location?
8. Did congress really forget their major decision today and expect to find no conflict?
9. The genre of organized bureaucracy is increasingly used material but is still harsh work to understand.
10. My Arab friend searched all day for a sandwich in his suburb that has traditional herbs his brother and family want.
11. The author will create a dramatic enough exchange for the class to consider.
12. What great alcohol should we serve this month at the museum, and how much should we get?
13. Can Mr. Snow go capture a photograph of the large sphere and its energy when they both appear into view?
14. Do these political jobs involve negotiation and reflect the platform we researched?
15. Maybe the northern community will produce a global expert to help describe this condition.
16. The monthly challenge that the freshman athlete had to perform never pushed him to fatigue.
17. Whenever the united government is challenged, do the majority of people shrug in frustration?
18. I think their touchdown may threaten our school record, and therefore their triumph is important.
19. The general insurance company sometimes has say in the formula they use for drug sales.
20. How in the world is this healthy herb involved in research that turns its development into a push for policy?
21. For months, we watched the congressional machine that serves this country discuss drugs.
22. What are the odds his girlfriend would agree to try golf at the hotel this year?
23. A strong job interview experience actually breathes life into my morning.
24. The teacher will develop the subject of language patterns within financial management to include in our future program.
25. His very public remark about the establishment and their search for technology could make him look concerned.
26. Simply put this treatment around your arm, forehead, and shoulder and let it absorb.
27. The singer usually only gives one small and one big performance in a single afternoon.
28. I believe that in theory, this version of the graduation service can always continue.
29. An increase in garage depth would provide exactly the right place to keep the hardware.
30. This theoretical question would engage the student and turn him away from a number of others.
31. He won’t escape judgment; a screen for the genetic pattern will emerge and become legal fact.
32. Did you happen to speak to the girl as her birthday approached, to address how different it will be?
33. As a viewer, my observation is that there is no market left even here for a variety show.
34. As president of the movement, if he can remain in office this whole period, he gets an executive driver.
35. The image is of a woman at an age just as she finds her own individual voice.
36. We can begin to breathe once the system can receive and also return good water.
37. The thing is, if you give him a minute, the child can throw in time to the music."""

import re

# Remove the leading "N. " enumeration from every line before joining them.
# A fixed 3-character slice only fits one-digit numbers and leaves a stray
# space in front of sentences 10-37.
FITI_orig = " ".join(re.sub(r"^\d+\.\s*", "", sent) for sent in FITI_orig.split("\n"))
FITI_orig

'An American action is about to achieve an administration that will embrace organization with authority. The armed attack along the problem zone can establish spread of control. The athletic baseball network can then decide the business in the camp quite firmly. The Chinese approach to bring the assembly growths has hardly aged. The other difficult colleague will perhaps confront my behavior and enjoy it. He did explain, with little enthusiasm, about the effect of the text that we know can transmit information. Did he suggest the result of the changed disease was largely specific to this location? Did congress really forget their major decision today and expect to find no conflict? The genre of organized bureaucracy is increasingly used material but is still harsh work to understand.  My Arab friend searched all day for a sandwich in his suburb that has traditional herbs his brother and family want.  The author will create a dramatic enough exchange for the class to consider.  What gre

In [25]:
import re
FITI_phonemes = """1.	ən əˈmɛrəkən ˈækʃən ɪz əˈbaʊt tu əˈʧiv ən ædˌmɪnɪˈstreɪʃən ðæt wɪl ɛmˈbreɪs ˌɔrɡənəˈzeɪʃən wɪð əˈθɔrəti.
2.	ði ɑrmd əˈtæk əˈlɔŋ ðə ˈprɑbləm zoʊn kæn ɪˈstæblɪʃ sprɛd ʌv kənˈtroʊl.
3.	ði æˈθlɛtɪk ˈbeɪsˈbɔl ˈnɛˌtwɜrk kæn ðɛn ˌdɪˈsaɪd ðə ˈbɪznəs ɪn ðə kæmp kwaɪt ˈfɜrmli.
4.	ðə ʧaɪˈniz əˈproʊʧ tu brɪŋ ði əˈsɛmbli ɡroʊθs hæz ˈhɑrdli eɪʤd.
5.	ði ˈʌðər ˈdɪfəkəlt ˈkɑliɡ wɪl pərˈhæps kənˈfrʌnt maɪ bɪˈheɪvjər ænd ɛnˈʤɔɪ ɪt.
6.	hi dɪd ɪkˈspleɪn, wɪð ˈlɪtəl ɪnˈθuziˌæzəm, əˈbaʊt ði ɪˈfɛkt ʌv ðə tɛkst ðæt wi noʊ kæn trænzˈmɪt ˌɪnfərˈmeɪʃən.
7.	dɪd hi səɡˈʤɛst ðə rɪˈzʌlt ʌv ðə ʧeɪnʤd dɪˈziz wʌz ˈlɑrʤli spəˈsɪfɪk tu ðɪs loʊˈkeɪʃən?
8.	dɪd ˈkɑŋɡrəs ˈrɪli fərˈɡɛt ðɛr ˈmeɪʤər dɪˈsɪʒən təˈdeɪ ænd ɪkˈspɛkt tu faɪnd noʊ ˈkɑnflɪkt?
9.	ðə ˈʒɑnrə ʌv ˈɔrɡəˌnaɪzd bjʊˈrɑkrəsi ɪz ɪnˈkrisɪŋli juzd məˈtɪriəl bʌt ɪz stɪl hɑrʃ wɜrk tu ˌʌndərˈstænd.
10.	maɪ ˈærəb frɛnd sɜrʧt ɔl deɪ fɔr ə ˈsændwɪʧ ɪn hɪz ˈsʌbərb ðæt hæz trəˈdɪʃənəl ɜrbz hɪz ˈbrʌðər ænd ˈfæməli wɑnt.
11.	ði ˈɔθər wɪl kriˈeɪt ə drəˈmætɪk ɪˈnʌf ɪksˈʧeɪnʤ fɔr ðə klæs tu kənˈsɪdər.
12.	wɑt ɡreɪt ˈælkəˌhɑl ʃʊd wi sɜrv ðɪs mʌnθ æt ðə mjuˈziəm, ænd haʊ mʌʧ ʃʊd wi ɡɛt?
13.	kæn ˈmɪstər. snoʊ ɡoʊ ˈkæpʧər ə ˈfoʊtəˌɡræf ʌv ðə lɑrʤ sfɪr ænd ɪts ˈɛnərʤi wɛn ðeɪ boʊθ əˈpɪr ˈɪntu vju?
14.	du ðiz pəˈlɪtəkəl ʤɑbz ɪnˈvɑlv nɪˌɡoʊʃiˈeɪʃən ænd rəˈflɛkt ðə ˈplætˌfɔrm wi riˈsɜrʧt?
15.	ˈmeɪbi ðə ˈnɔrðərn kəmˈjunəti wɪl prəˈdus ə ˈɡloʊbəl ˈɛkspərt tu hɛlp dɪˈskraɪb ðɪs kənˈdɪʃən.
16.	ðə ˈmʌnθli ˈʧælənʤ ðæt ðə ˈfrɛʃmən ˈæˌθlit hæd tu pərˈfɔrm ˈnɛvər pʊʃt hɪm tu fəˈtiɡ.
17.	wɛˈnɛvər ðə juˈnaɪtəd ˈɡʌvərmənt ɪz ˈʧælənʤd, du ðə məˈʤɔrəti ʌv ˈpipəl ʃrʌɡ ɪn frəˈstreɪʃən?
18.	aɪ θɪŋk ðɛr ˈtʌʧˌdaʊn meɪ ˈθrɛtən ˈaʊər skul ˈrɛkərd, ænd ˈðɛrˌfɔr ðɛr ˈtraɪəmf ɪz ɪmˈpɔrtənt.
19.	ðə ˈʤɛnərəl ɪnˈʃʊrəns ˈkʌmpəni səmˈtaɪmz hæz seɪ ɪn ðə ˈfɔrmjələ ðeɪ juz fɔr drʌɡ seɪlz.
20.	haʊ ɪn ðə wɜrld ɪz ðɪs ˈhɛlθi ɜrb ɪnˈvɑlvd ɪn riˈsɜrʧ ðæt tɜrnz ɪts dɪˈvɛləpmənt ˈɪntu ə pʊʃ fɔr ˈpɑləsi?
21.	fɔr mʌnθs wi wɑʧt ðə kənˈɡrɛʃənəl məˈʃin ðæt sɜrvz ðɪs ˈkʌntri dɪˈskʌs drʌɡz.
22.	wɑt ɑr ði ɑdz hɪz ˈɡɜrlˌfrɛnd wʊd əˈɡri tu traɪ ɡɑlf æt ðə hoʊˈtɛl ðɪs jɪr?
23.	ə strɔŋ ʤɑb ˈɪntərˌvju ɪkˈspɪriəns ˈækʧuəli briðz laɪf ˈɪntu maɪ ˈmɔrnɪŋ.
24.	ðə ˈtiʧər wɪl dɪˈvɛləp ðə ˈsʌbʤɪkt ʌv ˈlæŋɡwəʤ ˈpætərnz wɪˈðɪn fəˈnænʃəl ˈmænəʤmənt tu ɪnˈklud ɪn ˈaʊər ˈfjuʧər ˈproʊˌɡræm.
25.	hɪz ˈvɛri ˈpʌblɪk rɪˈmɑrk əˈbaʊt ði ɪˈstæblɪʃmənt ænd ðɛr sɜrʧ fɔr tɛkˈnɑləʤi kʊd meɪk hɪm lʊk kənˈsɜrnd.
26.	ˈsɪmpli pʊt ðɪs ˈtritmənt əˈraʊnd jʊər ɑrm, ˈfɔrhɛd, ænd ˈʃoʊldər ænd lɛt ɪt əbˈzɔrb.
27.	ðə ˈsɪŋər ˈjuʒəwəli ˈoʊnli ɡɪvz wʌn smɔl ænd wʌn bɪɡ pərˈfɔrməns ɪn ə ˈsɪŋɡəl ˌæftərˈnun.
28.	aɪ bɪˈliv ðæt ɪn ˈθiəri, ðɪs ˈvɜrʒən ʌv ðə ˌgræʤuˈeɪʃən ˈsɜrvəs kæn ˈɔlˌweɪz kənˈtɪnju.
29.	ən ˈɪnˌkris ɪn ɡəˈrɑʒ dɛpθ wʊd prəˈvaɪd ɪɡˈzæktli ðə raɪt pleɪs tu kip ðə ˈhɑrˌdwɛr.
30.	ðɪs ˌθiəˈrɛtɪkəl ˈkwɛsʧən wʊd ɛnˈɡeɪʤ ðə ˈstudənt ænd tɜrn hɪm əˈweɪ frʌm ə ˈnʌmbər ʌv ˈʌðərz.
31.	hi woʊnt ɪˈskeɪp ˈʤʌʤmənt; ə skrin fɔr ðə ʤəˈnɛtɪk ˈpætərn wɪl ɪˈmɜrʤ ænd bɪˈkʌm ˈliɡəl fækt.
32.	dɪd ju ˈhæpən tu spik tu ðə ɡɜrl æz hɜr ˈbɜrθˌdeɪ əˈproʊʧt, tu ˈæˌdrɛs haʊ ˈdɪfərənt ɪt wɪl bi?
33.	æz ə ˈvjuər, maɪ ˌɑbzərˈveɪʃən ɪz ðæt ðɛr ɪz noʊ ˈmɑrkət lɛft ˈivɪn hir fɔr ə vəˈraɪəti ʃoʊ.
34.	æz ˈprɛzəˌdɛnt ʌv ðə ˈmuvmənt, ɪf hi kæn rɪˈmeɪn ɪn ˈɔfəs ðɪs hoʊl ˈpɪriəd, hi ɡɛts ən ɪɡˈzɛkjətɪv ˈdraɪvər.
35.	ði ˈɪməʤ ɪz ʌv ə ˈwʊmən æt ən eɪʤ ʤʌst æz ʃi faɪndz hɜr oʊn ˌɪndəˈvɪʤəwəl vɔɪs.
36.	wi kæn bɪˈɡɪn tu brið wʌns ðə ˈsɪstəm kæn rəˈsiv ænd ˈɔlsoʊ rɪˈtɜrn ɡʊd ˈwɔtər.
37.	ðə θɪŋ ɪz, ɪf ju ɡɪv hɪm ə ˈmɪnət, ðə ʧaɪld kæn θroʊ ɪn taɪm tu ðə ˈmjuzɪk.
"""
# Remove the "N.<TAB>" enumeration in front of each sentence. The dot is
# escaped so that only a literal full stop matches.
FITI_phonemes = re.sub(r"\d+\.\t", "", FITI_phonemes)
# Line breaks become spaces, so the last word of one sentence can never fuse
# with the first word of the next, whatever punctuation ends the line.
FITI_phonemes = FITI_phonemes.replace("\n", " ")
# Punctuation is not part of the transcription: turn it into word separators.
stop_symbols = ['?',',','.','!','"', '”','“', ";"]
for symb in stop_symbols:
    FITI_phonemes = FITI_phonemes.replace(symb, ' ')
# The coverage table's `target` list uses the ASCII 'g', so map the IPA 'ɡ' onto it.
FITI_phonemes = FITI_phonemes.replace("ɡ","g")

data = [syllabify(phoneme) for phoneme in FITI_phonemes.split()]
FITI_phonemes_df = get_stat(data)
FITI_phonemes_df

Unnamed: 0,initial_mono_v,initial_multi_stressed_v,initial_multi_unstressed_v,initial_mono_c,initial_multi_stressed_c,initial_multi_unstressed_c,final_mono_v,final_multi_stressed_v,final_multi_unstressed_v,final_mono_c,final_multi_stressed_c,final_multi_unstressed_c,medial_mono_v_c,medial_multi_stressed_v_c,medial_multi_unstressed_v_c,medial_mono_c_v,medial_multi_stressed_c_v,medial_multi_unstressed_c_v,medial_multi_stressed_v_v,medial_multi_unstressed_v_v,medial_mono_c_c,medial_multi_stressed_c_c,medial_multi_unstressed_c_c
p,3.0,6.0,4.0,1.0,4.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,4.0,1.0,1.0,3.0,3.0,1,2,1.0,1.0,1.0
b,4.0,3.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,0.0,1.0,1.0,3,3,1.0,1.0,1.0
t,21.0,2.0,2.0,1.0,2.0,2.0,23.0,6.0,3.0,9.0,8.0,15.0,3.0,2.0,1.0,1.0,6.0,8.0,5,17,1.0,4.0,1.0
d,8.0,2.0,7.0,2.0,1.0,1.0,13.0,3.0,3.0,22.0,4.0,4.0,1.0,0.0,2.0,0.0,1.0,5.0,3,3,1.0,1.0,2.0
k,12.0,6.0,8.0,2.0,1.0,1.0,3.0,1.0,6.0,2.0,1.0,1.0,2.0,7.0,9.0,1.0,2.0,2.0,2,5,1.0,3.0,1.0
g,8.0,2.0,1.0,2.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,2.0,3.0,1,2,0.0,1.0,2.0
f,11.0,5.0,3.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1,4,0.0,1.0,2.0
v,1.0,2.0,1.0,1.0,1.0,0.0,12.0,3.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,3.0,1.0,4,5,1.0,1.0,1.0
θ,2.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1,1,1.0,2.0,0.0
ð,72.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,3,0.0,0.0,0.0


In [26]:
((FITI_phonemes_df>0).sum().sum())/466

0.8133047210300429

In [27]:
((FITI_phonemes_df>0).values*iwslt_norm).sum().sum()

0.9972195808353376