In [None]:
import pandas as pd
import nltk
import ast
from itertools import islice
from shared import *

In [None]:
# Load the transcript table, run subst_words over every manual JSON transcript with a
# substitution rate of 0.2 (20%), and save the table with two extra columns: the altered
# transcript and the list of tokens that were substituted.
# NOTE: subst_words (and the tr_unigrm_dict it reads) are defined in later cells of this
# notebook, so those cells must have been executed before this one.
df = pd.read_csv('../ASRforAD.csv')

# subst_words returns a 2-tuple; wrapping it in a Series yields two columns labelled 0 and 1.
substitution_columns = df.json_utterances_man.apply(
    lambda raw_transcript: pd.Series(subst_words(raw_transcript, 0.2))
)
df = (
    df.merge(substitution_columns, left_index=True, right_index=True)
      .rename(columns={
          0: 'json_utterances_man_with_SUBSTITUTED_WORDS_20%',
          1: 'SUBSTITUTED_WORDS_20%',
      })
)

df.to_csv('../SUBSTITUTION_ASRforAD.csv')

print(df.head())

In [None]:
# Substitute words in a transcript: every word picked by random_words_list is replaced with
# its entry from tr_unigrm_dict, or with a random unigram when it has no entry.
def subst_words(transcript, rate):
    """Replace a random selection of words in a transcript.

    Args:
        transcript: Raw transcript, handed to ``store_tr``. The parsed result is iterated
            as a sequence of utterances, each exposing a ``'tokens'`` list of dicts that
            carry the word under ``'value'``.
        rate: Passed through to ``random_words_list`` together with the flattened
            transcript (presumably the fraction of words to substitute -- confirm against
            the shared module).

    Returns:
        A tuple ``(json_text, substituted_tokens)``: the modified transcript serialised
        with ``json.dumps`` and the list of token dicts that were substituted. The list
        holds the very same dict objects that are modified in place, so their ``'value'``
        shows the replacement word.
    """
    import warnings  # local import: only needed on the malformed-input path

    transcript = store_tr(transcript)
    words_to_sub = random_words_list(flatten(transcript), rate)
    substituted_words = []
    try:
        # Targets are consumed strictly in list order; a full pass over the transcript may
        # be needed per target because the selection order need not follow token order.
        while words_to_sub:
            made_progress = False
            for sublist in transcript:
                for element in sublist['tokens']:
                    if not words_to_sub:
                        break
                    target = words_to_sub[0]
                    if target != element['value']:
                        continue
                    substituted_words.append(element)
                    if target in tr_unigrm_dict:
                        # Entries are [closest_word, distance]; the key is the word itself.
                        element['value'] = tr_unigrm_dict[target][0]
                    else:
                        # No precomputed neighbour: fall back to a random unigram.
                        element['value'] = random.choice(one_gram_list)[1]
                    words_to_sub.pop(0)
                    made_progress = True
                if not words_to_sub:
                    break
            if not made_progress:
                # The current target matches no token; stop instead of looping forever.
                break
    except (KeyError, TypeError) as exc:
        # Best effort on malformed token structures: keep what was substituted so far,
        # but report the problem instead of hiding it.
        warnings.warn(
            "subst_words stopped early on a malformed transcript: %r" % (exc,),
            RuntimeWarning,
        )
    return json.dumps(transcript), substituted_words

# Phonemic Similarity
The approach is adapted from this Stack Overflow question: <https://stackoverflow.com/questions/26474847/estimate-phonemic-similarity-between-two-words>
The pronunciation file (`c06d`) loaded below can be viewed here: <http://www.greenteapress.com/thinkpython/code/c06d?fbclid=IwAR3kK8u0l48ksaGi8v60FZLDsSjpdjhw3dCCeZdRDS0VkBhgeR5YyzSUTuI>

In [None]:
# Create the phonemic dictionary
def create_cmu_sound_dict(path='../c06d'):
    """Build a word -> pronunciation lookup from a pronunciation dictionary file.

    Each line with at least two whitespace-separated fields contributes one entry: the
    first field, lower-cased, is the key and the remaining fields joined by single spaces
    are the value. Lines with fewer than two fields (e.g. blank lines) are skipped.

    Args:
        path: Location of the dictionary file; defaults to the original '../c06d'.

    Returns:
        dict mapping lower-cased word to its space-joined pronunciation string. When a
        key occurs more than once, the last occurrence wins.
    """
    cmu_final_sound_dict = {}
    with open(path) as cmu_file:
        for line in cmu_file:
            fields = line.split()
            # Store only inside the guard: a short line must neither crash (no previous
            # entry yet) nor re-store the previous entry.
            if len(fields) > 1:
                cmu_final_sound_dict[fields[0].lower()] = " ".join(fields[1:])
    return cmu_final_sound_dict

# Build the word -> pronunciation lookup once; create_tr_unigram_dict reads it as a global.
phonemic_model = create_cmu_sound_dict()

# Transcript Dictionary
Create a dictionary that maps every unique word in the transcripts to the candidate word with the smallest Levenshtein distance between their phonemic transcriptions, together with that distance.

In [None]:
# Read the unigrams from all the transcripts into a list of unique words
def create_tr_unigrm(path='../tr_unigrams.txt'):
    """Load the transcript unigrams and return them de-duplicated in first-seen order.

    The file is read as UTF-8 (a leading BOM is dropped by 'utf-8-sig'), double quotes and
    the escaped newline/tab markers matched by the replace calls below are stripped, and
    the remaining text is split on ", ".

    Args:
        path: Location of the unigram file; defaults to the original '../tr_unigrams.txt'.

    Returns:
        list of unique unigram strings, in order of first appearance.
    """
    with open(path, encoding='utf-8-sig') as unigram_file:
        raw_text = unigram_file.read()
    cleaned = raw_text.replace('"','').replace('\\\\n', '').replace("\\\\t", "")
    # dict keys keep insertion order, so this de-duplicates in linear time while
    # preserving the order a list-membership loop would produce.
    return list(dict.fromkeys(cleaned.split(", ")))

# Unique transcript unigrams; create_tr_unigram_dict iterates over this global list.
tr_unigrm = create_tr_unigrm()

In [None]:
# Create a dictionary for all the unique transcript unigrams. Each entry holds the word with
# the minimum Levenshtein distance (on pronunciations) and the distance itself.
def create_tr_unigram_dict():
    """Map every transcript unigram to its phonemically closest replacement word.

    Reads the globals ``tr_unigrm``, ``phonemic_model`` and ``reduced_one_gram`` (the
    latter from the shared module; its items are indexed with ``[1]`` to get the word).

    Returns:
        dict mapping each unigram to ``[replacement_word, distance]``:
        * unigram has no pronunciation in ``phonemic_model``: a random word from
          ``reduced_one_gram`` and distance -1;
        * otherwise: the first word of ``reduced_one_gram`` (other than the unigram
          itself) whose pronunciation has the smallest edit distance, and that distance.
          The distance is computed character-wise on the space-joined pronunciation
          strings. If no candidate has a pronunciation, the random/-1 entry is used.
    """
    # Loop-invariant: candidates that have a pronunciation, in their original order.
    candidates = [
        (entry[1], phonemic_model[entry[1]])
        for entry in reduced_one_gram
        if entry[1] in phonemic_model
    ]

    tr_unigrm_dict = {}
    for word in tr_unigrm:
        if word not in phonemic_model:
            # No pronunciation available: random replacement, distance flagged as -1.
            tr_unigrm_dict[word] = [random.choice(reduced_one_gram)[1], -1]
            continue

        word_phonemes = phonemic_model[word]
        best_word = None
        best_dist = None
        for candidate, candidate_phonemes in candidates:
            if candidate == word:
                continue
            dist = nltk.edit_distance(word_phonemes, candidate_phonemes, transpositions = False)
            # Strict '<' keeps the first candidate among ties.
            if best_dist is None or dist < best_dist:
                best_word, best_dist = candidate, dist

        if best_dist is None:
            # No usable candidate at all: same fallback as for unknown words.
            tr_unigrm_dict[word] = [random.choice(reduced_one_gram)[1], -1]
        else:
            tr_unigrm_dict[word] = [best_word, best_dist]
    return tr_unigrm_dict

# Global lookup read by subst_words; this cell must run before subst_words is called.
tr_unigrm_dict = create_tr_unigram_dict()