In [1]:
import json
import nltk
import pandas as pd
from itertools import islice
from shared import *            #shared functions from shared.py

# Phonemic Similarity
The approach is based on this Stack Overflow question: <https://stackoverflow.com/questions/26474847/estimate-phonemic-similarity-between-two-words>

The CMU dictionary file (`c06d`) loaded below can be viewed in a web browser here: <http://www.greenteapress.com/thinkpython/code/c06d?fbclid=IwAR3kK8u0l48ksaGi8v60FZLDsSjpdjhw3dCCeZdRDS0VkBhgeR5YyzSUTuI>

In [2]:
def create_cmu_sound_dict():
    """Build a word -> pronunciation lookup from the CMU dictionary file.

    Reads ``../c06d`` (relative to the current working directory). Every line
    with at least two whitespace-separated tokens is treated as an entry: the
    first token is the word, the remaining tokens are its phonemes. Lines
    with fewer than two tokens (blank lines, lone tokens) are skipped.

    Returns:
        dict: lower-cased word -> phonemes joined by single spaces. When a
        lower-cased word occurs on several lines, the last line wins.
    """
    cmu_final_sound_dict = {}
    with open('../c06d') as cmu_file:
        for line in cmu_file:
            tokens = line.split()
            # Store inside the guard only: a short line carries no entry, and
            # storing unconditionally fails with UnboundLocalError when the
            # very first line of the file is blank or a single token.
            if len(tokens) > 1:
                word, phonemes = tokens[0], tokens[1:]
                cmu_final_sound_dict[word.lower()] = " ".join(phonemes)
    return cmu_final_sound_dict

# Build the word -> phoneme-string lookup once at module level;
# create_tr_unigram_dict() reads it as a global.
phonemic_model = create_cmu_sound_dict()

# Transcript Dictionary
Create a dictionary that maps every word in the transcripts to the word with the smallest Levenshtein distance to it (measured on the phonemic transcriptions), together with that distance.

In [3]:
def create_tr_unigrm():
    """Read the transcript unigram file and return its distinct words.

    Reads ``../tr_unigrams.txt`` (UTF-8, a leading BOM is dropped), removes
    every double-quote character and the two literal backslash sequences
    matched by the ``replace`` calls below, then splits the text on ", ".

    Returns:
        list: the unique words, in order of first appearance.
    """
    with open('../tr_unigrams.txt', encoding='utf-8-sig') as unigram_file:
        raw_text = unigram_file.read()
    cleaned = raw_text.replace('"','').replace('\\\\n', '').replace("\\\\t", "")
    # dict.fromkeys de-duplicates in a single pass and keeps first-seen order,
    # replacing the quadratic "not in list" scan with the same result.
    return list(dict.fromkeys(cleaned.split(", ")))

# Unique transcript words, kept at module level; create_tr_unigram_dict()
# iterates over this list as a global.
tr_unigrm = create_tr_unigrm()

In [4]:
def create_tr_unigram_dict():
    """Map every transcript unigram to its phonemically closest substitute.

    Works on the module-level ``tr_unigrm`` and ``phonemic_model``, plus
    ``reduced_one_gram`` and ``random``, which are expected to be in scope
    through ``from shared import *`` (verify against shared.py). The
    candidate word is element ``[1]`` of each ``reduced_one_gram`` entry.

    For each word in ``tr_unigrm``:

    * If the word has no entry in ``phonemic_model``, the value is a random
      candidate from ``reduced_one_gram`` with distance ``-1``.
    * Otherwise the value is the candidate (different from the word itself
      and present in ``phonemic_model``) with the smallest Levenshtein
      distance, without transpositions. Ties go to the candidate that
      appears first in ``reduced_one_gram``.
    * If no candidate qualifies, the random fallback with distance ``-1`` is
      used instead of failing on ``min()`` of an empty list.

    Note: ``phonemic_model`` values are space-joined phoneme strings, so the
    distance is counted per character of those strings, not per phoneme.

    Returns:
        dict: word -> ``[substitute_word, distance]``.
    """
    # Loop-invariant work hoisted out of the per-word loop: which candidates
    # have a pronunciation, and what that pronunciation is.
    candidates = [
        (entry[1], phonemic_model[entry[1]])
        for entry in reduced_one_gram
        if entry[1] in phonemic_model
    ]

    tr_unigrm_dict = {}
    for word in tr_unigrm:
        best_word, best_dist = None, None
        if word in phonemic_model:
            word_phonemes = phonemic_model[word]
            for candidate, candidate_phonemes in candidates:
                if candidate == word:
                    continue  # a word must not be its own substitute
                dist = nltk.edit_distance(word_phonemes, candidate_phonemes, transpositions = False)
                # Strict "<" keeps the first candidate on ties, matching
                # list.index(min(...)) semantics.
                if best_dist is None or dist < best_dist:
                    best_word, best_dist = candidate, dist

        if best_dist is None:
            # Unknown pronunciation or no usable candidate: random word, distance -1.
            tr_unigrm_dict[word] = [random.choice(reduced_one_gram)[1], -1]
        else:
            tr_unigrm_dict[word] = [best_word, best_dist]
    return tr_unigrm_dict

# Substitution lookup used by subst_words(). Building it runs one edit-distance
# computation per (transcript word, candidate word) pair, so this cell can be slow.
tr_unigrm_dict = create_tr_unigram_dict()

# Substitution Functions

In [96]:
def subst_words(transcript, rate):
    """Substitute a selection of the words in a JSON transcript.

    Args:
        transcript: JSON string that decodes to a list of utterances, each a
            dict with a ``'tokens'`` list of dicts carrying ``'type'`` and
            ``'value'``.
        rate: passed unchanged to ``random_words_list`` (from ``shared``),
            which picks the words to replace. Presumably the fraction of
            words to substitute; confirm in shared.py.

    Returns:
        tuple: ``(altered_transcript_json, substituted_words)``.
        ``substituted_words`` holds the token dicts that were altered. They
        are the same objects that get edited in place, so each entry shows
        the replacement value with type ``'word'``, not the original word.
        NOTE(review): confirm this is the intended content of the list.

    Words found in ``tr_unigrm_dict`` are replaced by their stored substitute;
    all others get a random word from ``one_gram_list``. A selected word that
    no longer matches any unaltered token is skipped, so the loop always
    terminates. Errors are no longer silently swallowed: they propagate to
    the caller.
    """
    altered_types = ('SUB', 'RND')  # temporary markers, reset to 'word' below

    transcript = json.loads(transcript)
    words_to_sub = random_words_list(flatten(transcript), rate)
    substituted_words = []

    head = 0  # index of the next selected word still waiting for a match
    while head < len(words_to_sub):
        matched_in_pass = False
        for sublist in transcript:
            for element in sublist['tokens']:
                if head >= len(words_to_sub):
                    break
                # Never touch a token that was already substituted.
                if element['type'] in altered_types:
                    continue
                target = words_to_sub[head]
                if element.get('value') != target:
                    continue

                substituted_words.append(element)
                if target in tr_unigrm_dict:
                    element['value'] = tr_unigrm_dict[target][0]
                    element['type'] = 'SUB'
                else:
                    # substitute with a random word from the unigram list
                    element['value'] = random.choice(one_gram_list)[1]
                    element['type'] = 'RND'
                head += 1
                matched_in_pass = True
            if head >= len(words_to_sub):
                break

        if not matched_in_pass:
            # A full scan found no unaltered token carrying this word; skip it
            # rather than rescanning the transcript forever.
            head += 1

    # Restore the regular token type so the markers do not leak into the output.
    for sublist in transcript:
        for element in sublist['tokens']:
            if element['type'] in altered_types:
                element['type'] = 'word'

    return json.dumps(transcript), substituted_words

In [97]:
# Load the source table; the json_utterances_man column holds the manual transcripts as JSON strings.
df = pd.read_csv('../ASRforAD.csv')

# Run the substitution on every manual transcript at a rate of 0.2. Each call
# returns (altered transcript JSON, list of substituted tokens), which pd.Series
# spreads over two result columns labelled 0 and 1.
substitution_results = df.json_utterances_man.apply(lambda s: pd.Series(subst_words(s, 0.2)))

# Attach both result columns to the original rows by index and give them descriptive names.
df = df.merge(substitution_results, left_index=True, right_index=True)
df = df.rename(columns = {0:'json_utterances_man_with_SUBSTITUTED_WORDS_20%', 1:'SUBSTITUTED_WORDS_20%'})

# Write the table, now including the altered transcript and the substituted-word list, to CSV.
df.to_csv('../SUBSTITUTION_ASRforAD.csv')



Unnamed: 0,report_id,diarisation_id,json_utterances_man,json_utterances_asr,transcript_manual,json_utterances_man_with_SUBSTITUTED_WORDS_20%,SUBSTITUTED_WORDS_20%
0,1084,1080,"[{""tokens"": [{""type"": ""word"", ""value"": ""thank""...",,thank you xxx xxx boston legal at sunday night...,"[{""tokens"": [{""type"": ""word"", ""value"": ""thank""...","[{'type': 'word', 'value': 'my'}, {'type': 'wo..."
1,1049,1045,"[{""tokens"": [{""type"": ""word"", ""value"": ""yes""},...",,yes i read moby dick while i was making the mo...,"[{""tokens"": [{""type"": ""word"", ""value"": ""yes""},...","[{'type': 'word', 'value': 'no'}, {'type': 'wo..."
2,1103,1099,"[{""tokens"": [{""type"": ""word"", ""value"": ""I'm""},...","[{""tokens"": [{""confidence"": 1.0, ""type"": ""unfi...",i'm b smith and i suffer from alzheimer diseas...,"[{""tokens"": [{""type"": ""word"", ""value"": ""ponies...","[{'type': 'word', 'value': 'so'}, {'type': 'wo..."
3,400,396,"[{""tokens"": [{""type"": ""word"", ""value"": ""we're""...","[{""tokens"": [{""confidence"": 1.0, ""type"": ""unfi...",we're going to get to an ophthalmologist real ...,"[{""tokens"": [{""type"": ""word"", ""value"": ""docks""...","[{'type': 'word', 'value': 'there'}, {'type': ..."
4,1015,1011,"[{""tokens"": [{""type"": ""word"", ""value"": ""so""}, ...","[{""tokens"": [{""confidence"": 1.0, ""type"": ""unfi...",,"[{""tokens"": [{""type"": ""word"", ""value"": ""no""}, ...","[{'type': 'word', 'value': 'wanting'}, {'type'..."
