In [None]:
import json
import random
from heapq import nlargest
from itertools import islice

import pandas as pd

from shared import *          #shared functions from shared.py

# Bi-gram dictionary

In [None]:
def read_bi_gram(path='../lm_unpruned', start=57292, stop=1380576):
    """Read bi-gram entries from a space-separated language-model file.

    Every kept entry is a three-element list ``[score, word, following_word]``:
    the first field of the line converted to ``float``, followed by the second
    and third fields. Any further fields on the line are ignored.

    Entries are dropped when either word is ``'<unk>'`` or ``'</s>'`` or
    contains an underscore. Lines with fewer than three space-separated
    fields (blank lines, section headers) are skipped instead of raising
    ``IndexError``.

    Parameters
    ----------
    path : str
        File to read. Defaults to the location the notebook has always used.
    start, stop : int or None
        Zero-based line window passed to ``itertools.islice``; ``stop`` is
        exclusive and may be ``None`` to read to the end of the file. The
        defaults are the offsets the original code used to select the
        bi-gram section of ``'../lm_unpruned'``.

    Returns
    -------
    list[list]
        ``[score, word, following_word]`` entries in file order.

    Raises
    ------
    ValueError
        If the first field of a kept line cannot be converted to ``float``.
    OSError
        If ``path`` cannot be opened.
    """
    excluded_tokens = ('<unk>', '</s>')

    def is_usable(token):
        # Special markers and underscore-joined tokens are never kept.
        return token not in excluded_tokens and '_' not in token

    bi_gram = []
    with open(path, 'r') as f:
        for line in islice(f, start, stop):
            # Split once per line; the trailing newline would otherwise stick
            # to the last field.
            fields = line.rstrip('\n').split(' ')
            if len(fields) < 3:
                continue  # not a "<score> <word> <following word>" line
            score, word, following_word = fields[0], fields[1], fields[2]
            if is_usable(word) and is_usable(following_word):
                bi_gram.append([float(score), word, following_word])
    return bi_gram

# Load the bi-gram entries when this cell runs; the list is handed to
# create_bi_gram_dict in the next cell.
bi_gram_list = read_bi_gram()

In [None]:
def create_bi_gram_dict(bi_gram_list):
    """Group bi-gram entries by their leading word.

    Each entry is indexed as ``entry[0]`` (score), ``entry[1]`` (word) and
    ``entry[2]`` (following word). The result maps every word to a list of
    ``[score, following_word]`` pairs, kept in the order the entries appear.
    """
    grouped = {}
    for entry in bi_gram_list:
        # setdefault creates the bucket the first time a word is seen.
        grouped.setdefault(entry[1], []).append([entry[0], entry[2]])
    return grouped

# Module-level lookup table; insert_words reads bi_gram_dict as a global.
bi_gram_dict = create_bi_gram_dict(bi_gram_list)

# Insertion functions

In [None]:
# Temporary 'type' tags used while inserting. Tokens carrying one of these are
# never used as a reference for a further insertion.
_ALTERED_TYPES = ('REF', 'INS', 'INS_SEC', 'RND')


def _choose_insertion(value, next_value):
    """Pick the word to insert after ``value``; return ``(tag, word)``.

    ``next_value`` is the value of the token currently following the
    reference token, or ``None`` when the reference token is last in its
    utterance.
    """
    candidates = bi_gram_dict.get(value.lower())
    if not candidates:
        # Word unknown to the bi-gram table: fall back to a random uni-gram.
        return 'RND', random.choice(one_gram_list)[1]
    if len(candidates) == 1:
        return 'INS', candidates[0][1]
    # Entries are [score, word] lists, so nlargest orders them by score first
    # and by word on ties.
    first_max, second_max = nlargest(2, candidates)
    if next_value is not None and first_max[1] == next_value.lower():
        # The best follower is already the next word in the transcript;
        # use the runner-up so the insertion is a genuine change.
        return 'INS_SEC', second_max[1]
    return 'INS', first_max[1]


def insert_words(transcript, rate):
    """Insert extra words into a JSON transcript and report what was added.

    ``transcript`` is a JSON string that decodes to a list of utterances, each
    a dict with a ``'tokens'`` list of ``{'type': ..., 'value': ...}`` dicts.
    The reference words are chosen by
    ``random_words_list(flatten(transcript), rate)``; both helpers are defined
    outside this cell (presumably in shared.py -- verify). After each chosen
    word a new token is inserted:

    * the highest-scoring follower from ``bi_gram_dict``, or
    * the second highest when the best follower already is the next token, or
    * a random entry from ``one_gram_list`` when the word has no bi-gram entry.

    A token is used as a reference at most once, and inserted tokens are never
    used as references. Reference words are matched case-sensitively against
    token values; the bi-gram lookup uses the lower-cased value.

    Changes from the earlier implementation:

    * The insertion position is the token's actual position, not
      ``list.index``, which compared dicts by equality and could place the
      word after an earlier identical token.
    * A chosen word with no eligible token left is dropped instead of
      spinning forever.
    * Errors are no longer swallowed by a bare ``except``; malformed tokens
      now raise (e.g. ``KeyError``) instead of silently truncating the result.

    Parameters
    ----------
    transcript : str
        JSON-encoded transcript.
    rate : float
        Passed through to ``random_words_list``.

    Returns
    -------
    tuple[str, list[dict]]
        The JSON-encoded altered transcript and the inserted words as
        ``{'type': 'word', 'value': ...}`` dicts in insertion order. Every
        reference and inserted token ends up with ``'type': 'word'``.
    """
    transcript = json.loads(transcript)
    to_insert = random_words_list(flatten(transcript), rate)
    inserted_words = []

    # Words are consumed strictly from the front of to_insert. One pass walks
    # the whole transcript; further passes pick up words whose match lies
    # before the point where they reached the front.
    while to_insert:
        placed_any = False
        for sublist in transcript:
            tokens = sublist['tokens']
            position = 0
            while to_insert and position < len(tokens):
                element = tokens[position]
                position += 1  # now the slot right after `element`
                if element['type'] in _ALTERED_TYPES or to_insert[0] != element['value']:
                    continue
                del to_insert[0]
                next_value = tokens[position]['value'] if position < len(tokens) else None
                tag, new_word = _choose_insertion(element['value'], next_value)
                tokens.insert(position, {'type': tag, 'value': new_word})
                element['type'] = 'REF'  # never reuse this token as a reference
                inserted_words.append({'type': 'word', 'value': new_word})
                placed_any = True
                position += 1  # step over the token just inserted
            if not to_insert:
                break
        if to_insert and not placed_any:
            # A full pass with the same head word found no eligible token, and
            # the eligible set only shrinks, so this word can never be placed.
            del to_insert[0]

    # Restore the temporary tags to the regular token type.
    for sublist in transcript:
        for element in sublist['tokens']:
            if element['type'] in _ALTERED_TYPES:
                element['type'] = 'word'
    return json.dumps(transcript), inserted_words


In [None]:
# Load the source table; the manual transcripts are in `json_utterances_man`.
df = pd.read_csv('../ASRforAD.csv')

# Run insert_words at a 0.2 rate on every manual transcript. Each call returns a
# (transcript, inserted_words) pair that pd.Series spreads into columns 0 and 1;
# these are merged back on the row index and given descriptive names.
df = (
    df.merge(
        df.json_utterances_man.apply(lambda s: pd.Series(insert_words(s, 0.2))),
        left_index=True,
        right_index=True,
    )
    .rename(columns={
        0: 'json_utterances_man_with_INSERTED_WORDS_20%',
        1: 'INSERTED_WORDS_20%',
    })
)

# Write the table, including the altered transcript and inserted-words columns.
df.to_csv('../INSERTION_ASRforAD.csv')

df.head()