In [77]:
import re
import random
import numpy as np

In [2]:
with open('data/rnn-mm-novel.txt', 'r') as handle:
    sentences = [l.replace(' ', '').replace('\n', '').replace('”', '').replace(
        '“', '').replace('\ufeff', '') for l in handle.readlines()]
print(len(sentences))
print(sentences[:10])

1603
['ကိုကို', 'ဘာလဲ', 'ကိုကို့ဟာကြီးခဏပိတ်လိုက်စမ်းပါ', 'ဘာဖြစ်လို့ပိတ်ရမှာလဲ', 'ထွဋ်ဘုန်းဇော်သည်ဗွီဒီယိုကြည့်နေရာမှညီမသရဖီဘုန်းဇော်အားပြန်အော်လိုက်သည်။', 'ခဏလေးပါကိုကိုရာ…တစ်မိနစ်လောက်ပါ', 'အရေးတကြီးဘာဖြစ်နေရတာလဲ', 'ထွဋ်ဘုန်းဇော်သည်တီဗွီကိုပိတ်ပေးလိုက်သည်။သူ့အောက်ရှစ်နှစ်ခန့်ငယ်သောညီမထွေးလေးဖြစ်သဖြင့်သရဖီဘုန်းဇော်၏ဆန္ဒကိုမငြင်းပယ်လိုပေ။', 'ဟော', 'သရဖီဘုန်းဇော်၏မျက်လုံးဝိုင်းလေးများသည်ဝင်းလက်သွားပြီးပြတင်းပေါက်ဆီဘက်သို့ပြေးသွားသည်။']


In [109]:
def generate_errors(sentences=['သရဖီဘုန်းဇော်၏မျက်လုံးဝိုင်းလေးများသည်ဝင်းလက်သွားပြီးပြတင်းပေါက်ဆီဘက်သို့ပြေးသွားသည်။'],
                    error=True, omission=True, repetition=True,
                    misspell_proba=0.25, omission_proba=0.125, repetition_proba=0.125,
                    verbose=False):
    # there are currently 3 types of errors
    # misspelled errors: ပ်  က်  တ်, ခင်ဗျား ခများ
    # error by omission: း, ေကာင်
    # repetition: းးးးးးးး

    result = list()

    if False:
        error_file = './data/spelling-errors/errors.mm.txt'
        misspelled = {}
        with open(error_file, 'r') as handle:
            for line in handle:
                elements = re.split(r'[ ]+', line.replace('\n', ''))
                misspelled[elements[0]] = elements[1:]

    omission_file = './data/spelling-errors/omissions.mm.txt'
    with open(omission_file, 'r') as handle:
        omissions = [l.replace('\n', '') for l in handle.readlines()]

    repetition_file = './data/spelling-errors/repetitions.mm.txt'
    with open(repetition_file, 'r') as handle:
        repetitions = [l.replace('\n', '') for l in handle.readlines()]

    for sentence in sentences:
        print('Original Sentence:')
        print(sentence)
        if error:
            # misspelled
            misspelled = {}
            misspell_match_count = 0
            misspell_noise_count = 0
            for err in list(misspelled.keys()):
                if err in sentence:
                    if verbose:
                        print('Match found:', err)
                    misspell_match_count += 1
                    if random.random() < misspell_proba:
                        misspell_noise_count += 1
                        error_choice = random.randint(
                            0, len(misspelled[err])-1)
                        if verbose:
                            print('Replacing {} with {}'.format(
                                misspelled[err][error_choice], err))
                        sentence = sentence.replace(
                            err, misspelled[err][error_choice])
            print('*'*100)
            print('Misspell matches:', misspell_match_count)
            print('Misspell noise added:', misspell_noise_count)
            if misspell_noise_count > 0:
                print(sentence)
            print('*'*100)

        if omission:
            # omissions
            omission_match_count = 0
            omission_noise_count = 0
            for om in omissions:
                if om in sentence:
                    # if the character is in the sentence,
                    # loop all appearances and remove with probability
                    match_indices = np.where(om == np.array(list(sentence)))[0]
                    omission_match_count += match_indices.shape[0]
                    for indx in match_indices:
                        if random.random() < omission_proba:
                            if verbose:
                                print('Removing:', om)
                            # replace with a space to preserve length
                            sentence = sentence[:indx] + \
                                ' ' + sentence[indx+1:]
                            omission_noise_count += 1
            sentence = sentence.replace(' ', '')
            print('*'*100)
            print('Omission matches:', omission_match_count)
            print('Omission noise added:', omission_noise_count)
            if omission_noise_count > 0:
                print(sentence)
            print('*'*100)

        if repetition:
            # repetitions
            repetition_match_count = 0
            repetition_noise_count = 0
            for ch in repetitions:
                if ch in sentence:
                    # if the character is in the sentence,
                    # loop all appearances and repeat with probability
                    match_indices = np.where(ch == np.array(list(sentence)))[0]
                    repetition_match_count += match_indices.shape[0]
                    for indx in match_indices:
                        if random.random() < repetition_proba:
                            if verbose:
                                print('Repeating:', ch)
                            # add 1 to indices for adding a new character
                            sentence = sentence[:indx] + ch + sentence[indx:]
                            repetition_noise_count += 1
            print('*'*100)
            print('Repetition matches:', repetition_match_count)
            print('Repetition noise added:', repetition_noise_count)
            if repetition_noise_count > 0:
                print(sentence)            
            print('*'*100)
        result.append(sentence)
    return result

In [108]:
x = generate_errors(sentences=sentences[:5],verbose=False)
x

Original Sentence:
ကိုကို
****************************************************************************************************
Misspell matches: 0
Misspell noise added: 0
****************************************************************************************************
****************************************************************************************************
Omission matches: 0
Omission noise added: 0
****************************************************************************************************
****************************************************************************************************
Repetition matches: 4
Repetition noise added: 0
****************************************************************************************************
Original Sentence:
ဘာလဲ
****************************************************************************************************
Misspell matches: 0
Misspell noise added: 0
***************************************************************

['ကိုကို',
 'ဘာလဲ',
 'ကိိုကိုု့ဟာကြီီးးခဏပိတ်လိုက်စမ်းပါ',
 'ဘာဖြစ်လို့ပိတ်ရမှှာလဲ',
 'ထွွဋ်ဘုန််းဇော်သည်ဗွီဒီယိုကြြည်နေရာမှှညီမသရဖီဘုန်းဇောာ်အားပြ်န်အော်လိုက်သည်။']