In [1]:
import json
import os
import re
import random
from tqdm import tqdm
import itertools

import pandas as pd
import numpy as np

from g2pk import G2p
from levenshtein_finder import LevenshteinFinder , CharacterTokenizer , Normalizers

In [3]:
# Load the raw corpus CSV; later cells only use its 'tgt' column.
df = pd.read_csv(
    filepath_or_buffer='../../data/raw/aihub_2022_colloquial_enko.csv',
)

In [6]:
# Keep only the 'tgt' column and persist it as a separate CSV.
# NOTE: to_csv keeps its default index=True here, so the row index is written
# as an unnamed first column of the output file.
df = df.loc[:, ['tgt']]
df.to_csv(path_or_buf='../../data/raw/colloquial_ko.csv')

In [2]:
# Seed both random number generators (NumPy's legacy global state and the
# stdlib `random` module) so the randomly generated noise is reproducible.
np.random.seed(826)
random.seed(825)

In [3]:
def clean_korean(df: pd.DataFrame):
    """Keep only the rows whose ``tgt`` text is purely Korean.

    A row is kept when its ``tgt`` value is a string consisting solely of
    characters in the ``ㄱ-힣`` range, periods and whitespace (an empty string
    therefore passes). Rows whose ``tgt`` is not a string (e.g. NaN) are
    dropped instead of raising.

    Args:
        df: Frame with a ``tgt`` column. Its index may be anything; rows are
            visited positionally, not by label.

    Returns:
        A new DataFrame with the surviving rows (all columns preserved) and a
        fresh 0..n-1 index. ``df`` itself is not modified.
    """
    # Inside a character class `|` is a literal pipe, not alternation, so it
    # must not be listed -- otherwise text containing '|' would be accepted.
    disallowed = re.compile(r'[^ㄱ-힣.\s]')

    # Iterate over the values directly: label lookups such as df['tgt'][i]
    # fail (or pick the wrong row) whenever the index is not 0..n-1.
    keep = [
        isinstance(text, str) and disallowed.search(text) is None
        for text in df['tgt']
    ]
    only_ko = df.loc[np.array(keep, dtype=bool)]
    return only_ko.reset_index(drop=True)

In [4]:
# Code-point bounds of the precomposed Hangul syllables: U+AC00 .. U+D7A3.
kor_begin, kor_end = 44032, 55203

# Code-point stride between consecutive initial consonants (21 vowels * 28
# finals) and between consecutive vowels (28 finals).
chosung_base, jungsung_base = 588, 28

# Code-point bounds of the standalone consonant jamo: U+3131 .. U+314E.
jaum_begin, jaum_end = 12593, 12622

# Code-point bounds of the standalone vowel jamo: U+314F .. U+3163.
moum_begin, moum_end = 12623, 12643

# Initial consonants, in syllable-block order.
chosung_list = list('ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')

# Medial vowels, in syllable-block order, followed by the ' ' placeholder
# that decompose() uses when a character has no vowel.
jungsung_list = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ') + [' ']

# Final consonants, in syllable-block order; index 0 (' ') means "no final".
jongsung_list = [' '] + list('ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')

# Every standalone consonant jamo, in code-point order.
jaum_list = list('ㄱㄲㄳㄴㄵㄶㄷㄸㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅃㅄㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')

# Every standalone vowel jamo, in code-point order.
moum_list = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')

def compose(chosung, jungsung, jongsung):
    """Combine an initial, medial and final jamo into one Hangul character.

    Args:
        chosung: Initial consonant, an element of ``chosung_list``.
        jungsung: Medial vowel, an element of ``jungsung_list``.
        jongsung: Final consonant, an element of ``jongsung_list``; ``' '``
            means the syllable has no final consonant.

    Returns:
        The single precomposed syllable. The placeholder triples that
        ``decompose`` produces for a lone jamo -- ``(consonant, ' ', ' ')``
        and ``(' ', vowel, ' ')`` -- are mapped back to that lone jamo, so
        ``compose(*decompose(c)) == c`` holds for every Korean character.

    Raises:
        ValueError: If a jamo is not in its table, or if the vowel slot holds
            the ``' '`` placeholder while a final consonant is given.
    """
    # Round-trip the lone-jamo triples emitted by decompose().
    if jongsung == ' ':
        if jungsung == ' ' and chosung in jaum_list:
            return chosung
        if chosung == ' ' and jungsung in moum_list:
            return jungsung

    # jungsung_list ends with the ' ' placeholder (index 21). Feeding that
    # index into the arithmetic below would land in the next initial's row
    # (or past U+D7A3), i.e. silently return a wrong character.
    if jungsung == ' ':
        raise ValueError(
            'cannot compose a syllable without a vowel: '
            f'({chosung!r}, {jungsung!r}, {jongsung!r})'
        )

    return chr(
        kor_begin
        + chosung_base * chosung_list.index(chosung)
        + jungsung_base * jungsung_list.index(jungsung)
        + jongsung_list.index(jongsung)
    )

def decompose(c):
    """Split one Korean character into (chosung, jungsung, jongsung).

    Args:
        c: A single character.

    Returns:
        ``None`` when ``character_is_korean(c)`` is false. For a standalone
        consonant ``(c, ' ', ' ')``; for a standalone vowel ``(' ', c, ' ')``;
        for a precomposed syllable the three jamo looked up in
        ``chosung_list``, ``jungsung_list`` and ``jongsung_list`` (the final
        is ``' '`` when the syllable has none).
    """
    if not character_is_korean(c):
        return None

    code_point = ord(c)

    # Standalone jamo are reported with ' ' in the slots they do not fill.
    if jaum_begin <= code_point <= jaum_end:
        return (c, ' ', ' ')
    if moum_begin <= code_point <= moum_end:
        return (' ', c, ' ')

    # Precomposed syllable: peel the three table indices off the offset
    # from the first syllable.
    cho, remainder = divmod(code_point - kor_begin, chosung_base)
    jung, jong = divmod(remainder, jungsung_base)
    return (chosung_list[cho], jungsung_list[jung], jongsung_list[jong])

def character_is_korean(c):
    """Return True when ``c`` is a Hangul syllable or a standalone jamo.

    The character's code point is tested against the syllable range
    (``kor_begin``..``kor_end``), the consonant range
    (``jaum_begin``..``jaum_end``) and the vowel range
    (``moum_begin``..``moum_end``), all bounds inclusive.
    """
    code_point = ord(c)
    korean_ranges = (
        (kor_begin, kor_end),
        (jaum_begin, jaum_end),
        (moum_begin, moum_end),
    )
    return any(low <= code_point <= high for low, high in korean_ranges)

In [6]:
def jamo_error_data(target_list:list):
    """Inject random jamo-level typos into Korean sentences.

    For every sentence, up to three whitespace-separated words are picked at
    random. In each picked word all characters outside ``ㄱ-힣`` are removed,
    one complete Hangul syllable is chosen, and exactly one of its jamo
    (initial, medial or final) is replaced by a random jamo of the same kind.
    The replacement may coincide with the original jamo. Picked words that
    contain no complete syllable are left untouched.

    Note: strings shorter than three characters are skipped entirely and do
    NOT appear in the output, so the result can be shorter than
    ``target_list``. This notebook defines ``jamo_error_data`` again in a
    later cell; once that cell runs, its definition replaces this one.

    Args:
        target_list: Iterable of sentence strings.

    Returns:
        List of perturbed sentences, words re-joined with single spaces.
    """
    result = []
    for sentence in target_list:
        if len(sentence) < 3:
            # Too short to perturb; dropped from the output (see docstring).
            continue

        # Split on whitespace.
        split_t = sentence.split()

        # Number of words to perturb: 1-3, reduced by one when the sentence
        # has no more words than that.
        sample_num = random.randrange(1, 4)
        if len(split_t) <= sample_num:
            sample_num -= 1
        # The reduced count can still exceed the word count (one word, draw
        # of 3), which would make random.sample raise.
        sample_num = min(sample_num, len(split_t))
        rand_t = random.sample(range(len(split_t)), sample_num)

        for n in rand_t:
            # Remove every character outside the Korean range.
            replace_text = re.sub('[^ㄱ-힣]', '', split_t[n])

            # Only complete syllables can be re-composed after a jamo swap;
            # lone jamo and other characters in the range are not candidates.
            syllable_positions = [
                pos for pos, ch in enumerate(replace_text)
                if kor_begin <= ord(ch) <= kor_end
            ]
            if not syllable_positions:
                continue
            t = random.choice(syllable_positions)

            # Split the syllable into its jamo.
            decompose_text = list(decompose(replace_text[t]))

            # Pick the slot by position: 0 initial, 1 medial, 2 final.
            c = random.randrange(3)
            if c == 0:
                decompose_text[0] = random.choice(chosung_list)
            elif c == 1:
                # jungsung_list carries a ' ' placeholder that is not a real
                # vowel; drawing it would compose a wrong code point.
                decompose_text[1] = random.choice(
                    [vowel for vowel in jungsung_list if vowel != ' ']
                )
            else:
                decompose_text[2] = random.choice(jongsung_list)

            text_l = list(replace_text)
            text_l[t] = compose(*decompose_text)
            split_t[n] = ''.join(text_l)

        result.append(' '.join(split_t))
    return result

In [None]:
def jamo_error_data(target_list: list):
    """Inject random jamo-level typos into Korean sentences.

    For every sentence, up to three whitespace-separated words are picked at
    random. In each picked word all characters outside ``ㄱ-힣`` are removed,
    one complete Hangul syllable is chosen, and exactly one of its jamo
    (initial, medial or final) is replaced by a random jamo of the same kind.
    The replacement may coincide with the original jamo. Picked words that
    contain no complete syllable are left untouched.

    Note: this definition replaces the ``jamo_error_data`` defined in an
    earlier cell of this notebook. Unlike that one, it returns exactly one
    output string per input string.

    Args:
        target_list: Iterable of sentence strings.

    Returns:
        List of perturbed sentences, words re-joined with single spaces.
    """
    result = []
    for sentence in target_list:
        # Split on whitespace.
        split_t = sentence.split()

        # Number of words to perturb: 1-3, reduced by one when the sentence
        # has no more words than that.
        sample_num = random.randrange(1, 4)
        if len(split_t) <= sample_num:
            sample_num -= 1
        # The reduced count can still exceed the word count (empty sentence,
        # or one word with a draw of 3), which would make random.sample raise.
        sample_num = min(sample_num, len(split_t))
        rand_t = random.sample(range(len(split_t)), sample_num)

        for n in rand_t:
            # Remove every character outside the Korean range.
            replace_text = re.sub('[^ㄱ-힣]', '', split_t[n])

            # Only complete syllables can be re-composed after a jamo swap;
            # lone jamo and other characters in the range are not candidates.
            syllable_positions = [
                pos for pos, ch in enumerate(replace_text)
                if kor_begin <= ord(ch) <= kor_end
            ]
            if not syllable_positions:
                continue
            t = random.choice(syllable_positions)

            # Split the syllable into its jamo.
            decompose_text = list(decompose(replace_text[t]))

            # Pick the slot by position: 0 initial, 1 medial, 2 final.
            c = random.randrange(3)
            if c == 0:
                decompose_text[0] = random.choice(chosung_list)
            elif c == 1:
                # jungsung_list carries a ' ' placeholder that is not a real
                # vowel; drawing it would compose a wrong code point.
                decompose_text[1] = random.choice(
                    [vowel for vowel in jungsung_list if vowel != ' ']
                )
            else:
                decompose_text[2] = random.choice(jongsung_list)

            text_l = list(replace_text)
            text_l[t] = compose(*decompose_text)
            split_t[n] = ''.join(text_l)

        result.append(' '.join(split_t))
    return result