In [2]:
import pandas as pd
import torch
from datasets import load_dataset
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re
import itertools

# Arithmetic constants for decomposing precomposed Hangul syllables:
#   BASE_CODE - code point of the first syllable '가' (U+AC00)
#   CHOSUNG   - number of syllables sharing one initial consonant (21 medials * 28 finals)
#   JUNGSUNG  - number of syllables sharing one initial+medial pair (28 finals)
BASE_CODE, CHOSUNG, JUNGSUNG = 44032, 588, 28
# Initial consonants, indexed by (code - BASE_CODE) // CHOSUNG.
CHOSUNG_l = ['ㄱ','ㄲ','ㄴ','ㄷ','ㄸ','ㄹ','ㅁ','ㅂ','ㅃ','ㅅ','ㅆ','ㅇ','ㅈ','ㅉ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
# Medial vowels, indexed by the remainder divided by JUNGSUNG.
JUNGSUNG_l = ['ㅏ','ㅐ','ㅑ','ㅒ','ㅓ','ㅔ','ㅕ','ㅖ','ㅗ','ㅘ','ㅙ','ㅚ','ㅛ','ㅜ','ㅝ','ㅞ','ㅟ','ㅠ','ㅡ','ㅢ','ㅣ']
# Final consonants; index 0 ('-') is the placeholder for "no final consonant".
JONGSUNG_l =['-','ㄱ','ㄲ','ㄳ','ㄴ','ㄵ','ㄶ','ㄷ','ㄹ','ㄺ','ㄻ','ㄼ','ㄽ','ㄾ','ㄿ','ㅀ','ㅁ','ㅂ',
             'ㅄ','ㅅ','ㅆ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
# Compound final consonant -> its two component consonants.
# Covers every compound final that appears in JONGSUNG_l (including 'ㄼ' and 'ㅀ').
GYUP2CHO = {'ㄳ': 'ㄱㅅ','ㄵ': 'ㄴㅈ','ㄶ': 'ㄴㅎ','ㄺ': 'ㄹㄱ','ㄻ': 'ㄹㅁ','ㄼ': 'ㄹㅂ','ㄽ': 'ㄹㅅ','ㄾ': 'ㄹㅌ',
            'ㄿ': 'ㄹㅍ','ㅀ': 'ㄹㅎ','ㅄ': 'ㅂㅅ'}

In [4]:
def remain_kor(x):
    """Strip everything except Korean characters from each string.

    Args:
        x: iterable of strings.

    Returns:
        A list with one string per input element, containing only the
        Hangul compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ) and precomposed syllables
        (가-힣) of that element, in their original order.
    """
    # The syllable range must start at '가' (U+AC00); starting at '각'
    # (U+AC01) would silently drop the very first Hangul syllable.
    return [''.join(re.findall(r'[ㄱ-ㅎㅏ-ㅣ가-힣]', i)) for i in x]

# 한국어만 남기기


In [5]:
def remain_kor_eng_num(x):
    """Strip everything except Korean characters, ASCII letters and digits.

    Args:
        x: iterable of strings.

    Returns:
        A list with one filtered string per input element; whitespace,
        punctuation and any other characters are removed.
    """
    allowed = re.compile(r'[ㄱ-ㅎㅏ-ㅣ가-힣A-Za-z0-9]')
    cleaned = []
    for sentence in x:
        kept_chars = allowed.findall(sentence)
        cleaned.append(''.join(kept_chars))
    return cleaned

In [6]:
def long2short(x):
    """Collapse every run of a repeated word character to a single character.

    For example 'ㅋㅋㅋㅋㅋ' becomes 'ㅋ' and 'aabbaa' becomes 'aba'. Only
    characters matched by the regex class ``\\w`` are collapsed, so runs of
    spaces or punctuation are left untouched.

    Args:
        x: iterable of strings.

    Returns:
        A list with one collapsed string per input element.
    """
    # (\w)\1+ matches a maximal run of one repeated word character, so a
    # single substitution pass leaves no adjacent duplicates behind; no
    # fixpoint loop or per-character regex construction is needed.
    return [re.sub(r'(\w)\1+', r'\1', ele) for ele in x]



In [7]:
def text2chosung(x):
    """Decompose the Hangul in a string into individual jamo.

    Each character of ``x`` is handled on its own:

    * a precomposed syllable (가-힣) becomes three symbols: initial
      consonant, medial vowel and final consonant, with '-' standing in
      for a missing final (e.g. '한' -> 'ㅎㅏㄴ', '가' -> 'ㄱㅏ-');
    * a standalone consonant jamo (ㄱ-ㅎ) gets a trailing '-';
    * a standalone vowel jamo (ㅏ-ㅣ) is wrapped as '-<vowel>-';
    * anything else is copied through unchanged.

    Args:
        x: the string to decompose.

    Returns:
        The decomposed string.
    """
    result = []

    for keyword in x:
        if re.match(r'[가-힣]', keyword) is not None:
            # Offset from '가'; syllables are laid out as
            # (initial * 21 + medial) * 28 + final.
            char_code = ord(keyword) - BASE_CODE
            cho_idx, remainder = divmod(char_code, CHOSUNG)
            jung_idx, jong_idx = divmod(remainder, JUNGSUNG)
            result.append(CHOSUNG_l[cho_idx])
            result.append(JUNGSUNG_l[jung_idx])
            result.append(JONGSUNG_l[jong_idx])
        # The raw-string prefix belongs outside the quotes; written inside
        # the literal it demands a leading 'r', so this branch could never
        # match a lone consonant.
        # NOTE(review): this emits two symbols, whereas syllables and lone
        # vowels emit three - confirm that is the intended layout.
        elif re.match(r'[ㄱ-ㅎ]', keyword) is not None:
            result.append(keyword + '-')
        elif re.match(r'[ㅏ-ㅣ]', keyword) is not None:
            result.append('-' + keyword + '-')
        else:
            result.append(keyword)

    return ''.join(result)

def text2chosung_tolist(x):
    """Apply ``text2chosung`` to every string in ``x``.

    Args:
        x: iterable of strings.

    Returns:
        A list holding the ``text2chosung`` result for each element, in order.
    """
    return list(map(text2chosung, x))

# Decompose each Hangul syllable into its jamo (initial, medial, final), not only the initial consonant.


In [8]:
def replace_gyup(x):
    """Expand each compound consonant listed in ``GYUP2CHO`` into its parts.

    Args:
        x: iterable of strings.

    Returns:
        A new list of strings; the input is not modified.
    """
    def _expand(sentence):
        # Apply every mapping in turn to the same string.
        for compound, parts in GYUP2CHO.items():
            sentence = sentence.replace(compound, parts)
        return sentence

    return [_expand(sentence) for sentence in x]

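# Split compound final consonants (겹받침, e.g. ㄳ -> ㄱㅅ) into their component consonants; these are consonants, not compound vowels.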

In [9]:
def text_preprocessing(x):
    """Run the Korean-only preprocessing pipeline over a batch of strings.

    Steps, in order: keep only Korean characters, collapse repeated
    characters, decompose syllables into jamo, then split compound final
    consonants.

    Args:
        x: iterable of strings.

    Returns:
        A list with one preprocessed string per input element.
    """
    texts = remain_kor(x)
    texts = long2short(texts)
    texts = text2chosung_tolist(texts)
    # replace_gyup returns a new list rather than editing in place, so its
    # result has to be kept or the compound-consonant step is a no-op.
    texts = replace_gyup(texts)

    return texts


In [10]:
def text_preprocessing_can_eng_num(x):
    """Run the preprocessing pipeline while keeping English letters and digits.

    Steps, in order: keep only Korean characters, ASCII letters and digits,
    collapse repeated characters, decompose Hangul syllables into jamo, then
    split compound final consonants.

    Args:
        x: iterable of strings.

    Returns:
        A list with one preprocessed string per input element.
    """
    texts = remain_kor_eng_num(x)
    texts = long2short(texts)
    texts = text2chosung_tolist(texts)
    # replace_gyup returns a new list rather than editing in place, so its
    # result has to be kept or the compound-consonant step is a no-op.
    texts = replace_gyup(texts)

    return texts

In [11]:
# Two sample chat messages mixing Korean, English, digits and punctuation.
text = [
    't1은 lck를 잘 이긴다 하하하하하.',
    'faker는 롤의 신이다.',
]
# Show each pipeline's result followed by a space and a blank line.
print(text_preprocessing(text), end=' \n\n')
print(text_preprocessing_can_eng_num(text), end=' \n\n')

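# 1. Keep only Korean characters in each message (the *_can_eng_num variant also keeps English letters and digits).
# 2. Collapse runs of a repeated character (ㅋㅋㅋㅋㅋ > ㅋ).
# 3. Decompose each Hangul syllable into its jamo (initial, medial, final).
# 4. Split compound final consonants (겹받침) in the decomposed text into their components.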

['ㅇㅡㄴㄹㅡㄹㅈㅏㄹㅇㅣ-ㄱㅣㄴㄷㅏ-ㅎㅏ-', 'ㄴㅡㄴㄹㅗㄹㅇㅢ-ㅅㅣㄴㅇㅣ-ㄷㅏ-'] 

['t1ㅇㅡㄴlckㄹㅡㄹㅈㅏㄹㅇㅣ-ㄱㅣㄴㄷㅏ-ㅎㅏ-', 'fakerㄴㅡㄴㄹㅗㄹㅇㅢ-ㅅㅣㄴㅇㅣ-ㄷㅏ-'] 

