In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import json

from unicode import split_syllables, join_jamos
from tqdm import tqdm

## Data Load

In [3]:
text = "text"

In [4]:
all_data = load_dataset('csv',data_files='./order_speech_ko.csv',split='train')

Using custom data configuration default-8e0d0b8910acc620
Reusing dataset csv (C:\Users\Mu-jun\.cache\huggingface\datasets\csv\default-8e0d0b8910acc620\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


## Data Cleaning

In [5]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch[text] = re.sub(chars_to_ignore_regex, '', batch[text]).lower() + " "
    return batch

In [6]:
remove_spectial_char_data = all_data.map(remove_special_characters)



  0%|          | 0/142367 [00:00<?, ?ex/s]

## Split to Font

In [7]:
def extract_all_chars(batch):
    all_text = " ".join(batch[text])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [8]:
char_vocab = remove_spectial_char_data.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=remove_spectial_char_data.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

## Add index

In [9]:
vocab_list = list(set(char_vocab["vocab"][0]))

In [10]:
vocab_dict = {v: k+5 for k, v in enumerate(vocab_list)}

## Add Special Token

In [11]:
vocab_dict["|"] = 4
vocab_dict["<pad>"] = 0
vocab_dict["<s>"] = 1
vocab_dict["</s>"] = 2
vocab_dict["<unk>"] = 3
len(vocab_dict)

605

## Export Vocabulary to json file

In [13]:
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [12]:
vocab_dict

{'그': 5,
 '떨': 6,
 '궁': 7,
 '했': 8,
 '덤': 9,
 '욱': 10,
 '청': 11,
 '깥': 12,
 '겟': 13,
 '갔': 14,
 '습': 15,
 '명': 16,
 '반': 17,
 '괜': 18,
 '놓': 19,
 '텔': 20,
 '탑': 21,
 '람': 22,
 '호': 23,
 '되': 24,
 '가': 25,
 '캐': 26,
 '막': 27,
 '원': 28,
 '된': 29,
 '하': 30,
 '감': 31,
 '달': 32,
 '늦': 33,
 '고': 34,
 '갈': 35,
 '익': 36,
 '쫌': 37,
 '등': 38,
 '슬': 39,
 '뀌': 40,
 '쉼': 41,
 '력': 42,
 '모': 43,
 '코': 44,
 '프': 45,
 '당': 46,
 '체': 47,
 '라': 48,
 '귀': 49,
 '메': 50,
 '심': 51,
 '존': 52,
 '얘': 53,
 '것': 54,
 '미': 55,
 '보': 56,
 '직': 57,
 '음': 58,
 '닭': 59,
 '셔': 60,
 '킬': 61,
 '광': 62,
 '식': 63,
 '떻': 64,
 '연': 65,
 '행': 66,
 '말': 67,
 '뭔': 68,
 '오': 69,
 '긴': 70,
 '씬': 71,
 '월': 72,
 '둬': 73,
 '엌': 74,
 '렇': 75,
 '와': 76,
 '랑': 77,
 '삼': 78,
 '건': 79,
 '몰': 80,
 '딱': 81,
 '린': 82,
 '녹': 83,
 '켤': 84,
 '취': 85,
 '봐': 86,
 '까': 87,
 '색': 88,
 '머': 89,
 '놔': 90,
 '절': 91,
 '남': 92,
 '액': 93,
 '꿀': 94,
 '브': 95,
 '총': 96,
 '드': 97,
 '답': 98,
 '찾': 99,
 '꺼': 100,
 '흐': 101,
 '낫': 102,
 '한': 103,
 '기': 104,
