In [1]:
import sys
sys.path.append('/workspace/travel_ai/huggingface_konlpy')

from tqdm import tqdm
import multiprocessing

from konlpy.tag import Mecab
from transformers_konlpy import KoNLPyBertTokenizer
from tokenizers_konlpy import KoNLPyWordPieceTokenizer
from transformers import BasicTokenizer

### Single-Process Tokenization

In [2]:
# Sub-word tokenizer: a KoNLPy WordPiece tokenizer driven by Mecab (with
# use_tag=True), using the vocabulary file stored next to the tokenizer.
tokenizer = KoNLPyBertTokenizer(
    konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=True),
    vocab_file='../tokenizer/konlpy_tokenizer/vocab.txt',
)

# Word-level pre-tokenizer; lower-casing is disabled so the corpus stays cased.
basic_tokenizer = BasicTokenizer(do_lower_case=False)

In [4]:
%%time
# Read the whole preprocessed corpus into memory and split it into a list of
# lines (the recorded output below reports 65,543,688 lines).
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used — confirm it matches the encoding of the corpus file.
with open('../data/preprocessed_corpus.txt') as f:
    cased_lines = f.read().splitlines()
# Last expression of the cell: the line count shown as the cell output.
len(cased_lines)

65543688

In [5]:
import random

# Shuffle in place with a dedicated, seeded generator so that the
# train/valid/test split derived from this order is identical on every
# fresh "Restart Kernel -> Run All". Using a private random.Random instance
# leaves the global random state untouched.
# Re-running only this cell shuffles the already-shuffled list again, so the
# order is reproducible from a fresh load of the corpus, not across repeated
# executions of this cell.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(cased_lines)

In [14]:
# Total number of corpus lines, stored in a notebook-global and displayed as
# the cell output.
# NOTE(review): the single-letter name `l` is easy to misread as `1`; it is
# left unchanged here because other cells may reference it.
l = len(cased_lines)
l

65543688

In [18]:
def tokenize(cased_lines, tokenizer, basic_tokenizer):
    def write_file(fout, cased_lines, mode):
        print(f'---------------------{mode} data set')
        for i, cased_line in enumerate(tqdm(cased_lines)):
            tokens = basic_tokenizer.tokenize(cased_line)
            split_tokens = []
            for token in tokens:
                subtokens = tokenizer.tokenize(token)
                split_tokens += subtokens
            fout.write(' '.join(split_tokens) + '\n')
    
    write_file(open('../data/train_tokened_corpus.txt', 'w'), cased_lines[:(l//10)*8], 'train')
    write_file(open('../data/valid_tokened_corpus.txt', 'w'), cased_lines[(l//10)*8:(l//10)*9], 'valid')
    write_file(open('../data/test_tokened_corpus.txt', 'w'), cased_lines[(l//10)*9:], 'test')

In [19]:
# Run the single-process tokenization over the whole shuffled corpus.
# In the recorded output below the three splits took about 13h13m, 1h39m and
# 1h39m (roughly 16.5 hours in total, at about 1,100 lines per second).
tokenize(cased_lines, tokenizer, basic_tokenizer)

---------------------train data set


100%|██████████| 52434944/52434944 [13:12:47<00:00, 1102.33it/s]  


---------------------valid data set


100%|██████████| 6554368/6554368 [1:39:03<00:00, 1102.83it/s]


---------------------test data set


100%|██████████| 6554376/6554376 [1:38:31<00:00, 1108.67it/s]


### Multiprocessing

In [2]:
import random
import ray
# CPU budget given to the local Ray runtime; also the default worker count
# of `process` below.
num_cpus = 10

# Start (or reuse) the Ray runtime with the dashboard enabled on port 8265.
# NOTE(review): host "0.0.0.0" makes the dashboard listen on every network
# interface — confirm that this exposure is intended on this machine.
ray.init(
    num_cpus=num_cpus,
    include_dashboard=True,
    dashboard_host="0.0.0.0",
    dashboard_port=8265,
    ignore_reinit_error=True,
)

def get_chunks(fpath, chunk_num):
    """Yield the shuffled lines of ``fpath`` as ``chunk_num`` disjoint chunks.

    The file is read fully into memory, split into lines and shuffled in
    place. The lines are then partitioned into exactly ``chunk_num``
    consecutive, non-overlapping slices whose sizes differ by at most one, so
    every line appears in exactly one chunk. When the file has fewer lines
    than ``chunk_num`` the trailing chunks are empty lists.

    Args:
        fpath: Path of the text file to read.
        chunk_num: Number of chunks to produce; must be at least 1.

    Yields:
        list[str]: One chunk of lines per iteration.

    Raises:
        ValueError: If ``chunk_num`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_num < 1:
        raise ValueError(f'chunk_num must be >= 1, got {chunk_num}')
    with open(fpath) as f:
        cased_lines = f.read().splitlines()
    random.shuffle(cased_lines)
    # Spread the remainder over the first `remainder` chunks instead of
    # dropping it or duplicating it.
    base_size, remainder = divmod(len(cased_lines), chunk_num)
    start = 0
    for i in range(chunk_num):
        end = start + base_size + (1 if i < remainder else 0)
        # Bounded slice: the previous open-ended `cased_lines[start:]` skipped
        # the first chunk and made every later chunk overlap its successors.
        yield cased_lines[start:end]
        start = end

@ray.remote
def tokenize(cased_lines, tokenizer, basic_tokenizer):
    """Ray task: tokenize a chunk of lines and return the tokens per line.

    Each line is first split into words by ``basic_tokenizer.tokenize`` and
    every word is then expanded into sub-tokens by ``tokenizer.tokenize``.

    Note that this definition reuses the name ``tokenize`` and therefore
    replaces the single-process function of the same name defined earlier in
    the notebook.

    Args:
        cased_lines: Iterable of raw text lines.
        tokenizer: Object whose ``tokenize(word)`` returns an iterable of
            sub-tokens.
        basic_tokenizer: Object whose ``tokenize(line)`` returns an iterable
            of words.

    Returns:
        A list with one entry per input line; each entry is the flat list of
        sub-tokens of that line, in order.
    """
    tokenized = []
    for raw_line in cased_lines:
        pieces = [
            piece
            for word in basic_tokenizer.tokenize(raw_line)
            for piece in tokenizer.tokenize(word)
        ]
        tokenized.append(pieces)
    return tokenized


def process(cased_file, output_file, bert_model_type='bert-base-cased', workers=num_cpus):
    """Tokenize ``cased_file`` in parallel with Ray and write it to ``output_file``.

    The corpus is split into ``workers`` chunks by ``get_chunks``, each chunk
    is tokenized by one ``tokenize`` Ray task, and the tokenized lines are
    written to ``output_file`` one per line, tokens separated by a space.
    Ray is shut down when the function exits, whether it succeeded or not.

    Args:
        cased_file: Path of the input corpus (one sentence per line).
        output_file: Path of the tokenized output file (overwritten).
        bert_model_type: Unused; kept so existing callers keep working.
        workers: Number of chunks / Ray tasks to create.

    Returns:
        None. The result is the file written to ``output_file``.
    """
    tokenizer = KoNLPyBertTokenizer(
        konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=True),
        vocab_file='../tokenizer/konlpy_tokenizer/vocab.txt',
    )
    basic_tokenizer = BasicTokenizer(do_lower_case=False)
    try:
        # `get_chunks` requires the chunk count and the remote `tokenize`
        # requires both tokenizers; the previous calls omitted them and
        # failed with TypeError before any work started.
        # NOTE(review): the tokenizers are shipped to every task as arguments,
        # so they must be serializable by Ray — confirm this holds for the
        # Mecab-backed tokenizer.
        futures = [
            tokenize.remote(chunked_list, tokenizer, basic_tokenizer)
            for chunked_list in get_chunks(cased_file, workers)
        ]
        results = ray.get(futures)
        # `results` holds one list per chunk, so count the sentences inside
        # the chunks rather than the chunks themselves.
        total_sentences = sum(len(lines) for lines in results)
        print(f'total tokenized sentences : {total_sentences}')
        # Open the output only once the results exist, so a failed run does
        # not truncate a previous output file; `with` closes it on any exit.
        with open(output_file, 'w') as fout:
            for lines in tqdm(results):
                for line in lines:
                    fout.write(' '.join(line) + '\n')
    finally:
        ray.shutdown()

In [None]:
# Run the parallel tokenization. `finally` alone guarantees that Ray is shut
# down on both success and failure; the previous bare `except:` additionally
# swallowed every error (including KeyboardInterrupt), so a failed run looked
# like a successful one.
try:
    process('../data/preprocessed_corpus.txt', '../data/tokenized_corpus.txt')
finally:
    ray.shutdown()