### Import packages

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import re
import emoji
from soynlp.normalizer import repeat_normalize
from kss import split_sentences
import os

### Load Dataset

In [None]:
# Directory (on the mounted shared Google Drive) and file name of the review CSV.
DATASET_PATH = '/content/drive/Shareddrives/AmorePacific2021/5.newcode/dataset/'
DATA_NAME = 'amore_data_above8_rating.csv'

# DATASET_PATH already ends with '/', so plain concatenation yields the full path.
df = pd.read_csv(f'{DATASET_PATH}{DATA_NAME}')

# Raw review texts: one list entry per row of the `review_split` column.
reviews = df['review_split'].tolist()

### Preprocessing

In [None]:
def _emoji_chars():
    """Return every emoji known to the installed `emoji` package as one string.

    The table holding the emoji moved between releases of the package:
    2.x exposes `EMOJI_DATA`, 1.x nests per-language tables inside
    `UNICODE_EMOJI` (keyed by language code), and older releases keep a flat
    `UNICODE_EMOJI` dict keyed by the emoji themselves.  Reading
    `UNICODE_EMOJI.keys()` unconditionally therefore either raises
    `AttributeError` or silently yields language codes instead of emoji.
    """
    table = getattr(emoji, 'EMOJI_DATA', None)   # emoji >= 2.0
    if table is None:
        table = emoji.UNICODE_EMOJI              # emoji < 2.0
        if 'en' in table:                        # emoji 1.x: {'en': {...}, 'es': {...}, ...}
            table = table['en']
    return ''.join(table.keys())


emojis = _emoji_chars()

# Matches runs of characters that are NOT in the allowed set: space, the listed
# punctuation, the \x00-\x7F range, Hangul jamo (ㄱ-ㅣ), Hangul syllables (가-힣)
# and emoji.  The emoji string is escaped so that none of its characters can be
# interpreted as character-class syntax.
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{re.escape(emojis)}]+')


def clean(x: str) -> str:
    """Apply basic preprocessing to a single review text.

    Only light normalisation is performed:

    1. every run of characters outside the allowed set (see `pattern`) is
       replaced by a single space,
    2. leading and trailing whitespace is stripped,
    3. repeated characters are shortened via soynlp's `repeat_normalize`
       with `num_repeats=2`.

    Args:
        x: Raw review text.

    Returns:
        The cleaned text.
    """
    x = pattern.sub(' ', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

# Cleaned version of every review, in the same order as `reviews`.
contents = [clean(x) for x in reviews]

## Save preprocessed data
# Output layout: one sentence per line, with an empty line after each review.
CLEAN_REVIEWS_PATH = 'amore_clean_reviews.txt'

# Skip the (slow) sentence splitting when the file was already produced.
if not os.path.exists(CLEAN_REVIEWS_PATH):
    # Write to a temporary file first and rename it only on success, so that an
    # interrupted run cannot leave a truncated file that later runs would
    # mistake for a finished one.
    tmp_path = CLEAN_REVIEWS_PATH + '.tmp'
    # Explicit UTF-8: the text contains Hangul and emoji, which the platform's
    # default encoding is not guaranteed to be able to represent.
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for doc in tqdm(contents):
            ## Split reviews by sentences
            f.writelines(line + '\n' for line in split_sentences(doc))
            f.write('\n')
    os.replace(tmp_path, CLEAN_REVIEWS_PATH)

### Domain Adaptation by MLM

In [None]:
'''
You can run the below codes on command prompt
This is for adapting general KOREAN Bert model(based on Wikipedia) to BEAUTY domain(by using Amore review data)

Reference : https://github.com/Beomi/KcBERT & https://beomi.github.io/2021/03/15/KcBERT-MLM-Finetune/
Output : https://huggingface.co/Kyoungmin/beauty-base-KLCP2
'''
!mkdir ./test-mlm-amore

!python run_mlm.py \
    --model_name_or_path beomi/kcbert-base \
    --train_file amore_clean_reviews.txt \
    --do_train \
    --output_dir ./test-mlm-amore