In [1]:
import torch
torch.cuda.is_available()

True

https://huggingface.co/docs/transformers/model_doc/bert

In [2]:
# load tokenizer
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast

tokenizer = BertTokenizerFast(
    vocab_file = './hf_tokenizer_special/vocab.txt',
    max_len = 128,
    do_lower_case=False,
)

In [3]:
print(tokenizer.tokenize('曔은 [MASK] 엄청 맛있었고 촉촉하고 바삭했어요.'))
tokenizer.add_special_tokens({'mask_token':'[MASK]'})
print(tokenizer.tokenize('曔은 [MASK] 엄청 맛있었고 촉촉하고 바삭했어요.'))

['[UNK]', '은', '[UNK]', 'MA', '##S', '##K', '[UNK]', '엄청', '맛있', '##었', '##고', '촉촉', '##하고', '바삭', '##했', '##어요', '.']
['[UNK]', '은', '[MASK]', '엄청', '맛있', '##었', '##고', '촉촉', '##하고', '바삭', '##했', '##어요', '.']


In [4]:
config = BertConfig(
    vocab_size = 30000, #####
    # hidden_size=512,
    # num_hidden_layers=12,
    # num_attention_heads=8,
    # intermediate_size=3072, # transformer 내의 feed-forward network dimension size
    # hidden_act='gelu',
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    max_position_embedding=128, # limit num of tokens from sentence
    # type_vocab_size=2,
    # pad_token_id=0,
    # position_embedding_type='absolute',
)

model = BertForPreTraining(config=config)
model.num_parameters()

109705010

In [5]:
import os
import json
import pickle
import random
import time
import warnings
from filelock import FileLock

from torch.utils.data.dataset import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers.utils import logging
from typing import Dict, List, Optional

logger = logging.get_logger(__name__)

In [6]:
# making corpus 
# \n : separate sentence. \n\n : separate document.
from dbConn.mongo_conn import config

conn = config()
col = conn["travel_ai"].blog_contents
contents = col.find({"num_docs": {"$gt": 1}}, {"cleaned_content": 1})
input_f = "./data/for_pretrain_corpus.txt"

f = open(input_f, "w")
for cont in contents:
    docs = [c for c in cont['cleaned_content']]
    f.write('\n'.join(docs))
    f.write('\n')
del contents

f.close()
conn.close()

In [7]:
# BERT task 1 : NSP, Next Sentence Prediction

class TextDatasetForNextSentencePrediction(Dataset):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):
        # training data caching
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
        self.short_seq_probability = short_seq_probability
        self.nsp_probability = nsp_probability

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_nsp_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        self.tokenizer = tokenizer

        lock_path = cached_features_file + ".lock"

        # Input file format:
        # (1) One sentence per line. These should ideally be actual sentences, not
        # entire paragraphs or arbitrary spans of text. (Because we use the
        # sentence boundaries for the "next sentence prediction" task).
        # (2) Blank lines between documents. Document boundaries are needed so
        # that the "next sentence prediction" task doesn't span between documents.
        #
        # Example:
        # I am very happy.
        # Here is the second sentence.
        #
        # A new document.

        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else: # no cached data
                logger.info(f"Creating features from dataset file at {directory}")
                self.documents = [[]] # training per document
                with open(file_path, encoding="utf-8") as f:
                    while True:
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip()

                        # \n\n -> documnet
                        if not line and len(self.documents[-1]) != 0:
                            self.documents.append([])
                        tokens = tokenizer.tokenize(line)
                        tokens = tokenizer.convert_tokens_to_ids(tokens)
                        if tokens:
                            self.documents[-1].append(tokens)
                logger.info(f"Creating examples from {len(self.documents)} documents.")
                self.examples = []
                # transform for training data
                for doc_index, document in enumerate(self.documents):
                    self.create_examples_from_document(document, doc_index) # 함수로 가봅시다.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
        """Creates examples for a single document."""
        # size - 2 because of [CLS], [SEP] token, so limit max_position_embedding - 2 each sentence
        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

        # We *usually* want to fill up the entire sequence since we are padding
        # to `block_size` anyways, so short sequences are generally wasted
        # computation. However, we *sometimes*
        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
        # sequences to minimize the mismatch between pretraining and fine-tuning.
        # The `target_seq_length` is just a rough target however, whereas
        # `block_size` is a hard limit.

        target_seq_length = max_num_tokens
        if random.random() < self.short_seq_probability: # short_seq_probability 2 ~ (max_position_embedding - 2) 사이의 랜덤 학습 데이터 생성
            target_seq_length = random.randint(2, max_num_tokens)

        current_chunk = []  # a buffer stored current working segments
        current_length = 0
        i = 0

        # sentence_1[SEP]sentence_2 이 기본이나, (max_position_embedding - 2)길이의 토큰을 채울 수 있도록
        # sentence_1+sentence_2[SEP]sentence_3+sentence_4 형태로 만들어질 수 있음
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    # sentence_1+sentence_2 가 이루어졌을 때, 길이를 random하게 자름
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)
                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])
                    # [SEP] 뒷부분 문장 처리
                    tokens_b = []
                    # 50%의 확률로 랜덤하게 다른 문장을 선택하거나, 다음 문장을 학습데이터로 생성.
                    if len(current_chunk) == 1 or random.random() < self.nsp_probability:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # This should rarely go for more than one iteration for large
                        # corpora. However, just to be careful, we try to make sure that
                        # the random document is not the same as the document
                        # we're processing.
                        for _ in range(10):
                            random_document_index = random.randint(0, len(self.documents) - 1)
                            if random_document_index != doc_index:
                                break
                        # ransom sampling
                        random_document = self.documents[random_document_index]
                        random_start = random.randint(0, len(random_document) - 1)
                        for j in range(random_start, len(random_document)):
                            tokens_b.extend(random_document[j])
                            if len(tokens_b) >= target_b_length:
                                break
                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    # over (max_position_embedding - 2), remove segmentA or segmentB randomly
                    def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
                        """Truncates a pair of sequences to a maximum sequence length."""
                        while True:
                            total_length = len(tokens_a) + len(tokens_b)
                            if total_length <= max_num_tokens:
                                break
                            trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
                            assert len(trunc_tokens) >= 1
                            # We want to sometimes truncate from the front and sometimes from the
                            # back to add more randomness and avoid biases.
                            if random.random() < 0.5:
                                del trunc_tokens[0]
                            else:
                                trunc_tokens.pop()

                    truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    # add special tokens
                    input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
                    # add token type ids, 0 for sentence a, 1 for sentence b
                    token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
                    
                    # completed making NSP dataset
                    example = {
                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                        "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
                    }

                    self.examples.append(example)

                current_chunk = []
                current_length = 0

            i += 1

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

### Bert Architecture
- BASE : transformer block(L):12, hidden layer size(H):768, Attention head(A):12
- LARGE : transformer block(L):24, hidden layer size(H):1024, Attention head(A):16

In [8]:
%%time
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="./data/for_pretrain_corpus.txt",
    block_size=128,
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)

data_collator = DataCollatorForLanguageModeling(    # task 2: no need extra implement for MSM [MASK]
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Token indices sequence length is longer than the specified maximum sequence length for this model (139 > 128). Running this sequence through the model will result in indexing errors


CPU times: user 8min 8s, sys: 5.78 s, total: 8min 14s
Wall time: 8min 21s


In [9]:
# check dataset
for example in dataset.examples[0:1]:
    print(example)

{'input_ids': tensor([    2, 19256,  8432,  8803, 12993, 16017,  7147, 10416, 20764, 19256,
         8432, 27446,    75, 19256,  8432,  9415,  9798, 19256,  8432,  4263,
         4767,  7147, 10416, 19256,  8432,  8803,  9798, 28536, 19256,  8432,
         4650,  7027, 27446,  8754,  7355,  4240,  8803,  7355,  4263,  4767,
         7355, 10416, 19256,  8432,  7007,  4274,  4206,  4214,  4266,  6650,
         2234,  4477,  4382,  6843, 10483,  8159,  4240,  6959,  8637,  7443,
         4633,  8730,  4444,  1710, 17246,  6925,  9751,     3,  7010,  8159,
        19752,  4444,  6552,  4494,  8996,  4330,  4328,  6926,  4478,  6703,
        10483,     9,  6952,  4444,  8996,  4045, 16041,  4391,  2755,  4291,
         7693,  4237,  7738,  2136,  2755, 16967,     9,  6644,     7,  6490,
         9859,  2244,  4229,  4291,     5, 10412,  5273,  4144,  4277,  4942,
         6551,  8996,  4330,  4900,  8996,  4330,  4328,  6926,  7010,  8159,
         8240,  6644, 13087,  8292,     3]), 'toke

In [10]:
# check MSM data collator
print(data_collator(dataset.examples))

{'input_ids': tensor([[    2, 19256,     4,  ..., 13087,  8292,     3],
        [    2,  7355,  4240,  ...,  4196,     4,     3],
        [    2,  7010, 18911,  ...,  6593,  6483,     3],
        ...,
        [    2,  4023,  4330,  ...,     4, 10795,     3],
        [    2,  8210,     4,  ..., 11215, 11652,     3],
        [    2,  7160, 28685,  ...,  4494,  4371,     3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'next_sentence_label': tensor([0, 1, 1,  ..., 0, 1, 1]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, 8432,  ..., -100, -100, -100],
        [-100, -100, -100,  ..

In [11]:
print(data_collator(dataset.examples)['input_ids'][0])

tensor([    2, 19256,  8432,  8803, 12993, 16017,  7147, 10416, 20764, 19256,
         8432, 27446,     4,     4,  8432,  9415,  9798, 19256,  8432,  4263,
         4767,  7147, 10416, 19256,  8432,  8803,  9798, 28536, 19256,  8432,
         4650,  7027, 27446,  8754,     4,  4240,  8803,  7355,  4263,  4767,
         7355, 10416, 19256,  8432,  7007,  4274,  4206,  4214,  4266,  6650,
         2234,  4477,     4,  6843, 10483,  8159,     4, 16082,  8637,  7443,
            4,  8730,     4,  1710, 17246,  6925,  9751,     3,  7010,  8159,
        19752,  4444,  6552,  4494,  8996,  4330,  4328,  6926,     4,  6703,
        10483,     9,  6952,  4444,  8996,  4045, 16041,  4391,     4,  4291,
            4,  4237,     4,  2136,  2755,     4,     9,  6644,     7,  6490,
         9859,  2244,  4229,     4,     5, 10412,  5273,  4144,  4277,  4942,
         6551,  8996,     4,  4900,  8996,     4,  4328,  6926,  7010,  8159,
         8240,  6644, 13087,  8292,     3])


In [12]:
tokenizer.decode(data_collator(dataset.examples)['input_ids'][0].tolist())

'[CLS] 까막바위 일출 해맞이 해돋이 겨울바다 한파 까막바위에서 ~ 까막바위 대한민국 동해시 까막바위일출 겨울바다 까막바위 일출 동해시 묵호진동 까막바위옆 숙소에서 바라본 동해의 일출 동해일출 [MASK]바다 까막바위 자연은있는그대로 심 모였당 사랑합니다 고성의 모든 여행지 정보를 [MASK]에 [MASK]려면 아래 [MASK] [SEP] 강원도 고성 죽왕면에 위치한 가진 [MASK]늠욕장을 소개합니다. [MASK]에 가진 회센터 [MASK] 있어 물회도 드실 수 있습니다. 바다 [MASK] 정말 보고 싶었어! 양평댁 히나 [MASK] 여행 가진해변 가진해수욕장 강원도 [MASK] [MASK] 바다여행 휴식 [SEP]'

In [13]:
# start pretrain

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32, # 32
    save_steps=10000, # model save each step
    save_total_limit=2, # only save last 2 models
    logging_steps=10000
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 930019
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 290640


Step,Training Loss
10000,7.2565
20000,5.5005
30000,4.3454
40000,3.8809
50000,3.6068
60000,3.4168
70000,3.2491
80000,3.1525
90000,3.0557
100000,2.9667


Saving model checkpoint to model_output/checkpoint-10000
Configuration saved in model_output/checkpoint-10000/config.json
Model weights saved in model_output/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to model_output/checkpoint-20000
Configuration saved in model_output/checkpoint-20000/config.json
Model weights saved in model_output/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to model_output/checkpoint-30000
Configuration saved in model_output/checkpoint-30000/config.json
Model weights saved in model_output/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to model_output/checkpoint-40000
Configuration saved in model_output/checkpoint-40000/config.json
Model weights saved in model_output/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to model_output/checkpoint-500

TrainOutput(global_step=290640, training_loss=3.0747561038180287, metrics={'train_runtime': 55063.0656, 'train_samples_per_second': 168.901, 'train_steps_per_second': 5.278, 'total_flos': 6.017473430617651e+17, 'train_loss': 3.0747561038180287, 'epoch': 10.0})

In [18]:
trainer.save_model('./model_output')

Saving model checkpoint to ./model_output
Configuration saved in ./model_output/config.json
Model weights saved in ./model_output/pytorch_model.bin


In [16]:
from transformers import BertForMaskedLM, pipeline

In [19]:
my_model = BertForMaskedLM.from_pretrained('model_output')

loading configuration file model_output/config.json
Model config BertConfig {
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedding": 128,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file model_output/pytorch_model.bin
Some weights of the model checkpoint at model_output were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a

In [20]:
tokenizer.tokenize('오늘 카페에가서 [MASK]를 먹었는데요, 너무 맛있떠라구요.')

['오늘',
 '카페',
 '##에',
 '##가',
 '##서',
 '[MASK]',
 '를',
 '먹',
 '##었',
 '##는데',
 '##요',
 ',',
 '너무',
 '맛있',
 '##떠',
 '##라',
 '##구요',
 '.']

In [21]:
nlp_fill = pipeline('fill-mask', top_k=5, model=my_model, tokenizer=tokenizer)
nlp_fill('오늘 카페에가서 [MASK]를 먹었는데요, 너무 맛있떠라구요.')

[{'score': 0.24728845059871674,
  'token': 7092,
  'token_str': '아메리카노',
  'sequence': '[CLS] 오늘 카페에가서 아메리카노 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.1023724377155304,
  'token': 6527,
  'token_str': '커피',
  'sequence': '[CLS] 오늘 카페에가서 커피 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.04726902395486832,
  'token': 8890,
  'token_str': '아아',
  'sequence': '[CLS] 오늘 카페에가서 아아 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.040487710386514664,
  'token': 6798,
  'token_str': '디저트',
  'sequence': '[CLS] 오늘 카페에가서 디저트 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.038824670016765594,
  'token': 10955,
  'token_str': '팥빙수',
  'sequence': '[CLS] 오늘 카페에가서 팥빙수 를 먹었는데요, 너무 맛있떠라구요. [SEP]'}]

In [22]:
nlp_fill('[MASK]를 먹었는데요, 너무 맛있떠라구요.')

[{'score': 0.02833535522222519,
  'token': 6,
  'token_str': ')',
  'sequence': '[CLS] ) 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.025303687900304794,
  'token': 8570,
  'token_str': '고등어',
  'sequence': '[CLS] 고등어 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.020374711602926254,
  'token': 7121,
  'token_str': '장어',
  'sequence': '[CLS] 장어 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.020118871703743935,
  'token': 7989,
  'token_str': '떡갈비',
  'sequence': '[CLS] 떡갈비 를 먹었는데요, 너무 맛있떠라구요. [SEP]'},
 {'score': 0.014281458221375942,
  'token': 22,
  'token_str': '?',
  'sequence': '[CLS]? 를 먹었는데요, 너무 맛있떠라구요. [SEP]'}]