# HuggingFace Examples

In [1]:
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

In [2]:
model_name = 'monologg/kobigbird-bert-base'
fill_mask = pipeline(
    "fill-mask",
    model=model_name,
    tokenizer=model_name
)

Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
fill_mask("미국의 수도는 [MASK]이다.")

Attention type 'block_sparse' is not possible if sequence_length: 10 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


[{'score': 0.5767750144004822,
  'token': 10821,
  'token_str': '워싱턴',
  'sequence': '미국의 수도는 워싱턴 이다.'},
 {'score': 0.09579449146986008,
  'token': 8386,
  'token_str': '뉴욕',
  'sequence': '미국의 수도는 뉴욕 이다.'},
 {'score': 0.06737150251865387,
  'token': 13130,
  'token_str': 'LA',
  'sequence': '미국의 수도는 LA 이다.'},
 {'score': 0.05947323143482208,
  'token': 7581,
  'token_str': '수도',
  'sequence': '미국의 수도는 수도 이다.'},
 {'score': 0.03958961367607117,
  'token': 21661,
  'token_str': '필라델피아',
  'sequence': '미국의 수도는 필라델피아 이다.'}]

In [4]:
fill_mask("나는 [MASK]를 먹는다.")

[{'score': 0.045219432562589645,
  'token': 2760,
  'token_str': '나',
  'sequence': '나는 나 를 먹는다.'},
 {'score': 0.03540102019906044,
  'token': 10661,
  'token_str': '우유',
  'sequence': '나는 우유 를 먹는다.'},
 {'score': 0.03112361952662468,
  'token': 8512,
  'token_str': '고기',
  'sequence': '나는 고기 를 먹는다.'},
 {'score': 0.026315296068787575,
  'token': 15610,
  'token_str': '고구마',
  'sequence': '나는 고구마 를 먹는다.'},
 {'score': 0.01981933042407036,
  'token': 14976,
  'token_str': '쇠고기',
  'sequence': '나는 쇠고기 를 먹는다.'}]

In [5]:
fill_mask("한국의 [MASK]는 인천이다.")

[{'score': 0.8281315565109253,
  'token': 7581,
  'token_str': '수도',
  'sequence': '한국의 수도 는 인천이다.'},
 {'score': 0.015533183701336384,
  'token': 6936,
  'token_str': '도시',
  'sequence': '한국의 도시 는 인천이다.'},
 {'score': 0.008934497833251953,
  'token': 9879,
  'token_str': '고도',
  'sequence': '한국의 고도 는 인천이다.'},
 {'score': 0.008657638914883137,
  'token': 12588,
  'token_str': '중심지',
  'sequence': '한국의 중심지 는 인천이다.'},
 {'score': 0.005730825942009687,
  'token': 20365,
  'token_str': '소재지',
  'sequence': '한국의 소재지 는 인천이다.'}]

In [6]:
fill_mask("인천 [MASK] 대학교")

[{'score': 0.07572885602712631,
  'token': 7841,
  'token_str': '소재',
  'sequence': '인천 소재 대학교'},
 {'score': 0.06376954168081284,
  'token': 6816,
  'token_str': '지역',
  'sequence': '인천 지역 대학교'},
 {'score': 0.053160928189754486,
  'token': 517,
  'token_str': '-',
  'sequence': '인천 - 대학교'},
 {'score': 0.04669084772467613,
  'token': 7019,
  'token_str': '국제',
  'sequence': '인천 국제 대학교'},
 {'score': 0.03119579143822193,
  'token': 12504,
  'token_str': '시립',
  'sequence': '인천 시립 대학교'}]

# KoBERT

In [1]:
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

In [7]:
tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1', use_fast=False)

In [8]:
tokenizer

PreTrainedTokenizer(name_or_path='skt/kobert-base-v1', vocab_size=8002, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [9]:
tokenizer.tokenize('한국어 모델을 공유합니다.')

['▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.']

In [10]:
tokenizer.encode("한국어 모델을 공유합니다.")

[4958, 6855, 2046, 7088, 1050, 7843, 54, 3, 2]

## Sentiment Analysis using pretrained model(w. klue/bert-base)

https://huggingface.co/docs/transformers/custom_datasets

In [2]:
model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [3]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [4]:
import json
import os
import torch

from collections import Counter
from datasets import Dataset
from transformers import Trainer, TrainingArguments

In [5]:
raw_paths = [os.path.join('nsmc', 'raw', path) for path in os.listdir('nsmc/raw')]

In [38]:
dat = {'text': [], 'label': []}

In [39]:
labels = []
for path in raw_paths:
    with open(path) as f:
        data = json.load(f)
    for i in data:
        if int(i['rating']) >= 9:
            dat['text'].append(i['review'])
            dat['label'].append(1)
        elif int(i['rating']) <= 4:
            dat['text'].append(i['review'])
            dat['label'].append(0)

In [40]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

In [41]:
train_dat = {'text': dat['text'][:1000], 'label': dat['label'][:1000]}
train_dat = Dataset.from_dict(train_dat, split='train')
train_dataset = train_dat.map(preprocess_function, batched=True, num_proc=7)
train_dataset = train_dataset.remove_columns('text')

In [42]:
valid_dat = {'text': dat['text'][1000:2000], 'label': dat['label'][1000:2000]}
valid_dat = Dataset.from_dict(valid_dat, split='train')
valid_dataset = valid_dat.map(preprocess_function, batched=True, num_proc=7)
valid_dataset = valid_dataset.remove_columns('text')

In [43]:
from datasets import load_metric
import numpy as np
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [44]:
train_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [45]:
valid_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [50]:
args = TrainingArguments(output_dir='./sentiment',
                         per_device_train_batch_size=4,
                         logging_steps=100,
                         evaluation_strategy='epoch',
                         do_eval=True)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [51]:
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset=valid_dataset,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

In [52]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1305,1.09688,0.831
2,0.0684,0.926864,0.873
3,0.0635,0.924929,0.876


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./sentiment/checkpoint-500
Configuration saved in ./sentiment/checkpoint-500/config.json
Model weights saved in ./sentiment/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./sentiment/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./sentiment/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.0730759978418549, metrics={'train_runtime': 233.4359, 'train_samples_per_second': 12.851, 'train_steps_per_second': 3.213, 'total_flos': 789333166080000.0, 'train_loss': 0.0730759978418549, 'epoch': 3.0})

# KoGPT2

In [53]:
from transformers import GPT2LMHeadModel
from transformers import PreTrainedTokenizerFast

In [54]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2',
                                                    bos_token='<s>',
                                                    eos_token='</s>',
                                                    pad_token='<pad>',
                                                    mask_token='<mask>')

loading file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/fd8418e6675550cbca8ad6c102d717aa89372eb7a632ad3168300c7fed43491c.db074bfdd88bec54455de5ee2400efdbc64d4acf449a44d5f314e79c1eadc611
loading file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/13bb826cf24517d7849a701e02452715a67c5e560142be3d4735442b2a545809.6b384eec6effdd44287f67715cd55bd0dff2cf846d843b932b43ba7b632b8b1e
Model config GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "archite

In [55]:
tokenizer.encode("안녕하세요. 한국어 GPT-2 모델입니다.")

[25906, 8702, 7801, 25856, 34407, 10528, 422, 426, 18258, 14652, 21154]

In [56]:
import torch

In [57]:
text = "근육이 커지기 위해서는"
input_ids = tokenizer.encode(text)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
gen_ids = model.generate(torch.tensor([input_ids]),
                        max_length=128,
                        repetition_penalty=2.0,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                        bos_token_id=tokenizer.bos_token_id,
                        use_cache=True)
generated = tokenizer.decode(gen_ids[0,:].tolist())

loading configuration file https://huggingface.co/skt/kogpt2-base-v2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/13bb826cf24517d7849a701e02452715a67c5e560142be3d4735442b2a545809.6b384eec6effdd44287f67715cd55bd0dff2cf846d843b932b43ba7b632b8b1e
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_at

In [58]:
generated

'근육이 커지기 위해서는 무엇보다 규칙적인 생활습관이 중요하다.\n특히, 아침식사는 단백질과 비타민이 풍부한 과일과 채소를 많이 섭취하는 것이 좋다.\n또한 하루 30분 이상 충분한 수면을 취하는 것도 도움이 된다.\n아침 식사를 거르지 않고 규칙적으로 운동을 하면 혈액순환에 도움을 줄 뿐만 아니라 신진대사를 촉진해 체내 노폐물을 배출하고 혈압을 낮춰준다.\n운동은 하루에 10분 정도만 하는 게 좋으며 운동 후에는 반드시 스트레칭을 통해 근육량을 늘리고 유연성을 높여야 한다.\n운동 후 바로 잠자리에 드는 것은 피해야 하며 특히 아침에 일어나면 몸이 피곤해지기 때문에 무리하게 움직이면 오히려 역효과가 날 수도 있다.\n운동을'

# KoBART

In [59]:
model_name = 'gogamza/kobart-base-v2'

In [60]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gogamza/kobart-base-v2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/54a37e9385f90886428b084042f151c1a699203416d41765d94aac4cddb5fd5c.d098ef3866c1da94bdfaa5c1f24ecb7c5c16b37423b79263fbd3668d2ae61f91
Model config BartConfig {
  "_name_or_path": "gogamza/kobart-base-v2",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_

In [61]:
tokenizer.tokenize("안녕하세요. 한국어 BART입니다.ㅁ:)ㅣ^o")

['▁안녕하',
 '세요.',
 '▁한국어',
 '▁B',
 'A',
 'R',
 'T',
 '입니다.',
 'ᄆ',
 ':)',
 '▁',
 'ᅵ',
 '^',
 'o']

In [62]:
from transformers import AutoModel

In [63]:
model = AutoModel.from_pretrained(model_name)

loading configuration file https://huggingface.co/gogamza/kobart-base-v2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/54a37e9385f90886428b084042f151c1a699203416d41765d94aac4cddb5fd5c.d098ef3866c1da94bdfaa5c1f24ecb7c5c16b37423b79263fbd3668d2ae61f91
Model config BartConfig {
  "_name_or_path": "gogamza/kobart-base-v2",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "

In [64]:
inputs = tokenizer(['안녕하세요'], return_tensors='pt')

In [65]:
inputs

{'input_ids': tensor([[22465, 23935]]), 'token_type_ids': tensor([[0, 0]]), 'attention_mask': tensor([[1, 1]])}

In [66]:
results = model(inputs['input_ids'])

In [67]:
results.keys()

odict_keys(['last_hidden_state', 'past_key_values', 'encoder_last_hidden_state'])

In [68]:
results['encoder_last_hidden_state'].shape # (batch_size, sequence_length, hidden state dimension)

torch.Size([1, 2, 768])

In [69]:
results['last_hidden_state'].shape

torch.Size([1, 2, 768])

## BART for summarization

In [70]:
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration

In [71]:
model_name = 'gogamza/kobart-summarization'

In [72]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

loading file https://huggingface.co/gogamza/kobart-summarization/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/4369897f91813214377063544fb9a44ad537ca3a2559c7bdc98eaf9d934d4a89.dc2013f8bbecd755468e2c44397f53dc624be5451d0190744397caf61a20383f
loading file https://huggingface.co/gogamza/kobart-summarization/resolve/main/added_tokens.json from cache at /root/.cache/huggingface/transformers/c8171f2310611c5f6994c35b7016633d42194eb424192baa1910c896fdd197f6.04312f398a3bbda664297588800a86e0fda9d4ef4f0749cd9d96f88043daad39
loading file https://huggingface.co/gogamza/kobart-summarization/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/aed722871fe9f8d064a1df70dcfe967be2f02797eaaaf6ee28ffd2c59d7514e9.15447ae63ad4a2eba8bc7a5146360711dc32b315b4f1488b4806debf35315e9a
loading file https://huggingface.co/gogamza/kobart-summarization/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingfa

In [73]:
text = "과거를 떠올려보자. 방송을 보던 우리의 모습을. 독보적인 매체는 TV였다. 온 가족이 둘러앉아 TV를 봤다. 간혹 가족들끼리 뉴스와 드라마, 예능 프로그램을 둘러싸고 리모컨 쟁탈전이 벌어지기도 했다. 각자 선호하는 프로그램을 '본방'으로 보기 위한 싸움이었다. TV가 한 대인지 두 대인지 여부도 그래서 중요했다. 지금은 어떤가. '안방극장'이라는 말은 옛말이 됐다. TV가 없는 집도 많다. 미디어의 혜택을 누릴 수 있는 방법은 늘어났다. 각자의 방에서 각자의 휴대폰으로, 노트북으로, 태블릿으로 콘텐츠를 즐긴다."
print(text)

과거를 떠올려보자. 방송을 보던 우리의 모습을. 독보적인 매체는 TV였다. 온 가족이 둘러앉아 TV를 봤다. 간혹 가족들끼리 뉴스와 드라마, 예능 프로그램을 둘러싸고 리모컨 쟁탈전이 벌어지기도 했다. 각자 선호하는 프로그램을 '본방'으로 보기 위한 싸움이었다. TV가 한 대인지 두 대인지 여부도 그래서 중요했다. 지금은 어떤가. '안방극장'이라는 말은 옛말이 됐다. TV가 없는 집도 많다. 미디어의 혜택을 누릴 수 있는 방법은 늘어났다. 각자의 방에서 각자의 휴대폰으로, 노트북으로, 태블릿으로 콘텐츠를 즐긴다.


In [74]:
raw_input_ids = tokenizer.encode(text)
input_ids = [tokenizer.bos_token_id] + raw_input_ids + [tokenizer.eos_token_id]

In [75]:
summary_ids = model.generate(torch.tensor([input_ids]))

In [76]:
tokenizer.decode(summary_ids.squeeze().tolist(), skip_special_tokens=True)

'TV가 없는 집도 많고, TV가 없는 집도 많아진 만큼 미디어의 혜택을'

In [77]:
summary_ids

tensor([[    2, 16132,  8981, 14426, 14230,  9866, 14178, 14161, 16132,  8981,
         14426, 14230,  9866, 16664, 12335, 14933, 17166, 12024, 18477,     2]])

In [78]:
input_ids

[0,
 15320,
 10443,
 17697,
 10313,
 10884,
 12060,
 245,
 21801,
 14046,
 9810,
 16433,
 15266,
 245,
 14373,
 10884,
 14134,
 14174,
 18610,
 16132,
 20029,
 14488,
 22795,
 16215,
 11700,
 11696,
 16132,
 10443,
 16786,
 14130,
 14313,
 13700,
 14978,
 9993,
 19769,
 15126,
 11863,
 15891,
 243,
 19517,
 17624,
 25749,
 14420,
 10607,
 12924,
 18787,
 13128,
 16984,
 16557,
 26499,
 19754,
 24252,
 26046,
 17624,
 14063,
 10888,
 10788,
 17075,
 17238,
 14353,
 20155,
 19249,
 16132,
 8981,
 14036,
 14029,
 15539,
 14196,
 14029,
 15539,
 20771,
 9866,
 14955,
 14610,
 15615,
 17444,
 14593,
 8981,
 245,
 26907,
 10788,
 29540,
 17164,
 18220,
 16565,
 25649,
 15097,
 14130,
 16132,
 8981,
 14426,
 14230,
 9866,
 14178,
 14130,
 17166,
 12024,
 18477,
 25689,
 14032,
 14082,
 20913,
 29553,
 14130,
 26667,
 14110,
 14030,
 26667,
 19594,
 16077,
 27476,
 16077,
 21980,
 14027,
 23786,
 14999,
 9267,
 14130,
 1]