In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 24.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

```
from transformers import TFAutoModel, AutoTokenizer
model = TFAutoModel.from_pretrained("<model_name>")
tokenizer = AutoTokenizer.from_pretrained("<model_name>")
```

## Tokenizer 실습

In [2]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer

In [75]:
# Checkpoint identifier reused by the tokenizer, the encoder and the fill-mask pipeline.
model_name = "bert-base-multilingual-cased"

# Load the matching tokenizer and the encoder for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [76]:
# Number of entries in the tokenizer's vocabulary.
print(tokenizer.vocab_size)

119547


In [77]:
# Peek at the first 22 keys of the vocabulary mapping to get a feel for its contents.
for token in list(tokenizer.get_vocab())[:22]:
    print(token)

vary
sebelumnya
protagonist
Dancing
nüfusu
ספרי
поверхні
состояние
இன்
novembril
satu
kỳ
##聞
##鲑
географин
##rmática
jurul
dog
##stin
төшә
׃
稀


In [78]:
# Sample Korean sentence reused by the tokenizer examples below.
text = "이순신은 조선 중기의 무신이다."

In [79]:
# Show which concrete tokenizer class AutoTokenizer resolved to.
print(type(tokenizer))

<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


In [80]:
# Encode the sentence in one call; return_tensors="pt" yields PyTorch tensors.
tokenized_input_text = tokenizer(text, return_tensors="pt")

In [81]:
# List every field of the encoding together with its tensor.
for key, value in tokenized_input_text.items():
    print("{} : \n\t{}".format(key,value))

input_ids : 
	tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
token_type_ids : 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask : 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [82]:
# The same field can be read by key or by attribute; both lines print the same tensor.
print(tokenized_input_text['input_ids']) # vocabulary ids of the tokens
print(tokenized_input_text.input_ids)

tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])


In [83]:
# Segment ids: all 0 here because only one sentence was encoded.
print(tokenized_input_text['token_type_ids']) # segment ids
print(tokenized_input_text.token_type_ids)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [84]:
# Attention mask: this is not a special-token flag. Every position of this unpadded
# input is 1, including the [CLS] and [SEP] positions.
print(tokenized_input_text['attention_mask']) # attention mask
print(tokenized_input_text.attention_mask)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [85]:
# Split the sentence into word pieces (strings, not ids).
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text) # a leading "##" marks a piece that attaches to the preceding token

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.']


In [86]:
# A single word with no spaces is still split into several "##"-continued pieces.
print(tokenizer.tokenize("요리보고"))

['요', '##리', '##보', '##고']


In [87]:
# Convert the sentence straight to a list of vocabulary ids, wrapped in [CLS] ... [SEP].
input_ids = tokenizer.encode(text, add_special_tokens=True)
print(input_ids)

[101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]


In [88]:
# Map the ids back to text; the special tokens remain visible in the decoded string.
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

[CLS] 이순신은 조선 중기의 무신이다. [SEP]


## Tokenizer options: truncation and padding

In [89]:
# Truncate to the first five word pieces, with no [CLS]/[SEP] inserted.
tokenized_text = tokenizer.tokenize(text, add_special_tokens=False, max_length=5, truncation=True)
print(tokenized_text)

['이', '##순', '##신', '##은', '조선']


In [90]:
# Same truncation as above, but returning vocabulary ids instead of token strings.
input_ids = tokenizer.encode(text, add_special_tokens=False, max_length=5, truncation=True)
print(input_ids)

[9638, 119064, 25387, 10892, 59906]


In [91]:
# Decode the truncated ids back to text.
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

이순신은 조선


In [92]:
# The string the tokenizer uses for padding.
print(tokenizer.pad_token)

[PAD]


In [93]:
# Vocabulary id of the padding token.
tokenizer.pad_token_id

0

In [94]:
# Pad the word pieces with [PAD] up to a length of 15; no special tokens are inserted.
tokenized_text = tokenizer.tokenize(text, add_special_tokens=False, max_length=15, padding="max_length")
print(tokenized_text)

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [95]:
# Pad the id sequence to length 15, this time keeping the [CLS]/[SEP] special tokens.
input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=15, padding="max_length")
print(input_ids)

[101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102, 0, 0]


In [96]:
# Decode the padded ids; the padding shows up as trailing [PAD] tokens.
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

[CLS] 이순신은 조선 중기의 무신이다. [SEP] [PAD] [PAD]


## 새로운 토큰을 추가해보자!

In [97]:
# Sentence containing words the current vocabulary cannot represent (they tokenize to [UNK] below).
text = "깟빼뜨랑 리뿔이 뜨럽거 므리커럭이 케쇼쇼쇽 나오애쇼쇼쇼 우뤼갸갸갸 청쇼랴료다혀뚜요"

In [98]:
# Tokenize before adding any new tokens, to see which words fall back to [UNK].
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['[UNK]', '리', '##뿔', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '[UNK]', '나', '##오', '##애', '##쇼', '##쇼', '##쇼', '[UNK]', '청', '##쇼', '##랴', '##료', '##다', '##혀', '##뚜', '##요']


In [99]:
# Encode the same sentence to ids; the [UNK] words all share id 100.
input_ids = tokenizer.encode(text)
print(input_ids)

[101, 100, 9238, 119021, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 100, 8982, 28188, 119121, 119060, 119060, 119060, 100, 9751, 119060, 118862, 38688, 11903, 80579, 118841, 48549, 102]


In [100]:
# Decoding cannot recover the original words that were mapped to [UNK].
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

[CLS] [UNK] 리뿔이 뜨럽거 므리커럭이 [UNK] 나오애쇼쇼쇼 [UNK] 청쇼랴료다혀뚜요 [SEP]


In [101]:
# Register three new whole-word tokens; the return value is the number of tokens added.
added_token_num = tokenizer.add_tokens(["깟빼뜨랑","케쇼쇼쇽","우뤼갸갸갸"])
print(added_token_num)

3


In [102]:
# Tokenize again: the newly added words now come out as single tokens instead of [UNK].
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['깟빼뜨랑', '리', '##뿔', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '케쇼쇼쇽', '나', '##오', '##애', '##쇼', '##쇼', '##쇼', '우뤼갸갸갸', '청', '##쇼', '##랴', '##료', '##다', '##혀', '##뚜', '##요']


In [103]:
# Sentence wrapped in custom bracketed markers that are not (yet) known to the tokenizer.
text = "[mujun]이순신은 조선 중기의 무신이다.[/mujun]"

# Without registration the markers are broken into pieces such as '[', 'mu', '##jun', ']'.
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

input_ids = tokenizer.encode(text)
print(input_ids)

# The decoded string therefore shows the markers split apart by spaces.
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['[', 'mu', '##jun', ']', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[', '/', 'mu', '##jun', ']']
[101, 164, 12361, 46329, 166, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 164, 120, 12361, 46329, 166, 102]
[CLS] [ mujun ] 이순신은 조선 중기의 무신이다. [ / mujun ] [SEP]


In [104]:
# Register the two markers as additional special tokens and add the returned count
# to the running total of tokens added so far.
added_token_num += tokenizer.add_special_tokens({"additional_special_tokens" : ["[mujun]","[/mujun]"]})

# Each marker is now kept intact as a single token.
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# The markers receive new ids beyond the original vocabulary range.
input_ids = tokenizer.encode(text)
print(input_ids)

decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['[mujun]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[/mujun]']
[101, 119550, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 119551, 102]
[CLS] [mujun] 이순신은 조선 중기의 무신이다. [/mujun] [SEP]


In [109]:
# One sentence: encoded as a single segment.
single_seg_input = tokenizer("이순신은 조선 중기의 무신이다.")

# Two sentences passed as a pair: encoded as one sequence with two segments.
multi_seg_input = tokenizer("이순신은 조선 중기의 무신이다.", "그는 임진왜란을 승리로 이끌었다.")

In [110]:
print("Single segment token(str) : {}".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))
print("Single segment token(int) : {}".format(single_seg_input['input_ids']))
print("Single segment type : {}".format(single_seg_input['token_type_ids']))

# Segments
print()
print("Single segment token(str) : {}".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))
print("Single segment token(int) : {}".format(multi_seg_input['input_ids']))
print("Single segment type : {}".format(multi_seg_input['token_type_ids']))

Single segment token(str) : ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]']
Single segment token(int) : [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]
Single segment type : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Single segment token(str) : ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '.', '[SEP]']
Single segment token(int) : [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 119, 102]
Single segment type : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Bert 모델 테스트

In [111]:
# Replace one word with [MASK]; the tokenizer keeps [MASK] as a single token.
text = "이순신은 [MASK] 중기의 무신이다."
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['이', '##순', '##신', '##은', '[MASK]', '중', '##기의', '무', '##신', '##이다', '.']


In [112]:
from transformers import pipeline

# Build a fill-mask pipeline from the checkpoint name (not from the `model` object above)
# and ask it for candidate fillers for the [MASK] position.
nlp_fill = pipeline('fill-mask', model = model_name)
nlp_fill(text)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.8747122883796692,
  'sequence': '이순신은 조선 중기의 무신이다.',
  'token': 59906,
  'token_str': '조선'},
 {'score': 0.0643644779920578,
  'sequence': '이순신은 청 중기의 무신이다.',
  'token': 9751,
  'token_str': '청'},
 {'score': 0.010954882018268108,
  'sequence': '이순신은 전 중기의 무신이다.',
  'token': 9665,
  'token_str': '전'},
 {'score': 0.004647170193493366,
  'sequence': '이순신은종 중기의 무신이다.',
  'token': 22200,
  'token_str': '##종'},
 {'score': 0.0036106768529862165,
  'sequence': '이순신은기 중기의 무신이다.',
  'token': 12310,
  'token_str': '##기'}]

In [114]:
# Try the same pipeline on a second masked sentence.
nlp_fill("독도는 [MASK]의 땅이다")

[{'score': 0.9185790419578552,
  'sequence': '독도는 독 의 땅이다',
  'token': 9088,
  'token_str': '독'},
 {'score': 0.006001222878694534,
  'sequence': '독도는 대한민국 의 땅이다',
  'token': 26168,
  'token_str': '대한민국'},
 {'score': 0.003627519588917494,
  'sequence': '독도는 섬 의 땅이다',
  'token': 9430,
  'token_str': '섬'},
 {'score': 0.0032675538677722216,
  'sequence': '독도는 일본 의 땅이다',
  'token': 23130,
  'token_str': '일본'},
 {'score': 0.0026966023724526167,
  'sequence': '독도는 자 의 땅이다',
  'token': 9651,
  'token_str': '자'}]

In [115]:
# Encode the masked sentence as PyTorch tensors.
tokens_pt = tokenizer(text, return_tensors="pt")

for key, value in tokens_pt.items():
    print("{} : \n\t{}".format(key, value))

# Run the encoder and keep both outputs it exposes.
outputs = model(**tokens_pt)
last_hidden_state = outputs.last_hidden_state  # one vector per token position
pooler_output = outputs.pooler_output  # one vector for the whole sequence

print("\nToken wise output : {}, Pooled output : {}".format(last_hidden_state.shape, pooler_output.shape))

input_ids : 
	tensor([[   101,   9638, 119064,  25387,  10892,    103,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
token_type_ids : 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask : 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

Token wise output : torch.Size([1, 13, 768]), Pooled output : torch.Size([1, 768])


In [116]:
# Inspect the token-wise hidden states from the forward pass above.
last_hidden_state

tensor([[[ 0.1990,  0.2312, -0.5632,  ...,  0.1284, -0.2287,  0.0718],
         [ 0.1618,  0.1146, -0.0999,  ...,  0.3117,  0.3869,  0.6463],
         [ 0.3697,  0.2132, -0.4358,  ...,  0.3572, -0.1115,  0.1626],
         ...,
         [ 0.1121,  0.1447, -0.1658,  ..., -0.0123, -0.0397,  0.1431],
         [ 0.3797,  0.0343, -0.2141,  ...,  0.7549, -0.8043,  0.1917],
         [ 0.3507,  0.1251, -0.2642,  ...,  0.0053, -0.3522,  0.3532]]],
       grad_fn=<NativeLayerNormBackward0>)

In [117]:
pooler_output # pooled output of the model

tensor([[ 4.0283e-01, -1.4001e-01, -8.4018e-02, -2.6051e-01,  2.6862e-02,
         -9.4075e-02, -1.3628e-02,  2.4797e-01, -3.0906e-01,  2.3419e-01,
          2.2539e-02,  1.1979e-01, -3.4908e-01, -1.8756e-01,  2.4908e-01,
          1.8911e-01,  3.4478e-01, -1.0767e-01, -2.8204e-01, -1.0324e-01,
         -9.9910e-01, -3.2951e-01, -1.3240e-02, -2.8301e-01, -1.5239e-01,
          1.8199e-01,  9.6231e-02,  3.9528e-01,  2.7834e-01,  9.4189e-03,
         -7.1159e-02, -9.9930e-01,  6.5399e-01,  3.8764e-01,  3.0071e-01,
         -3.6383e-01,  9.8969e-02,  3.0775e-01,  2.4909e-01, -4.0341e-01,
         -5.9351e-02, -6.5761e-02,  6.4013e-02, -2.1495e-01, -2.4099e-01,
         -3.3646e-01, -1.1709e-01,  4.1435e-01, -4.1856e-01, -2.2435e-02,
          2.0174e-01,  2.2109e-01,  4.1770e-01, -1.0774e-01,  1.5332e-01,
          3.7670e-01, -1.2100e-02, -2.4612e-01,  4.7919e-02, -2.9957e-01,
         -1.9029e-01,  1.4763e-01,  7.1556e-02, -1.0966e-01,  7.7137e-04,
         -2.8533e-01,  1.4320e-01, -2.

## vocab을 새롭게 추가했다면, 반드시 model의 embedding layer 사이즈를 늘려주세요!

In [118]:
# Show the input embedding table before and after resizing.
print(model.get_input_embeddings())
# len(tokenizer) counts the base vocabulary plus every added token, so the embedding
# table always matches the tokenizer without relying on a hand-maintained counter.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

Embedding(119547, 768, padding_idx=0)
Embedding(119552, 768)


## [CLS] 토큰을 활용해 문장의 유사도를 측정할 수 있다.

In [119]:
# Encode four sentences as PyTorch tensors for the similarity comparison below.
sent1 = tokenizer("오늘 하루 어떻게 보냈나요?", return_tensors="pt")
sent2 = tokenizer("오늘은 어떤 하루를 보내셨나요?", return_tensors="pt")
sent3 = tokenizer("이순신은 조선 중기의 무신이다.", return_tensors="pt")
sent4 = tokenizer("깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여", return_tensors="pt")

In [120]:
import torch

# Inference only: no_grad() avoids building an autograd graph for these forward passes,
# so the pooled vectors are plain tensors with no gradient history attached.
with torch.no_grad():
    # Pooled (sequence-level) vector for each of the four encoded sentences.
    outputs = model(**sent1)
    sent_1_pooler_output = outputs.pooler_output

    outputs = model(**sent2)
    sent_2_pooler_output = outputs.pooler_output

    outputs = model(**sent3)
    sent_3_pooler_output = outputs.pooler_output

    outputs = model(**sent4)
    sent_4_pooler_output = outputs.pooler_output

In [121]:
from torch import nn

# Cosine similarity along the feature dimension (dim=1) of the pooled vectors.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
# Sentences 1 and 2.
print(cos(sent_1_pooler_output, sent_2_pooler_output))

tensor([0.9757], grad_fn=<DivBackward0>)


In [122]:
# Sentences 2 and 3.
print(cos(sent_2_pooler_output, sent_3_pooler_output))

tensor([0.6075], grad_fn=<DivBackward0>)


In [124]:
# Sentences 3 and 4.
print(cos(sent_3_pooler_output, sent_4_pooler_output))

tensor([0.5997], grad_fn=<DivBackward0>)


In [125]:
# Sentences 1 and 4.
print(cos(sent_1_pooler_output, sent_4_pooler_output))

tensor([0.9258], grad_fn=<DivBackward0>)
