In [1]:
!pip install transformers --upgrade
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [18]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face token from the Colab secret named 'hf-api' and use it
# to authenticate, so the token never appears in the notebook source.
key = userdata.get('hf-api')
login(token=key)

In [19]:
from transformers import BertTokenizerFast, BertModel
import torch
from torch import nn


In [20]:
# Load the pretrained BERT tokenizer and tokenize a sample sentence that
# starts with the [CLS] marker.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(
    tokenizer.tokenize("[CLS] Hello world, how are you?")
)


['[CLS]', 'hello', 'world', ',', 'how', 'are', 'you', '?']


In [21]:
# "[newtoken]" is not registered with the tokenizer yet, so it is broken up
# into several pieces when tokenized.
print(
    tokenizer.tokenize("[newtoken] Hello world, how are you?")
)
# Register the new token; the value returned by add_tokens is shown as the
# cell result.
tokenizer.add_tokens(["[newtoken]"])

['[', 'newt', '##oke', '##n', ']', 'hello', 'world', ',', 'how', 'are', 'you', '?']


1

In [22]:
# Add the token and tokenize the sentence again; "[newtoken]" should now come
# back as a single token.
# NOTE(review): the previous cell already calls add_tokens with the same
# token, so this call is presumably redundant - confirm and drop one of them.
tokenizer.add_tokens(["[newtoken]"])
tokenizer.tokenize(
    "[newtoken] Hello world, how are you?"
)

['[newtoken]', 'hello', 'world', ',', 'how', 'are', 'you', '?']

In [23]:
# Inspect the token ids. add_special_tokens=False is passed so that the id of
# "[newtoken]" is the first entry of the sequence.
tokenized = tokenizer(
    "[newtoken] Hello world, how are you?",
    add_special_tokens=False,
    return_tensors="pt",
)
print(tokenized["input_ids"])

tensor([[30522,  7592,  2088,  1010,  2129,  2024,  2017,  1029]])


In [24]:
# Take the first id of the first sequence and decode it back to text to
# confirm which token it maps to.
tkn = tokenized["input_ids"][0][0]
print("First token:", tkn)
print("Decoded:", tokenizer.decode(tkn))

First token: tensor(30522)
Decoded: [newtoken]


In [25]:
# Load the pretrained BERT encoder and show its embedding sub-module; the
# printed word_embeddings entry reports how many token ids the model accepts.
model = BertModel.from_pretrained("bert-base-uncased")
print(model.embeddings)

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [26]:
# Run the model on ids that include the newly added token. The failure is the
# point of this cell, so the exception is caught and its message printed
# instead of aborting the notebook.
try:
    out = model(**tokenized)
    out.last_hidden_state
except Exception as err:
    print(err)

index out of range in self


In [27]:
# The embedding layer was never trained with the added token, hence the
# "out of range" error printed above.
# Check the size of BERT's token embedding table.
weights = model.embeddings.word_embeddings.weight.data
print(weights.size())


torch.Size([30522, 768])


In [28]:
# The embedding table has 30522 rows. Display the shape read from the actual
# weights rather than a hand-typed torch.Size literal, so this cell reflects
# the real state of the model.
weights.shape

torch.Size([30522, 768])

In [29]:
# Extend the embedding table by one row for the added token, initialised with
# a copy of the [CLS] embedding.
# The [CLS] id is looked up from the tokenizer instead of hard-coding 101.
# (transformers also offers model.resize_token_embeddings(len(tokenizer)) for
# this; the manual version is kept here to show what happens underneath.)
cls_id = tokenizer.cls_token_id
new_weights = torch.cat((weights, weights[cls_id:cls_id + 1]), 0)

# Sanity check: the table must now have one row per id the tokenizer can emit,
# including the added token.
assert new_weights.shape[0] == len(tokenizer), (new_weights.shape, len(tokenizer))

# padding_idx=0 mirrors the original word_embeddings layer; freeze=False keeps
# the weights trainable.
new_emb = nn.Embedding.from_pretrained(new_weights, padding_idx=0, freeze=False)
print(new_emb)

# The number of embeddings has grown to 30523:
# Embedding(30523, 768, padding_idx=0)

Embedding(30523, 768, padding_idx=0)


In [30]:
# Replace the model's word-embedding layer with the enlarged table. This swaps
# the existing layer in place; nothing is appended to the end of the model.
model.embeddings.word_embeddings = new_emb
# Keep the config in step with the new table size, otherwise a later
# save/reload would be driven by the stale vocab_size.
model.config.vocab_size = new_emb.num_embeddings
print(model.embeddings)

# As a result, word_embeddings in the embedding module shows the updated size.

BertEmbeddings(
  (word_embeddings): Embedding(30523, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [31]:
# Feed the same token ids as before. With the enlarged embedding table in
# place, the forward pass should now succeed and print the hidden states.
out = model(**tokenized)
print(out.last_hidden_state)

tensor([[[-0.2619,  0.2362, -0.0301,  ..., -0.5495,  0.4979,  0.9969],
         [-0.4189,  0.4847,  0.4206,  ..., -0.5477,  0.5529,  0.9332],
         [-0.2770,  0.3686,  0.3143,  ..., -0.7996,  0.6731,  1.0273],
         ...,
         [-0.1673,  0.0158,  0.3666,  ..., -0.7413,  0.9387,  0.8772],
         [-0.2029, -0.1712,  0.3230,  ..., -0.5681,  0.8944,  0.9140],
         [-0.0656, -0.0936,  0.1861,  ..., -0.7238,  0.7219,  1.1424]]],
       grad_fn=<NativeLayerNormBackward0>)


In [32]:
# Run the same sentence, with the real [CLS] token in place of "[newtoken]",
# through a freshly loaded model. The added embedding row is a copy of the
# [CLS] row, so the hidden states are expected to match `out`.
# The fresh model gets its own name so that the extended `model` is not
# overwritten and the earlier cells stay re-runnable.
baseline_model = BertModel.from_pretrained('bert-base-uncased')
out2 = baseline_model(
    **tokenizer("[CLS] Hello world, how are you?", add_special_tokens=False, return_tensors="pt")
)

In [33]:
# Compare both hidden-state tensors element-wise and reduce the result to a
# single boolean tensor that is True only if every element matches.
out3 = (out.last_hidden_state == out2.last_hidden_state).all()
print(out3)

tensor(True)
