In [1]:
# https://github.com/google-research/bert

In [2]:
# Special tokens
# [CLS]: 문장의 시작
# [SEP]: 문장의 끝
# [PAD]: 문장의 여백 (문장의 길이가 다양하기 때문에 max_seq_length로 맞춰주기 위한 작업)

# BERT 논문을 읽고 실습하기

In [3]:
''' 1. import Module '''
from transformers import BertModel, BertTokenizer
import torch

In [19]:
# Load the pretrained BERT encoder and the tokenizer that belongs to the same checkpoint.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)  # tokenizer that was used to train bert-base-uncased
model = BertModel.from_pretrained(pretrained_name)  # the bert-base-uncased model itself

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# 2. Preprocessing
sentence = 'I love Paris'
# Split the raw sentence into tokens; the recorded output shows they come back lower-cased.
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', 'love', 'paris']


In [7]:
# Surround the sentence with BERT's boundary markers:
# [CLS] marks the start of the sequence and [SEP] marks its end.
tokens = ['[CLS]', *tokens, '[SEP]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]']

In [8]:
# Suppose the maximum sequence length is 7: pad the token list up to that length.
# The number of [PAD] tokens is derived from the target length instead of being
# hard-coded, so re-running this cell does not keep appending padding.
# A sequence that is already at (or beyond) max_seq_length is left unchanged;
# no truncation is performed here.
max_seq_length = 7
tokens = tokens + ['[PAD]'] * (max_seq_length - len(tokens))
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']

In [9]:
# Build the attention mask: 1 for every real token, 0 for [PAD] positions,
# so that padding is masked out.
attention_mask = [int(token != '[PAD]') for token in tokens]
attention_mask

[1, 1, 1, 1, 1, 0, 0]

In [11]:
# Convert the token strings into their integer IDs from the tokenizer's vocabulary.
# In the recorded output below, [CLS] maps to 101, [SEP] to 102 and [PAD] to 0.
token_ids = tokenizer.convert_tokens_to_ids(tokens) # each token is mapped to its unique token ID
token_ids

[101, 1045, 2293, 3000, 102, 0, 0]

In [12]:
# Turn the Python lists into tensors and add a leading dimension of size 1
# (1D -> 2D), since the model is called on 2D inputs further down.
token_ids = torch.unsqueeze(torch.tensor(token_ids), dim=0)
attention_mask = torch.unsqueeze(torch.tensor(attention_mask), dim=0)

In [23]:
# 3. Extract embeddings
# return_dict=False makes the model return a plain tuple, which can be unpacked
# directly into (hidden_rep, cls_head).
# Recent transformers releases (4.x) default to return_dict=True; unpacking that
# dict-like output would yield its string keys instead of tensors, and .shape
# would then fail.
# torch.no_grad() turns off autograd bookkeeping: the model is used purely as a
# feature extractor here, so no gradients are needed.
with torch.no_grad():
    hidden_rep, cls_head = model(token_ids, attention_mask=attention_mask, return_dict=False)

In [24]:
# hidden_rep holds the embedding of every token in the input.
# cls_head is the pooled representation associated with the [CLS] token, which
# stands for the sentence as a whole.
print(hidden_rep.shape, cls_head.shape, sep='\n')

torch.Size([1, 7, 768])
torch.Size([1, 768])


In [25]:
# 3-x. What if we want the embedding vectors from every encoder layer?
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
# output_hidden_states=True is required to get the per-layer hidden states back.
model = BertModel.from_pretrained(pretrained_name, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
def encode_sentence(sentence, tokenizer, max_seq_length=7):
    """Tokenize, wrap, pad and tensorize a single sentence for BERT.

    Performs the same steps as the individual preprocessing cells above, in
    one place, so the pipeline is not copy-pasted.

    Args:
        sentence: Raw input text.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``
            (here, the BertTokenizer loaded in this notebook).
        max_seq_length: Target length after adding [CLS]/[SEP] and padding.
            Sequences already at or beyond this length are left unpadded;
            no truncation is performed.

    Returns:
        A ``(tokens, token_ids, attention_mask)`` tuple: the padded token
        list, and the token IDs and attention mask as tensors with a leading
        dimension of size 1.
    """
    tokens = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']
    # Derive the pad count from the target length instead of hard-coding it.
    tokens = tokens + ['[PAD]'] * (max_seq_length - len(tokens))
    # 1 for real tokens, 0 for padding positions.
    attention_mask = [int(token != '[PAD]') for token in tokens]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # unsqueeze(0): 1D -> 2D.
    return (
        tokens,
        torch.tensor(token_ids).unsqueeze(0),
        torch.tensor(attention_mask).unsqueeze(0),
    )


sentence = 'I love Paris'
tokens, token_ids, attention_mask = encode_sentence(sentence, tokenizer)

In [27]:
# The model was loaded with output_hidden_states=True, so the tuple output has a
# third element carrying the hidden states of every layer.
# torch.no_grad() turns off autograd bookkeeping, which is unnecessary when the
# model is only used to extract embeddings.
with torch.no_grad():
    last_hidden_state, pooler_output, hidden_states = model(
        token_ids, attention_mask=attention_mask, return_dict=False
    )

In [31]:
# Printed one per line, in this order:
#   1. shape of the last hidden state
#   2. shape of the pooled output
#   3. number of entries in hidden_states (a tuple): the input embedding layer h0
#      plus every encoder layer (N=12), i.e. 13 in total
#   4. shape of one entry: [batch_size, max_seq_length, hidden_size] -> torch.Size([1, 7, 768])
print(
    last_hidden_state.shape,
    pooler_output.shape,
    len(hidden_states),
    hidden_states[0].shape,
    sep='\n',
)

torch.Size([1, 7, 768])
torch.Size([1, 768])
13
torch.Size([1, 7, 768])
