In [1]:
import torch
from transformers import BertTokenizer, BertModel

In [2]:
import matplotlib.pyplot as plt

In [3]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# WordPiece tokenization이 어떤식으로 진행되는지 확인해 보기
text = "These days word embeddings are important."
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

['[CLS]', 'these', 'days', 'word', 'em', '##bed', '##ding', '##s', 'are', 'important', '.', '[SEP]']


In [5]:
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for token, index in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token, index))

[CLS]           101
these         2,122
days          2,420
word          2,773
em            7,861
##bed         8,270
##ding        4,667
##s           2,015
are           2,024
important     2,590
.             1,012
[SEP]           102


In [6]:
print(list(tokenizer.vocab.keys())[2000:2010])

['to', 'was', 'he', 'is', 'as', 'for', 'on', 'with', 'that', 'it']


In [7]:
len(tokenizer.vocab.keys())

30522

In [4]:
# Define a new example sentence with multiple meanings of the word "bank"
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."

# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

In [5]:
# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

['[CLS]', 'after', 'stealing', 'money', 'from', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '.', '[SEP]']


In [6]:
len(tokenized_text)

22

In [7]:
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for token, index in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token, index))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [8]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [10]:
segments_tensors

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [11]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
outputs = model(tokens_tensor, segments_tensors)
# See https://huggingface.co/transformers/model_doc/bert.html#bertmodel to know what results have been returned

In [14]:
hidden_states = outputs[2]

In [15]:
len(hidden_states)

13

In [16]:
hidden_states[0].shape # Embedding layer

torch.Size([1, 22, 768])

In [21]:
hidden_states[-1].shape # 12th encoder block (the last one)

torch.Size([1, 22, 768])

In [22]:
hidden_states[-1][0][1] # 마지막 encoder block에서 출력하는 'after' 토큰에 대한 hidden state 정보

tensor([-1.3227e-01, -2.7622e-01, -3.4954e-01,  4.0302e-01,  6.2672e-01,
         9.2857e-01,  3.5427e-01,  4.1879e-01,  9.3065e-02, -9.8462e-01,
         4.7324e-01, -2.9774e-01, -1.2410e-01,  2.7019e-01,  1.5415e-01,
        -2.6653e-01,  3.5623e-01,  4.4153e-01, -1.3475e-01, -5.7552e-01,
         1.0842e-01, -7.2393e-01, -4.7583e-01,  3.3128e-01,  1.7051e-01,
         7.4757e-02, -1.2539e-01, -4.8409e-01, -3.3479e-01,  3.1896e-01,
         1.6756e-01, -1.7411e-01,  1.8103e-01, -4.9209e-01, -5.0967e-01,
        -2.2739e-01,  8.4164e-02, -4.9849e-03, -1.1094e-01,  8.3904e-01,
        -4.3651e-01,  1.5201e-01, -1.6974e-01,  5.5162e-01,  6.7165e-01,
        -4.4596e-01,  2.7928e-01, -1.8881e-01,  2.3252e-01, -5.3890e-01,
        -5.1097e-01,  2.4364e-03, -2.9613e-01,  3.1727e-01, -6.3845e-02,
         4.9379e-01, -2.2104e-01,  1.0200e-01,  7.0416e-01, -3.9356e-01,
        -1.3739e-01,  2.2453e-01,  4.6471e-01, -4.9932e-01,  7.5563e-02,
        -5.1134e-02, -6.7853e-02, -2.0015e-01,  2.8

In [17]:
total_encoder_outputs = torch.stack(hidden_states, dim=0)

total_encoder_outputs .size()

torch.Size([13, 1, 22, 768])

In [18]:
# Remove dimension 1, the "batches".
total_encoder_outputs = torch.squeeze(total_encoder_outputs, dim=1)

total_encoder_outputs.size()

torch.Size([13, 22, 768])

In [19]:
# Swap dimensions 0 and 1.
total_encoder_outputs = total_encoder_outputs.permute(1,0,2)

total_encoder_outputs.size()

torch.Size([22, 13, 768])

In [20]:
total_encoder_outputs[6][0]-total_encoder_outputs[10][0]

tensor([ 2.7489e-01, -2.5283e-01, -1.5946e-01,  2.2309e-02,  1.3923e-01,
         1.0140e-01, -1.7607e-01,  3.6460e-01, -2.9223e-01, -3.4522e-01,
        -1.1963e-01,  7.3653e-02, -6.8991e-02,  4.0090e-02, -2.6117e-01,
         2.3941e-01, -5.0902e-02,  4.7517e-02,  3.2660e-01,  1.2581e-01,
         1.4941e-01,  1.3089e-01,  1.2608e-01,  1.4950e-01,  1.0367e-01,
         1.1399e-01, -3.4984e-02,  1.4137e-02, -4.6549e-01,  1.2883e-01,
         9.0494e-02, -3.2566e-01,  2.6344e-01, -3.4416e-01, -1.8339e-01,
        -4.8529e-02, -1.2271e-01,  5.2017e-02, -2.1477e-01,  7.3059e-02,
         7.7150e-02,  8.1869e-02, -2.9393e-01, -1.7131e-01,  2.2285e-01,
        -2.1327e-01,  1.0837e-01,  9.2270e-02,  1.4877e-01, -9.0220e-02,
         2.4494e-01, -1.1656e-01, -1.7681e-01, -6.9747e-01, -2.7377e-02,
        -3.7989e-01,  3.8025e-01,  1.6087e-01,  2.8276e-01, -1.3439e-01,
         5.0344e-02, -3.3081e-02,  1.4906e-01, -2.6149e-01,  1.0969e-02,
        -5.0950e-02, -5.7089e-02, -7.5764e-02,  5.2

In [26]:
len(total_encoder_outputs[1][-4])

768

In [27]:
after_last4_vector = torch.cat((total_encoder_outputs[1][-4], total_encoder_outputs[1][-3], total_encoder_outputs[1][-2], total_encoder_outputs[1][-1]), dim=0)

In [28]:
len(after_last4_vector)

3072

In [29]:
after_sum_last4_vector = torch.sum(total_encoder_outputs[1][-4:], dim=0)

In [30]:
len(after_sum_last4_vector)

768

In [31]:
after_mean_last4_vector  = torch.mean(total_encoder_outputs.permute(1,0,2)[11], dim=0) # 평균값 사용하기

In [44]:
len(after_mean_last4_vector)

768

In [32]:
cls_vector = total_encoder_outputs[0][-1] #첫번째 토큰, 즉 [CLS]에 대한 마지막 encoder의 hidden state

In [33]:
len(cls_vector)

768

In [38]:
bank1_vector = torch.sum(total_encoder_outputs[6][-4:], dim=0)
bank2_vector = torch.sum(total_encoder_outputs[10][-4:], dim=0)
bank3_vector = torch.sum(total_encoder_outputs[19][-4:], dim=0)

In [39]:
type(bank1_vector)

torch.Tensor

In [40]:
bank1_vector.shape

torch.Size([768])

In [41]:
bank1_vector

tensor([ 3.3596e+00, -2.9805e+00, -1.5421e+00,  7.0651e-01,  2.0031e+00,
         6.3182e-01, -2.9078e+00,  1.6307e+00, -1.0581e+00, -2.4467e+00,
         1.5198e-01, -1.8649e+00, -8.6627e-01,  1.5591e+00, -4.4090e+00,
         1.9979e-01, -3.9120e-01,  4.6780e+00,  4.0063e+00,  1.9281e+00,
        -2.6839e+00, -4.5731e-01,  2.9573e+00,  2.0648e+00,  7.1509e-01,
         5.0527e-01,  2.9728e+00,  2.0761e+00,  3.0218e-01,  5.9447e-01,
         3.2789e+00,  1.3924e+00,  1.7653e+00,  3.6563e-01,  4.8986e-01,
        -5.8335e-01, -2.1327e+00,  1.0474e+00, -2.1078e+00,  1.8824e+00,
        -5.2616e-01, -1.4409e+00, -1.2064e-02,  1.6986e+00, -1.1851e-01,
         1.4369e+00, -1.5800e+00,  2.0784e-01, -4.0648e+00, -2.7408e-01,
        -1.2670e+00,  2.7449e+00, -5.6628e+00, -3.8262e+00, -4.5445e-01,
         2.0363e+00, -3.3607e+00, -3.4739e+00,  4.6744e-01, -1.8318e+00,
         2.7669e+00,  1.1167e+00,  3.0754e+00, -4.9868e+00, -1.8789e+00,
        -3.2385e+00,  2.3378e+00,  7.9255e-01, -1.7

In [42]:
bank1_vector = bank1_vector.detach().numpy()
bank2_vector = bank2_vector.detach().numpy()
bank3_vector = bank3_vector.detach().numpy()

In [43]:
bank1_vector

array([ 3.35960817e+00, -2.98053288e+00, -1.54206288e+00,  7.06507444e-01,
        2.00309658e+00,  6.31816864e-01, -2.90779924e+00,  1.63068748e+00,
       -1.05810475e+00, -2.44669199e+00,  1.51981100e-01, -1.86486936e+00,
       -8.66265416e-01,  1.55907416e+00, -4.40901184e+00,  1.99794710e-01,
       -3.91201079e-01,  4.67799616e+00,  4.00633097e+00,  1.92805016e+00,
       -2.68391442e+00, -4.57308561e-01,  2.95727468e+00,  2.06476378e+00,
        7.15093851e-01,  5.05269110e-01,  2.97283244e+00,  2.07611656e+00,
        3.02179068e-01,  5.94466925e-01,  3.27891254e+00,  1.39237666e+00,
        1.76532102e+00,  3.65631282e-01,  4.89861846e-01, -5.83354831e-01,
       -2.13272190e+00,  1.04740405e+00, -2.10780120e+00,  1.88239551e+00,
       -5.26160479e-01, -1.44091487e+00, -1.20639503e-02,  1.69860947e+00,
       -1.18508838e-01,  1.43692791e+00, -1.58004260e+00,  2.07835436e-01,
       -4.06477118e+00, -2.74076313e-01, -1.26704466e+00,  2.74485707e+00,
       -5.66275406e+00, -

In [44]:
import numpy as np

In [45]:
np.dot(bank1_vector, bank2_vector)/(np.linalg.norm(bank1_vector)*np.linalg.norm(bank2_vector))

0.9386392

In [46]:
np.dot(bank1_vector, bank3_vector)/(np.linalg.norm(bank1_vector)*np.linalg.norm(bank3_vector))

0.69579333

In [47]:
np.dot(bank3_vector, bank2_vector)/(np.linalg.norm(bank3_vector)*np.linalg.norm(bank2_vector))

0.69323605