In [1]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM , BertForNextSentencePrediction
from pytorch_pretrained_bert import BertConfig
# config = BertConfig(max_position_embeddings=512)

# bert_config = BertConfig(vocab_size_or_config_json_file=30522,
#                          type_vocab_size=2,
#                          num_labels=1000,
#                          hidden_size=128,
#                          num_hidden_layers=2,
#                          num_attention_heads=8,
#                          intermediate_size=256,
#                          hidden_dropout_prob=0.01,
#                          max_position_embeddings=1000,
#                          attention_probs_dropout_prob=0.01
#                          )


# Embedding/Bert

In [2]:
def get_bert_embed_matrix():
    bert = BertModel.from_pretrained('bert-base-uncased')
    bert_embeddings = list(bert.children())[0]
    bert_word_embeddings = list(bert_embeddings.children())[0]
    mat = bert_word_embeddings.weight.data.numpy()
    return mat

embedding_matrix = get_bert_embed_matrix() # Bert word embedding weights
embedding_matrix.shape

(30522, 768)

# tokenizer 裡頭的字典資訊

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
vocab = tokenizer.vocab # word_to_id
print("字典大小：", len(vocab),'june' in vocab)

vocab['june'] # word2id

tokenizer.max_len

字典大小： 30522 True


512

In [4]:
# help(vocab)
list(vocab.items())[1012][0]
# list(vocab.items())

vocab_dict = {}
for v , k in vocab.items():
    vocab_dict[k] = v
    
vocab_dict[1012]   

'.'

In [5]:
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
for t, id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, id))

token               index          
-------------------------
unusually           12890
pope                 4831
福                    1926
pairing             22778
∅                    1593
tang                 9745
##pic               24330
goodbye              9119
appears              3544
##loh               24729


# BertModel embedding

In [20]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
# BERT
model = BertModel.from_pretrained('bert-base-uncased',max_position_embeddings = 1024)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 3)

model # 主任務模型
'''
Bert的詞向量主要是由三個向量相加組合而成，
1.分別是單詞本身的向量，
2.單詞所在句子中位置的向量
3.句子所在單個訓練文本中位置的向量。
'''
model.embeddings # 由主任務模型接出的 bert embeddings


TypeError: __init__() got an unexpected keyword argument 'max_position_embeddings'

In [19]:
help(BertModel)

Help on BertModel in module pytorch_pretrained_bert.modeling object:

class BertModel(BertPreTrainedModel)
 |  BERT model ("Bidirectional Embedding Representations from a Transformer").
 |  
 |  Params:
 |      config: a BertConfig class instance with the configuration to build a new model
 |  
 |  Inputs:
 |      `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 |          with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 |          `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 |      `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 |          types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 |          a `sentence B` token (see BERT paper for more details).
 |      `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 |          selected in [0, 1]. It's 

# Bert 句子斷詞

In [7]:
# 有 ## 前綴的 tokens 即為 wordpieces。
# 以詞彙 fragment 來說，其可以被拆成 frag 與 ##ment 兩個 pieces，
# 而一個 word 也可以獨自形成一個 wordpiece。wordpieces 可以由蒐集大量文本並找出其中常見的 pattern 取得

In [8]:
# 除了一般的 wordpieces 以外，BERT 裡頭有 5 個特殊 tokens 各司其職：

# [CLS]：在做分類任務時其最後一層的 repr. 會被視為整個輸入序列的 repr.
# [SEP]：有兩個句子的文本會被串接成一個輸入序列，並在兩句之間插入這個 token 以做區隔
# [UNK]：沒出現在 BERT 字典裡頭的字會被這個 token 取代
# [PAD]：zero padding 遮罩，將長度不一的輸入序列補齊方便做 batch 運算
# [MASK]：未知遮罩，僅在預訓練階段會用到

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)

text = 'Terrific Book for Learning the Art of Crochet'
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)
print(tokens[:10], '...')
print(ids[:10], '...')

Terrific Book for Learning the Art of Crochet
['terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t'] ...
[27547, 2338, 2005, 4083, 1996, 2396, 1997, 13675, 23555, 2102] ...


# create an instance of BertModel initialized with pre-trained weights

In [10]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
# returns the base model (the one with 12 layers) pre-trained on uncased sequences
BertModel = BertModel.from_pretrained('bert-base-uncased') 

# 1.Tokenize the sequence:

In [11]:
sentence = 'Terrific Book for Learning the Art of Crochet'
tokens = tokenizer.tokenize(sentence)
print(tokens)

['terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t']


# 2. Add [CLS] and [SEP] tokens:

In [12]:
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)

['[CLS]', 'terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t', '[SEP]']


# 3. Padding the input:

In [13]:
T = 15 # maximum length
padded_tokens = tokens + ['[PAD]' for _ in range(T - len(tokens))]
print(padded_tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print(attn_mask)
# Out: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]

['[CLS]', 'terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]


# 4. Maintain a list of segment tokens:

In [14]:
seg_ids = [0 for _ in range(len(padded_tokens))] #Since we only have a single sequence as input
seg_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# 5. Obtaining indices of the tokens in BERT’s vocabulary:

In [15]:
# Obtaining indices for each token
sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print(sent_ids)

[101, 27547, 2338, 2005, 4083, 1996, 2396, 1997, 13675, 23555, 2102, 102, 0, 0, 0]


# Putting all these steps together:

In [16]:
# Indices of input sequence tokens in the vocabulary. To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:

# For sequence pairs:
# tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]

# For single sequences:
# tokens:         [CLS] the dog is hairy . [SEP]

In [17]:
# from transformers import PreTrainedTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)

SENTENCE_A = "Perfect for beginners and already knowledgeable crocheters alike!"
# SENTENCE_A = SENTENCE_A.split(" ")

In [18]:
import torch

sentence = 'Terrific Book for Learning the Art of Crochet .'
#Step 1: Tokenize
tokens = tokenizer.tokenize(sentence)
# print('tokens:',tokens)
#Step 2: Add [CLS] and [SEP]
# [CLS]：在做任務時其最後一層的 repr. 會被視為整個輸入序列的 repr.
# [SEP]：有兩個句子的文本會被串接成一個輸入序列，並在兩句之間插入這個 token 以做區隔
tokens = ['[CLS]'] + tokens + ['[SEP]'] + tokens + ['[SEP]']
#Step 3: Pad tokens
T = 20 # maximum length
padded_tokens = tokens + ['[PAD]' for _ in range(T - len(tokens))]
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print('padded_tokens:',padded_tokens)
#Step 4: Segment ids
# seg_ids = [0 for _ in range(len(padded_tokens))] #Optional!
seg_ids = []
ap_mode = 0
for token in tokens:
    if token == '[SEP]':
        if ap_mode == 1: seg_ids.append(0) ; ap_mode = 0 
        elif ap_mode == 0: seg_ids.append(0) ; ap_mode = 1    
    elif token == '[CLS]': seg_ids.append(0)
    else:  seg_ids.append(ap_mode)
        
#Step 5: Get BERT vocabulary index for each token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print('seg_ids',seg_ids)

#Converting everything to torch tensors before feeding them to bert_model
token_ids = torch.tensor(token_ids).unsqueeze(0) #Shape : [1, 12]
attn_mask = torch.tensor(attn_mask).unsqueeze(0) #Shape : [1, 12]
seg_ids   = torch.tensor(seg_ids).unsqueeze(0) #Shape : [1, 12]

#Feed them to bert
bert_config = BertConfig(vocab_size_or_config_json_file=30522,
                         type_vocab_size=2,
                         num_labels=len(label_list),
                         hidden_size=128,
                         num_hidden_layers=2,
                         num_attention_heads=8,
                         intermediate_size=256,
                         hidden_dropout_prob=0.01,
                         max_position_embeddings=128,
                         attention_probs_dropout_prob=0.01
                         )

bert_model = BertModel.from_pretrained('bert-base-uncased')
# hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask,\
#                                   token_type_ids = seg_ids, \
#                                    output_all_encoded_layers=True)

hidden_reps, cls_head = bert_model(token_ids)

# hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask)
print(hidden_reps[0].shape)
#Out: torch.Size([1, 12, 768])
print(cls_head[0].shape)
#Out: torch.Size([1, 768])

'''
1.(hidden_reps) contains the hidden states of each token in the input sequence after feeding them 
through a series of self-attention layers. 


2.(cls_head) contains the hidden representation of just the ‘[CLS]’ token after additionally being 
passed to a fully connected layer with tanh activation function.
'''

'''
class BertModel(BertPreTrainedModel)
 |  BERT model ("Bidirectional Embedding Representations from a Transformer").
 |  
 |  Params:
 |      config: a BertConfig class instance with the configuration to build a new model
 |  
 |  Inputs:
 |      `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 |          with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 |          `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 |      `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 |          types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 |          a `sentence B` token (see BERT paper for more details).
 |      `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 |          selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 |          input sequence length in the current batch. It's the mask that we typically use for attention when
 |          a batch has varying length sentences.
 |      `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
 |  
 |  Outputs: Tuple of (encoded_layers, pooled_output)
 |      `encoded_layers`: controled by `output_all_encoded_layers` argument:
 |          - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
 |              of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
 |              encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
 |          - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
 |              to the last attention block of shape [batch_size, sequence_length, hidden_size],
 |      `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
 |          classifier pretrained on top of the hidden state associated to the first character of the
 |          input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
'''

''

padded_tokens: ['[CLS]', 'terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t', '.', '[SEP]', 'terrific', 'book', 'for', 'learning', 'the', 'art', 'of', 'cr', '##oche', '##t', '.', '[SEP]']
seg_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]


NameError: name 'label_list' is not defined

# https://medium.com/swlh/painless-fine-tuning-of-bert-in-pytorch-b91c14912caa
# 拜讀改code