In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('/Users/wangjianyu/Downloads/Bert-base/bert-base-uncased-vocab.txt')

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [3]:
sequence = "A Titan RTX has 24GB of VRAM"

In [7]:
tokenized_sequence = tokenizer.tokenize(sequence)

In [8]:
tokenized_sequence # 这里就是按照Vocab.txt,用WordPiece对句子进行分词

['a', 'titan', 'rt', '##x', 'has', '24', '##gb', 'of', 'vr', '##am']

In [10]:
inputs = tokenizer.encode_plus(sequence, add_special_tokens = True, max_length=20, truncation=True)

In [11]:
inputs # 此时分词工具会返回所有与该模型能够正常工作有关的参数字典 

{'input_ids': [101, 1037, 16537, 19387, 2595, 2038, 2484, 18259, 1997, 27830, 3286, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
encoded_sequence = inputs['input_ids']

In [13]:
encoded_sequence # 这里因为我设置了add_special_tokens=True, 所以会添加CLS 等特殊字符， 返回的是tokenize对应的词的id

[101, 1037, 16537, 19387, 2595, 2038, 2484, 18259, 1997, 27830, 3286, 102]

In [14]:
# if we decode the previous sequence of ids
decoded_sequence = tokenizer.decode(encoded_sequence)

In [15]:
decoded_sequence # return the raw sequence

'[CLS] a titan rtx has 24gb of vram [SEP]'

In [16]:
## 关于可选参数Attention Mask的意义
## 这个参数告诉模型什么tokens应该被attend to, 什么不应该

In [28]:
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

encoded_sequence_a = tokenizer.encode_plus(sequence_a, add_special_tokens = True, max_length=50, truncation=True)['input_ids']
encoded_sequence_b = tokenizer.encode_plus(sequence_b, add_special_tokens = True, max_length=50, truncation=True)['input_ids']


In [29]:
len(encoded_sequence_a), len(encoded_sequence_b)

(8, 19)

In [63]:
# 因为两个有不同的长度，所以我们不能直接混合成一个tensor，第一个sequence必须进行padding，或者第二个要进行截断
padded_sequences = tokenizer.encode_plus(sequence_a,sequence_b,pad_to_max_length=True, add_special_tokens =True, max_length=30)

In [64]:
padded_sequences # 这个就是一般dataset的getitem都这么写，就是借助pad_to_max_length这个参数来补全到设置好的最大的长度

{'input_ids': [101, 2023, 2003, 1037, 2460, 5537, 1012, 102, 2023, 2003, 1037, 2738, 2146, 5537, 1012, 2009, 2003, 2012, 2560, 2936, 2084, 1996, 5537, 1037, 1012, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}

In [65]:
# 可以看出Attention Mask就是说明了哪些属于真实value，哪些属于padding
decoded = tokenizer.decode(padded_sequences['input_ids'])

In [66]:
decoded # 这里可以看出使用两句话一起concat的时候，就是通过SEP进行分割

'[CLS] this is a short sequence. [SEP] this is a rather long sequence. it is at least longer than the sequence a. [SEP] [PAD] [PAD] [PAD] [PAD]'

In [67]:
padded_sequences['token_type_ids'] # 这里就是说清楚每句话从属部分

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0]

In [None]:
# 一般来说使用Bert Embedding作为原始的Word Embedding的话
# 1.这个就是dataset一般的设置方式@Word Embedding必须输入这些@BertModel
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = self.comment_text[index]

        inputs = self.tokenizer.encode_plus(
            comment_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# 2.Creating the dataset and dataloader for the neural network
MAX_LEN = 256 ## 这里的Max Length就是encode_plus的最大长度, 不过呢由于WordPiece的存在，一般设置比较大，都不会影响的, 约等于max_position_embeddings
train_size = 0.8
train_dataset = train_df.sample(frac=train_size,random_state=7)
valid_dataset = train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_df.shape))

train_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
valid_set = CustomDataset(valid_dataset, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)


TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True}

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True}

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False}

train_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)
test_loader = DataLoader(test_set, **test_params)

In [None]:
# 3. Bert Model
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.config = BertConfig.from_pretrained('../emb/bert-mini/bert_config.json', output_hidden_states=True)
        self.l1 = BertModel.from_pretrained('../emb/bert-mini/pytorch_model.bin', config=self.config)
        self.bilstm1 = torch.nn.LSTM(512, 64, 1, bidirectional=True)
        self.l2 = torch.nn.Linear(128, 64)
        self.a1 = torch.nn.ReLU()
        self.l3 = torch.nn.Dropout(0.3)
        self.l4 = torch.nn.Linear(64, 14)
    
    def forward(self, ids, mask, token_type_ids):
        sequence_output, pooler_output, hidden_states= self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # sequence_output: sequence of hidden_states at the output of the last layer of the model (batchsize, seq_len, hidden_size)
        # pooler_output: last layer hidden-state of the first token of the sequence, (batch_size, hidden_size)
        # hidden_states: (layers, batch_size, seq_len, hidden_size), each layer of the output
        bs = len(sequence_output) 
        h12 = hidden_states[-1][:,0].view(1,bs,256)
        h11 = hidden_states[-2][:,0].view(1,bs,256)
        concat_hidden = torch.cat((h12,h11),2)
        x, _ = self.bilstm1(concat_hidden)
        x = self.l2(x.view(bs,128))
        x = self.a1(x)
        x = self.l3(x)
        output = self.l4(x)
        return output

net = BERTClass()
net.to(device)