### datasets.Dataset.map()函数里的batched参数和 torch.utils.data.DataLoader的batch如何配合使用

In [1]:
from datasets import Dataset
from transformers import BertTokenizer
from transformers.data.data_collator import DataCollatorWithPadding
from torch.utils.data import DataLoader

In [2]:
# Toy corpus: six short Chinese strings of different lengths, each with an
# integer class label. The differing lengths are what make padding matter
# in the cells below.
py_data = dict(
    label=[0, 1, 2, 1, 0, 1],
    text=[
        u'老师',
        u'俄罗斯',
        u'中介公司',
        u'天猫定制版',
        u'互联网区块链',
        u'红楼梦',
    ],
)
# Build an in-memory Dataset from the column-oriented dict and display it.
ds = Dataset.from_dict(py_data)
ds

Dataset({
    features: ['label', 'text'],
    num_rows: 6
})

In [3]:
# Load the tokenizer for the 'voidful/albert_chinese_tiny' checkpoint.
# NOTE(review): mirror='tuna' presumably selects a download mirror; confirm
# that the installed transformers version still honours this argument.
tokenizer = BertTokenizer.from_pretrained(
    'voidful/albert_chinese_tiny',
    mirror='tuna',
)

In [4]:
# map() without batching: the tokenizer is called on one example at a time,
# so every row keeps its own natural length (no padding is applied).
# The trailing [:] materialises all rows as a dict of columns for display.
ds.map(
    lambda row: tokenizer(row['text']),
)[:]

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




{'attention_mask': [[1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1]],
 'input_ids': [[101, 5439, 2360, 102],
  [101, 915, 5384, 3172, 102],
  [101, 704, 792, 1062, 1385, 102],
  [101, 1921, 4344, 2137, 1169, 4276, 102],
  [101, 757, 5468, 5381, 1277, 1779, 7216, 102],
  [101, 5273, 3517, 3457, 102]],
 'label': [0, 1, 2, 1, 0, 1],
 'text': ['老师', '俄罗斯', '中介公司', '天猫定制版', '互联网区块链', '红楼梦'],
 'token_type_ids': [[0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0]]}

In [5]:
# Batched map(): the tokenizer receives batch_size=2 examples per call, and
# padding=True pads the rows of each such call to a common length. Rows from
# different map batches can therefore still differ in length.
ds.map(
    lambda rows: tokenizer(rows['text'], padding=True),
    batched=True,
    batch_size=2,
)[:]

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




{'attention_mask': [[1, 1, 1, 1, 0],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 0],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 0, 0, 0]],
 'input_ids': [[101, 5439, 2360, 102, 0],
  [101, 915, 5384, 3172, 102],
  [101, 704, 792, 1062, 1385, 102, 0],
  [101, 1921, 4344, 2137, 1169, 4276, 102],
  [101, 757, 5468, 5381, 1277, 1779, 7216, 102],
  [101, 5273, 3517, 3457, 102, 0, 0, 0]],
 'label': [0, 1, 2, 1, 0, 1],
 'text': ['老师', '俄罗斯', '中介公司', '天猫定制版', '互联网区块链', '红楼梦'],
 'token_type_ids': [[0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0]]}

In [6]:
# When the batch_size used in map() differs from the DataLoader batch_size,
# a single DataLoader batch can span rows that were padded to different
# lengths, and iterating the DataLoader fails while stacking them.
ds_batch_enc = ds.map(lambda example: tokenizer(example['text'], padding=True), batched=True, batch_size=2)
# Expose only the model inputs, returned as PyTorch tensors.
ds_batch_enc.set_format(type='pytorch', columns=['input_ids', 'attention_mask', 'token_type_ids'])
loader = DataLoader(ds_batch_enc, batch_size=3)
# The failure is the point of this cell, so catch and display it instead of
# letting it abort the kernel; "Restart Kernel -> Run All" can then continue
# to the cells below.
try:
    for batch in loader:
        print(batch)
except RuntimeError as err:
    print('RuntimeError:', err)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




RuntimeError: stack expects each tensor to be equal size, but got [5] at entry 0 and [7] at entry 2

In [7]:
# huggingface datasets 的说明文档指出，在 map 阶段进行 padding 并不是最高效的做法，可以改为在 DataLoader 里进行 padding：
# '''Note that this is not the most efficient padding strategy, we could also avoid padding at this stage
#    and use tokenizer.pad as the collate_fn method in the torch.utils.data.DataLoader further below.'''

In [8]:
# Tokenize one example at a time WITHOUT padding: rows are stored at their
# natural length. (padding=True on a single example would be a no-op, since
# a lone sequence is already the longest in its "batch".)
ds_batch_enc = ds.map(lambda example: tokenizer(example['text']))
# Expose only the model inputs, returned as PyTorch tensors.
ds_batch_enc.set_format(type='pytorch', columns=['input_ids', 'attention_mask', 'token_type_ids'])
# Defer padding to batch-assembly time: the collator pads each DataLoader
# batch to the longest row in that batch, so the DataLoader batch_size is
# independent of how the dataset was mapped.
collator = DataCollatorWithPadding(tokenizer)
loader = DataLoader(ds_batch_enc, batch_size=3, collate_fn=collator)
for batch in loader:
    print(batch)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


{'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1]]), 'input_ids': tensor([[ 101, 5439, 2360,  102,    0,    0],
        [ 101,  915, 5384, 3172,  102,    0],
        [ 101,  704,  792, 1062, 1385,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]])}
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0]]), 'input_ids': tensor([[ 101, 1921, 4344, 2137, 1169, 4276,  102,    0],
        [ 101,  757, 5468, 5381, 1277, 1779, 7216,  102],
        [ 101, 5273, 3517, 3457,  102,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]])}
