In [1]:
from pathlib import Path
from datasets import load_dataset
import datasets
from collections import Counter
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from transformers.data.data_collator import DataCollatorWithPadding

### 加载本地json数据

In [2]:
# Relative path to the directory that holds the train.json and dev.json files loaded below.
data_path = '../../data/tnews/'

In [3]:
# Load the local JSON files into a DatasetDict; dev.json is exposed as the 'test' split.
# The paths are joined with pathlib because data_path already ends with '/', so the
# previous '{0}/train.json' template produced a doubled separator ('tnews//train.json').
dataset = load_dataset('json', data_files={
    'train': [str(Path(data_path) / 'train.json')],
    'test': [str(Path(data_path) / 'dev.json')]
})
dataset

Using custom data configuration default-50902a2334524789
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-50902a2334524789/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02)


DatasetDict({
    train: Dataset({
        features: ['label', 'label_desc', 'sentence', 'keywords'],
        num_rows: 53360
    })
    test: Dataset({
        features: ['label', 'label_desc', 'sentence', 'keywords'],
        num_rows: 10000
    })
})

In [4]:
# Peek at the first training record to see the raw fields.
dataset['train'][0]

{'label': '108',
 'label_desc': 'news_edu',
 'sentence': '上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？',
 'keywords': ''}

### label转换为0-index的数字

In [5]:
# Collect every label string that occurs in either split. The Counter keeps keys in
# first-seen order (train split first, then test), and that order defines the ids.
c = Counter(dataset['train']['label'])
c.update(dataset['test']['label'])
# Map each distinct label string to a consecutive 0-based integer id.
label2id = {label: idx for idx, label in enumerate(c)}
label2id

{'108': 0,
 '104': 1,
 '106': 2,
 '112': 3,
 '109': 4,
 '103': 5,
 '116': 6,
 '101': 7,
 '107': 8,
 '100': 9,
 '102': 10,
 '110': 11,
 '115': 12,
 '113': 13,
 '114': 14}

In [6]:
def _with_label_id(example):
    """Return the new 'labels' column for one record: its label string mapped via label2id."""
    return {'labels': label2id[example['label']]}


# Add the integer 'labels' column to every split, then show one converted record.
dataset = dataset.map(_with_label_id)
dataset['train'][0]

HBox(children=(FloatProgress(value=0.0, max=53360.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




{'keywords': '',
 'label': '108',
 'label_desc': 'news_edu',
 'labels': 0,
 'sentence': '上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？'}

### 用tokenizer对文本进行分词，并返回transformers forward所需要的参数

In [7]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
pretrained = 'voidful/albert_chinese_tiny'
# NOTE(review): this is an ALBERT checkpoint loaded through BertTokenizer — presumably
# because it ships a BERT-style vocabulary; confirm against the model card.
# NOTE(review): mirror='tuna' looks like a download-mirror selector — verify that the
# installed transformers version still accepts this argument.
tokenizer = BertTokenizer.from_pretrained(pretrained, mirror='tuna')

In [8]:
# Tokenize the 'sentence' column; the tokenizer's outputs (input_ids, token_type_ids,
# attention_mask) are added as new columns next to the existing ones.
# batched=True hands the tokenizer a list of sentences per call instead of invoking it
# once per record, which yields the same columns with far fewer Python-level calls.
dataset = dataset.map(lambda batch: tokenizer(batch['sentence']), batched=True)
dataset

HBox(children=(FloatProgress(value=0.0, max=53360.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'keywords', 'label', 'label_desc', 'labels', 'sentence', 'token_type_ids'],
        num_rows: 53360
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'keywords', 'label', 'label_desc', 'labels', 'sentence', 'token_type_ids'],
        num_rows: 10000
    })
})

In [9]:
# Expose only the columns that are needed and have them returned as PyTorch tensors.
# set_format changes how `dataset` is read in place; the other columns are hidden from
# indexing rather than selected here.
dataset.set_format(
    type='pytorch',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
)

In [10]:
# Inspect the first three formatted training examples; each column comes back as a
# list of tensors whose lengths differ, because nothing has been padded yet.
dataset['train'][:3]

  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


{'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1])],
 'input_ids': [tensor([ 101,  677, 6440, 3198, 2110, 4495, 2797, 3322, 1510,  702,  679,  977,
          8024, 5439, 2360,  671, 2584,  722,  678, 2828, 2797, 3322, 3035,  749,
          8024, 2157, 7270, 2897, 1355, 4873, 6375, 5439, 2360, 6608, 8024, 1920,
          2157, 2582,  720, 4692, 2521, 6821, 4905,  752, 8043,  102]),
  tensor([ 101, 1555, 6617, 4384, 4413, 5500,  819, 3300, 7361, 1062, 1385, 1068,
           754, 2454, 3309, 1726, 1908,  677, 3862, 6395, 1171,  769, 3211, 2792,
          2190, 1062, 1385, 8109, 2399, 2399,

In [11]:
# 配合DataLoader使用
collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_loader = DataLoader(dataset['train'], batch_size=3, collate_fn=collator)

In [12]:
# Pull the first batch from the loader to check the collated tensors.
next(iter(train_loader))

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'input_ids': tensor([[ 101,  677, 6440, 3198, 2110, 4495, 2797, 3322, 1510,  702,  679,  977,
         8024, 5439, 2360,  671, 2584,  722,  678, 2828, 2797, 3322, 3035,  749,
         8024, 2157, 7270, 2897, 1355, 4873, 6375, 5439, 2360, 6608, 8024, 1920,
         2157, 2582,  720, 4692, 2521, 6821, 4905,  752, 8043,  102],
        [ 101, 1555, 6617, 4384, 4413, 5500,  819, 3300, 7361, 1062, 1385, 1068,
          754, 2454, 3309, 1726, 1908,  677, 3862, 6395, 1171,  769, 3211, 2792,
         2190, 1062,