In [2]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments
from datasets import DatasetDict

In [3]:
# Load the saved DatasetDict from the local `c3` directory of the multiple-choice task folder.
dataset = DatasetDict.load_from_disk("../../transformers-code/02-NLP Tasks/11-multiple_choice/c3")

In [4]:
# Display the available splits with their column names and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 1625
    })
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

In [10]:
# Inspect one raw training row: `context` and `choice` are lists of strings,
# `question` and `answer` are plain strings.
dataset["train"][0]

{'id': 0,
 'context': ['男：你今天晚上有时间吗?我们一起去看电影吧?', '女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。所以……'],
 'question': '女的最喜欢哪种电影?',
 'choice': ['恐怖片', '爱情片', '喜剧片', '科幻片'],
 'answer': '喜剧片'}

In [8]:
# Remove the test split from the dict; per the original author's note it has no labels.
# A default is passed to pop() so re-running this cell is a no-op instead of a KeyError.
dataset.pop("test", None)

Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer'],
    num_rows: 1625
})

In [9]:
# Re-display the DatasetDict to confirm which splits remain after removing "test".
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

In [12]:
# Tokenizer for the "hfl/chinese-macbert-base" checkpoint; process_func reads this
# module-level name when it encodes (context, question + choice) pairs.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [18]:
# 'id', 'context', 'question', 'choice', 'answer'
def process_func(examples, num_choices=4, max_length=256):
    """Tokenize a batch of multiple-choice examples for a multiple-choice model.

    Every example is expanded into ``num_choices`` (context, question + choice)
    pairs. Examples with fewer choices are padded with a dummy "不知道" choice so
    that each example contributes exactly ``num_choices`` rows; the flat
    tokenizer output is then regrouped per example.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batched columns (dict of lists) with the keys ``"context"``
            (list of utterances per example), ``"question"`` (str),
            ``"choice"`` (list of str) and ``"answer"`` (str, one of the choices).
        num_choices: Fixed number of candidate answers per example.
        max_length: Length every tokenized pair is truncated/padded to.

    Returns:
        dict mapping each tokenizer output key to a list with one entry per
        example, each entry holding ``num_choices`` sequences of ``max_length``
        tokens, plus ``"labels"`` with the index of the correct choice.

    Raises:
        ValueError: If an example has more than ``num_choices`` choices (the
            regrouping below would otherwise silently misalign examples), or
            if the answer is not among the example's choices.
    """
    contexts = []
    question_choices = []
    labels = []
    columns = zip(examples["context"], examples["question"],
                  examples["choice"], examples["answer"])
    for idx, (ctx, question, choices, answer) in enumerate(columns):
        if len(choices) > num_choices:
            raise ValueError(
                f"example {idx} has {len(choices)} choices, "
                f"but at most {num_choices} are supported"
            )
        # The context field is a list of utterances; join it into one string.
        context = " ".join(ctx)
        # Some examples have fewer choices than num_choices; pad with a dummy option.
        padded_choices = list(choices) + ["不知道"] * (num_choices - len(choices))
        for choice in padded_choices:
            contexts.append(context)
            # Concatenate question and choice into the second sequence.
            question_choices.append(question + " " + choice)
        # Look the answer up in the real choices only, never in the padding.
        labels.append(choices.index(answer))

    # truncation="only_first" shortens only the context, so the question and
    # choice text is never cut off.
    tokenized = tokenizer(text=contexts,
                          text_pair=question_choices,
                          truncation="only_first",
                          max_length=max_length,
                          padding="max_length")  # (batch_size * num_choices, max_length)
    # (batch_size * num_choices, max_length) -> (batch_size, num_choices, max_length)
    tokenized_example = {
        key: [rows[i: i + num_choices] for i in range(0, len(rows), num_choices)]
        for key, rows in tokenized.items()
    }
    tokenized_example["labels"] = labels
    return tokenized_example

In [19]:
# Run process_func over every remaining split; batched=True hands it a dict of
# column lists per call, which is the layout process_func indexes into.
tokenized_datasets = dataset.map(process_func, batched=True)

                                                                   

In [25]:
import numpy as np

# Sanity check on the first 10 training rows: expect (rows, choices, tokens).
np.shape(tokenized_datasets["train"][:10]["input_ids"])

(10, 4, 256)