In [1]:
from transformers import AutoTokenizer,FlaxAutoModelForMultipleChoice,Trainer,TrainingArguments
import evaluate
from datasets import load_dataset

In [3]:
# Load the "c3" configuration of the "clue/clue" dataset; the result is a
# DatasetDict with test / train / validation splits (see the output of this cell).
c3 = load_dataset("clue/clue", "c3")
# Bare expression so the notebook renders the split summary.
c3

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 1625/1625 [00:00<00:00, 18406.65 examples/s]
Generating train split: 100%|██████████| 11869/11869 [00:00<00:00, 264008.28 examples/s]
Generating validation split: 100%|██████████| 3816/3816 [00:00<00:00, 201537.00 examples/s]


DatasetDict({
    test: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 1625
    })
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

In [4]:
# Peek at the first test record to see the layout: `context` is a list of
# strings, `choice` is a list of candidate answers, and `answer` is a string
# (empty for this test example).
c3["test"][0]

{'id': 0,
 'context': ['老师把一个大玻璃瓶子带到学校，瓶子里装着满满的石头、玻璃碎片和沙子。之后，老师请学生把瓶子里的东西都倒出来，然后再装进去，先从沙子开始。每个学生都试了试，最后都发现没有足够的空间装所有的石头。老师指导学生重新装这个瓶子。这次，先从石头开始，最后再装沙子。石头装进去后，沙子就沉积在石头的周围，最后，所有东西都装进瓶子里了。老师说：“如果我们先从小的东西开始，把小东西装进去之后，大的石头就放不进去了。生活也是如此，如果你的生活先被不重要的事挤满了，那你就无法再装进更大、更重要的事了。”'],
 'question': '那个任务，学生刚开始完成得怎么样？',
 'choice': ['都没完成', '都装进去了', '完成得很好', '有一组没做完'],
 'answer': ''}

In [5]:
# Set the test split aside so that `c3` only holds train / validation; the test
# example inspected earlier has an empty `answer`, so it cannot be turned into
# a training label.
# `pop` mutates `c3` in place, so guard it: re-running this cell without the
# check would raise KeyError because "test" is already gone.
if "test" in c3:
    test = c3.pop("test")
c3

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

## 数据预处理

In [6]:
# Load the tokenizer for the "hfl/chinese-macbert-base" checkpoint (the same
# checkpoint name the model is loaded from later in this notebook).
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [9]:
def process_func(examples, num_choices=4, max_length=256, filler_choice="不知道"):
    """Tokenize a batch of examples into fixed-size multiple-choice groups.

    Every example is expanded into ``num_choices`` (context, question + choice)
    pairs. Examples with fewer choices are topped up with ``filler_choice`` so
    that every group has the same size. The flat tokenizer output is then
    regrouped so each feature holds one list of ``num_choices`` sequences per
    example.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batched mapping with the columns ``context`` (list of strings
            per example), ``question``, ``choice`` (list of strings per example)
            and ``answer``.
        num_choices: Number of candidates per example after filling.
        max_length: Length every sequence is padded / truncated to.
        filler_choice: Text used for the filler candidates.

    Returns:
        dict: the tokenizer's features, each regrouped into per-example lists of
        ``num_choices`` sequences, plus ``labels`` (index of the answer within
        the example's own choices).

    Raises:
        ValueError: if an example has more than ``num_choices`` choices (the
            regrouping would otherwise silently shift every later group), or if
            its answer is not one of its choices.
    """
    contexts = []
    questions_choices = []
    labels = []

    columns = zip(
        examples["context"],
        examples["question"],
        examples["choice"],
        examples["answer"],
    )
    for ctx_parts, question, choices, answer in columns:
        if len(choices) > num_choices:
            raise ValueError(
                f"example has {len(choices)} choices but num_choices={num_choices}"
            )
        ctx = "\n".join(ctx_parts)
        # Top up short choice lists so every example yields exactly num_choices pairs.
        filled = list(choices) + [filler_choice] * (num_choices - len(choices))
        for choice in filled:
            contexts.append(ctx)
            questions_choices.append(question + " " + choice)
        # Look the answer up in the original (unfilled) choices.
        labels.append(choices.index(answer))

    # Only the context (first sequence) is truncated, so question + choice stays intact.
    tokenized_examples = tokenizer(
        contexts,
        questions_choices,
        truncation="only_first",
        max_length=max_length,
        padding="max_length",
    )
    # Regroup every num_choices consecutive rows into one list so that each
    # example has the same (num_choices, max_length) layout.
    tokenized_examples = {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

    tokenized_examples["labels"] = labels
    return tokenized_examples

In [10]:
# Run process_func over every split in batches. The resulting datasets keep the
# original columns and gain input_ids / token_type_ids / attention_mask / labels
# (see the output of this cell).
tokenized_c3 = c3.map(process_func,batched=True)
tokenized_c3

Map: 100%|██████████| 11869/11869 [00:11<00:00, 1040.96 examples/s]
Map: 100%|██████████| 3816/3816 [00:08<00:00, 465.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3816
    })
})

In [11]:
from transformers import AutoModelForMultipleChoice

# Load the checkpoint with a multiple-choice head. The load warning reports that
# classifier.weight / classifier.bias are newly initialized, so the model has to
# be fine-tuned before its predictions are meaningful.
model = AutoModelForMultipleChoice.from_pretrained("hfl/chinese-macbert-base")


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 评估函数

In [12]:
import numpy as np

# Load the "accuracy" metric through the `evaluate` library; used by compute_metric.
accuracy = evaluate.load("accuracy")

def compute_metric(pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    Args:
        pred: Pair of (model scores, reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions.
    """
    scores, gold = pred
    # The highest-scoring choice along the last axis is the predicted answer.
    chosen = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=chosen, references=gold)
    return result

In [24]:
# Training configuration, grouped by concern.
args = TrainingArguments(
    output_dir="./multipleChoice",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log every 10 steps, including the very first step.
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    # Half-precision training is switched on.
    fp16=True,
)

In [25]:
# Wire the model, the tokenized splits and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_c3["train"],
    eval_dataset=tokenized_c3["validation"],
    compute_metrics=compute_metric,
)

In [26]:
# Start fine-tuning. NOTE: the saved output of this cell shows the run was
# interrupted (KeyboardInterrupt) with no epoch results logged.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## 模型预测

In [22]:
import torch

class MultipleChoicePipeLine:
    """Minimal inference pipeline for a multiple-choice model.

    Calling the pipeline with a context, a question and a list of candidate
    answers returns the candidate the model scores highest.
    """

    def __init__(self,tokenizer,model):
        """Store the tokenizer and model used for every call.

        Args:
            tokenizer: Callable tokenizer that accepts two lists of texts.
            model: Model whose output exposes ``.logits``.
        """
        self.model = model
        self.tokenizer = tokenizer
        # Kept for backward compatibility. `predict` re-reads the model's
        # current device instead, so a later move of the model is honoured.
        self.device = model.device

    def pre_process(self,context,questions,choices):
        """Build one (context, question + choice) pair per choice and tokenize them.

        Args:
            context: Passage text.
            questions: Question text.
            choices: Sequence of candidate answer strings.

        Returns:
            The tokenizer output with one row per choice, as PyTorch tensors.

        Raises:
            ValueError: if ``choices`` is empty.
        """
        if len(choices) == 0:
            raise ValueError("choices must contain at least one candidate")
        ct = [context] * len(choices)
        qc = [questions + " " + choice for choice in choices]
        # Use the instance's tokenizer (not a notebook global). Padding is
        # required so that choices of different token lengths can be stacked
        # into a single tensor.
        return self.tokenizer(
            ct,
            qc,
            truncation="only_first",
            max_length=256,
            padding=True,
            return_tensors="pt",
        )

    def predict(self, inputs):
        """Run the model on one pre-processed example and return its logits.

        Args:
            inputs: Mapping of feature name to tensor, one row per choice.

        Returns:
            The model's ``logits`` for the single example.
        """
        device = self.model.device
        # Add a leading batch dimension of size 1 in front of the choices.
        inputs = { k : v.unsqueeze(0).to(device) for k,v in inputs.items()}
        # Run in eval mode without gradient tracking so dropout cannot make the
        # prediction random; restore the previous mode afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                return self.model(**inputs).logits
        finally:
            self.model.train(was_training)

    def post_process(self,logits,choices):
        """Map the highest-scoring logit back to its choice string."""
        prediction = torch.argmax(logits, dim= -1).cpu().item()
        return choices[prediction]

    def __call__(self, context,questions,choices):
        """Return the choice the model ranks highest for the given question."""
        inputs = self.pre_process(context,questions,choices)
        logits = self.predict(inputs)
        result = self.post_process(logits,choices)
        return result

In [17]:
# Build the inference pipeline from the notebook-level model and tokenizer.
pipe = MultipleChoicePipeLine(model=model,tokenizer=tokenizer)

In [23]:
# Smoke test with a two-choice question; the call returns the selected choice string.
pipe(context= "小明在北京上班" , questions= "小明在哪里上班" , choices= ["北京","上海"])

'北京'