In [1]:
import json
import re
import gzip
import pickle
from baseline.data_process import Example

In [2]:
# Load the original 2019 training dataset from disk.
with open('./data/CAIL2019/big_train_data.json', mode='r', encoding='utf-8') as reader:
    # Read the whole file as text, then parse it as JSON.
    full_data_2019 = json.loads(reader.read())

In [3]:
def convert_context_to_2020(context_2019):
    '''
    Split a 2019 case text into sentences on punctuation and wrap the
    result in the 2020 raw context layout.

    Args:
        context_2019: the full case text as a single string.

    Returns:
        A one-element list ``[[title, sentences]]`` where ``sentences`` is
        the list produced by splitting on ASCII/full-width punctuation
        (the punctuation itself is dropped) and ``title`` is the first
        of those sentences.
    '''
    sentences = re.split(r',|\.|\:|;|!|\?|:|，|。|：|；|！|？', context_2019)
    # The first sentence doubles as the paragraph title.
    title = sentences[0]
    return [[title, sentences]]

def get_supporting_facts_sen_id(context_2019, answer_start, answer_text):
    '''
    Build the 2020-style supporting fact for a 2019 answer.

    The case text is split into sentences with the same punctuation
    pattern used by ``convert_context_to_2020``. Every sentence that
    contains ``answer_text`` is a candidate; the candidate whose
    occurrence lies closest to ``answer_start`` in the original text wins.

    Args:
        context_2019: the full case text as a single string.
        answer_start: character offset of the answer in ``context_2019``.
        answer_text: the answer string to look for inside each sentence.

    Returns:
        A one-element list ``[[title, sen_id]]`` where ``title`` is the
        first sentence and ``sen_id`` is the index of the chosen sentence,
        or ``-1`` when no single sentence contains ``answer_text``.
    '''
    pattern = r',|\.|\:|;|!|\?|:|，|。|：|；|！|？'
    sen_list = re.split(pattern, context_2019)
    # Offset of the current sentence's first character in context_2019.
    sen_offset = 0
    # sentence index -> distance between the occurrence and answer_start
    distance_by_sen = {}
    for i, sen in enumerate(sen_list):
        # Look for the answer inside this sentence (first occurrence only).
        index = sen.find(answer_text)
        if index >= 0:
            distance_by_sen[i] = abs(answer_start - (sen_offset + index))
        # Every delimiter in the pattern is exactly one character and is
        # removed by re.split, so it must be counted to stay aligned with
        # offsets in the original text.
        sen_offset += len(sen) + 1
    if distance_by_sen:
        # Ties resolve to the earliest sentence (dict insertion order).
        sen_id = min(distance_by_sen, key=distance_by_sen.get)
    else:
        sen_id = -1
    return [[sen_list[0], sen_id]]

# Convert every 2019 case into one 2020-style training sample.
train_2019 = []
# Number of cases dropped because no question passed the filters below.
skipped_cases = 0
for i, case_2019 in enumerate(full_data_2019['data']):
    # Only the first paragraph of each case is used.
    paragraph = case_2019['paragraphs'][0]
    question_2019 = {}
    # First preference: a question whose is_impossible flag is the string
    # 'true' and that still carries an answer with a valid start offset.
    for qa in paragraph['qas']:
        if qa['is_impossible']=='true' and len(qa['answers']) > 0 and qa['answers'][0]['answer_start'] != -1:
            question_2019 = qa
            break
    if len(question_2019) == 0:
        # Fallback: answerable questions. Stop at the first yes/no answer;
        # if there is none, the last qualifying question is kept.
        for qa in paragraph['qas']:
            if qa['is_impossible']=='false' and len(qa['answers']) > 0 and qa['answers'][0]['answer_start'] != -1:
                question_2019 = qa
                answer_txt = qa['answers'][0]['text']
                if answer_txt.lower() == 'yes' or answer_txt.lower() == 'no':
                    break
    if len(question_2019) == 0:
        # No usable question: skip the case instead of failing with a
        # KeyError on the lookups below.
        skipped_cases += 1
        continue
    answer = question_2019['answers'][0]
    case_2020 = {
        # Offset added to the case index; presumably keeps these ids clear
        # of the ids already used by the 2020 data -- TODO confirm.
        '_id': i+5055,
        'context': convert_context_to_2020(paragraph['context']),
        'question': question_2019['question'],
        'answer': answer['text'],
        'supporting_facts': get_supporting_facts_sen_id(paragraph['context'],
                                                        answer['answer_start'],
                                                        answer['text']),
    }
    train_2019.append(case_2020)

if skipped_cases:
    print('skipped %d cases without a usable question' % skipped_cases)


In [4]:
# Persist the converted 2019 dataset under the data folder.
with open('./data/train_2019.json', mode='w', encoding='utf-8') as writer:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    data = json.dumps(train_2019, ensure_ascii=False)
    writer.write(data)

In [5]:
# Run baseline/data_process.py on the converted 2019 training file
# (./data/train_2019.json) with the tokenizer under ./models/chinese_bert_wwm.
# The example/feature output paths are passed to the script; the target
# directory presumably has to exist beforehand -- TODO confirm.
!python baseline/data_process.py \
    --tokenizer_path ./models/chinese_bert_wwm \
    --full_data ./data/train_2019.json \
    --example_output ./output/data/chinese-bert-wwm_2019/train_example.pkl.gz \
    --feature_output ./output/data/chinese-bert-wwm_2019/train_feature.pkl.gz 


  0%|          | 0/8000 [00:00<?, ?it/s]
  7%|▋         | 547/8000 [00:00<00:01, 5429.51it/s]
 15%|█▍        | 1197/8000 [00:00<00:01, 6053.14it/s]
 24%|██▎       | 1897/8000 [00:00<00:00, 6478.14it/s]
 32%|███▏      | 2584/8000 [00:00<00:00, 6610.51it/s]
 41%|████      | 3282/8000 [00:00<00:00, 6716.79it/s]
 49%|████▉     | 3954/8000 [00:00<00:00, 5942.63it/s]
 58%|█████▊    | 4639/8000 [00:00<00:00, 6206.55it/s]
 67%|██████▋   | 5350/8000 [00:00<00:00, 6476.33it/s]
 76%|███████▌  | 6044/8000 [00:00<00:00, 6614.94it/s]
 84%|████████▍ | 6750/8000 [00:01<00:00, 6728.68it/s]
 93%|█████████▎| 7429/8000 [00:01<00:00, 5828.65it/s]
100%|██████████| 8000/8000 [00:01<00:00, 6260.48it/s]

  0%|          | 0/8000 [00:00<?, ?it/s]
  0%|          | 13/8000 [00:00<01:03, 126.04it/s]
  0%|          | 26/8000 [00:00<01:05, 121.39it/s]
  0%|          | 40/8000 [00:00<01:03, 126.01it/s]
  1%|          | 54/8000 [00:00<01:01, 129.06it/s]
  1%|          | 70/8000 [00:00<00:58, 135.72it/s]
  1%|         

In [7]:
# Run baseline/data_process.py on the dev set (./data/dev.json) with the same
# tokenizer, passing output paths in the same chinese-bert-wwm_2019 directory
# as the training examples/features.
!python baseline/data_process.py \
    --tokenizer_path ./models/chinese_bert_wwm \
    --full_data ./data/dev.json \
    --example_output ./output/data/chinese-bert-wwm_2019/dev_example.pkl.gz \
    --feature_output ./output/data/chinese-bert-wwm_2019/dev_feature.pkl.gz 

qid 5
qas type 
doc tokens ['根', '据', '证', '据', '认', '定', '，', '结', '合', '当', '事', '人', '的', '当', '庭', '陈', '述', '、', '质', '证', '及', '辩', '论', '意', '见', '，', '本', '院', '确', '认', '以', '下', '法', '律', '事', '实', '：', '第', '三', '人', '容', '早', '明', '系', '湖', '南', '省', '邵', '东', '县', '简', '家', '陇', '乡', '金', '屋', '江', '村', '金', '家', '湾', '组', '村', '民', '，', '其', '在', '原', '告', '湖', '南', '家', '和', '建', '设', '有', '限', '责', '任', '公', '司', '承', '建', '的', '邵', '东', '邦', '盛', '凤', '凰', '城', '写', '字', '楼', '4', '号', '楼', '建', '筑', '工', '地', '打', '工', '过', '程', '中', '，', '于', '2', '0', '1', '5', '年', '6', '月', '1', '6', '日', '7', '时', '许', '在', '上', '班', '时', '被', '方', '木', '砸', '伤', '，', '导', '致', '其', '右', '肩', '胛', '骨', '骨', '折', '、', '多', '发', '肋', '骨', '骨', '折', '。', '第', '三', '人', '受', '伤', '后', '，', '在', '法', '定', '期', '限', '内', '持', '工', '友', '杜', '强', '林', '及', '病', '历', '资', '料', '等', '证', '据', '向', '被', '告', '申', '请', '工', '伤', '认', '定', '，', '被', '告', '邵', '阳', '市', '人', '社', '局', '受', '理


  0%|          | 0/504 [00:00<?, ?it/s]
100%|██████████| 504/504 [00:00<00:00, 6979.04it/s]

  0%|          | 0/504 [00:00<?, ?it/s]
  3%|▎         | 13/504 [00:00<00:03, 122.94it/s]
  5%|▌         | 26/504 [00:00<00:03, 124.69it/s]
  8%|▊         | 39/504 [00:00<00:03, 125.40it/s]
 11%|█         | 53/504 [00:00<00:03, 127.19it/s]
 13%|█▎        | 66/504 [00:00<00:03, 127.85it/s]
 16%|█▌        | 79/504 [00:00<00:03, 126.33it/s]
 18%|█▊        | 92/504 [00:00<00:03, 126.22it/s]
 21%|██        | 105/504 [00:00<00:03, 127.05it/s]
 24%|██▎       | 119/504 [00:00<00:03, 128.18it/s]
 26%|██▌       | 132/504 [00:01<00:02, 128.59it/s]
 29%|██▉       | 145/504 [00:01<00:02, 127.93it/s]
 31%|███▏      | 158/504 [00:01<00:02, 124.21it/s]
 34%|███▍      | 171/504 [00:01<00:02, 124.93it/s]
 37%|███▋      | 184/504 [00:01<00:02, 125.33it/s]
 39%|███▉      | 197/504 [00:01<00:02, 126.31it/s]
 42%|████▏     | 210/504 [00:01<00:02, 125.40it/s]
 44%|████▍     | 223/504 [00:01<00:02, 126.09it/s]
 47%|█

In [9]:
# Launch baseline/run_cail.py with the BERT model under ./models/chinese_bert_wwm,
# pointing --data_dir at the directory used for the example/feature outputs above.
# NOTE(review): batch_size 2 with gradient_accumulation_steps 4 presumably
# gives an effective batch of 8 -- confirm against run_cail.py.
!python baseline/run_cail.py \
    --name chinese-bert-wwm_2019 \
    --bert_model ./models/chinese_bert_wwm \
    --data_dir ./output/data/chinese-bert-wwm_2019 \
    --batch_size 2 \
    --eval_batch_size 32 \
    --lr 1e-5 \
    --gradient_accumulation_steps 4 \
    --seed 56 \
    --epochs 25