In [None]:
# 需要安装以下包
# pip install pymupdf==1.18.19
# pip install python-docx
# pip install jieba

In [371]:
import os
import json
import pickle

from etils import epath
import pandas as pd
from tqdm import tqdm
import fitz
from docx import Document
from jieba import posseg


def char_pos_tag(text):
    words = pseg.cut(text)  # 使用jieba进行分词
    char_pos_tags = []
    for word, pos in words:
        pos = 1 if pos.startswith('n') else 0
        for char in word:
            char_pos_tags.append((char, pos))
    return char_pos_tags

# CMDD - 多轮实体对话

In [375]:
def deal_file(path, name='', mode='train'):
    with path.open('r') as f:
        data = json.load(f)
        text = ''
        ners = []
        for dialogue in data['dialogue-content']:
            speaker = dialogue['speaker'].strip()
            sentence = dialogue['sentence'].strip()
            text += f'{speaker}:{sentence}\n'
            prefix = (len(speaker) + 1) * [0]
            ner = [1 if l.strip().startswith(('B', 'I')) else 0 for l in dialogue['label']]
            ner = prefix + ner + [0]
            ners.extend(ner)
    assert len(ners) == len(text), print(len(ners), len(text))
    pos_tags = char_pos_tag(text)
    tokens, poss = zip(*pos_tags)
    assert len(tokens) == len(text) == len(poss) == len(ners)
    result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': ners, 'length': len(text), 
              'name': name, 'mode': mode}
    return result

name = 'CMDD'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
for mode in ['train', 'dev', 'test']:
    read_dir = epath.Path('../data/CMDD-多轮实体对话/')  / mode
    files = os.listdir(read_dir)
    print(f'mode: {mode} length: {len(files)}')
    for file in files:
        abs_path = read_dir / file
        data = deal_file(abs_path, name=name, mode=mode)
        writer.write(f'{data}\n')
writer.close()    

mode: train length: 1241
mode: dev length: 413
mode: test length: 413


# IMCS - 多轮实体对话

In [376]:
def deal_file(path, name='', mode='train'):
    results = []
    with path.open('r') as f:
        data = json.load(f)
        for id, message in data.items():
            self_report = message['self_report']
            text = f'患者: {self_report}\n'
            ners = [0] * len(text)
            for dialogue in message['dialogue']:
                speaker = dialogue['speaker'].strip()
                sentence = dialogue['sentence'].strip()
                prefix = (len(speaker) + 1) * [0]
                if mode != 'test':
                    ner = [1 if l.strip().startswith(('B', 'I')) else 0 for l in dialogue['BIO_label'].split()]
                    assert len(ner) == len(sentence), print(len(ner), len(sentence))
                    ner = prefix + ner + [0]
                else:
                    ner = len(f'{speaker}:{sentence}\n') * [0]
                    assert len(ner) == len(sentence) + 4, print(len(ner), len(sentence))
                    
                ners.extend(ner)
                text += f'{speaker}:{sentence}\n'
                
            assert len(ners) == len(text), print(len(ners), len(text))
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss) == len(ners)
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': ners, 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
    return results

name = 'IMCS'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
# test 没有实体标签
for mode in ['train', 'dev', 'test']:
    abs_path = epath.Path('../data/IMCS-IR-多轮实体对话/')  / f'IMCS_{mode}.json'
    data = deal_file(abs_path, name=name, mode=mode)
    print(f'mode: {mode} length: {len(data)}')
    if isinstance(data, list):
        for d in data:
            writer.write(f'{d}\n')
writer.close()    


mode: train length: 1824
mode: dev length: 616
mode: test length: 612


# CY - 多轮对话

In [374]:
def deal_file(path, name='', mode='train'):
    with path.open('rb') as f:
        data = pickle.load(f)
#         print(f'N: {len(data)}')
        results = []
        for d in data:
            history = ''
            ners = []
            for i, his in enumerate(d['history']):
                his += '\n'
                ner = d['tags'][i] + [-1]
                ner = [0 if l == -1 else 1 for l in ner]
                history += his
                ners.extend(ner)
            assert len(history) == len(ners)
            response = d['response']
            ners += [0] * len(response)
            
            text = history + response
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss) == len(ners)
            
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': ners, 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
        return results
     
name = 'CY'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
for mode in ['train', 'dev']:
    read_dir = epath.Path(f'../data/cy-多轮对话/old_dis_pk_dir/{mode}')
    files = os.listdir(read_dir)
    length = 0
    for file in files:
        abs_path = read_dir / file
        data = deal_file(abs_path, name=name, mode=mode)
        length += len(data)
        if isinstance(data, list):
            for d in data:
                writer.write(f'{d}\n')
    print(f'mode: {mode} length: {length}')
writer.close()

mode: train length: 218220
mode: dev length: 2735


# CHIP - 多轮实体对话

In [377]:
def deal_file(path, name='', mode='train'):
    results = []
    with path.open('r') as f:
        for line in f:
            line = json.loads(line)
            text = ''
            ners = []
            for dialogue in line['dialog_info']:
                speaker = dialogue['sender'].strip()
                sentence = dialogue['text']
                prefix = (len(speaker) + 1) * [0]
                text += f'{speaker}:{sentence}\n'
                ner = [0] * len(sentence)
                for r in dialogue['ner']:
                    index = r['range']
                    if index[1] > len(sentence):
                        index[1] = len(sentence)
                    ner[index[0]: index[1]] = [1] *(index[1] - index[0])
                assert len(ner) == len(sentence), print(len(ner), len(sentence), dialogue, ner)
                ner = prefix + ner + [0]
                ners.extend(ner)
            assert len(ners) == len(text), print(ners, text)
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss) == len(ners)
            
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': ners, 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
    return results
            
name = 'CHIP' 
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
# test 没有实体标签
for mode in ['train', 'dev', 'test']:
    abs_path = epath.Path(f'../data/CHIP-MDCFNPC-多轮对话实体/CHIP-MDCFNPC_{mode}.jsonl')
    data = deal_file(abs_path, name=name, mode=mode)
    print(f'mode: {mode} length: {len(data)}')
    if isinstance(data, list):
        for d in data:
            writer.write(f'{d}\n')
writer.close()    


mode: train length: 5000
mode: dev length: 1000
mode: test length: 2000


# fd-subset

In [378]:
def deal_file(path, name='', mode='train'):
    with path.open('rb') as f:
        data = pickle.load(f)
        results = []
        for d in data:
            history = ''
            ners = []
            for i, his in enumerate(d['history']):
                his += '\n'
                ner = d['tags'][i] + [-1]
                ner = [0 if l == -1 else 1 for l in ner]
                history += his
                ners.extend(ner)
            assert len(history) == len(ners)
            response = d['response']
            ners += [0] * len(response)
            
            text = history + response
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss) == len(ners)
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': ners, 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
        return results
     
name = 'fd-subset'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
for mode in ['train', 'dev']:
    abs_path = epath.Path(f'../data/fd-subset-多轮/{mode}.pk')
    data = deal_file(abs_path, name=name, mode=mode)
    print(f'mode: {mode} length: {len(data)}')
    if isinstance(data, list):
        for d in data:
            writer.write(f'{d}\n')
writer.close()

mode: train length: 6317
mode: dev length: 1771


# MedDG-多轮对话

In [379]:
def deal_train_file(path, name='', mode='train'):
    results = []
    with path.open('r') as f:
        data = json.load(f)
        for dialogues in data:
            text = ''
            for dialogue in dialogues:
#                 return dialogue
                speaker = '病人:' if dialogue['id'] == 'Patient' else '医生:'
                sentence = dialogue['Sentence']
                prefix = (len(speaker) + 1) * [0]
                text += f'{speaker}:{sentence}\n'
                    
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss)
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
    return results


def deal_dev_file(path, name='', mode='train'):
    results = []
    with path.open('r') as f:
        data = json.load(f)
        for dialogue in data:
            sentence = dialogue['history']
            output = f'医生:{dialogue["output"]}'
            sentence.append(output)
            text = '\n'.join(sentence)
            
            pos_tags = char_pos_tag(text)
            tokens, poss = zip(*pos_tags)
            assert len(tokens) == len(text) == len(poss)
            result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                      'name': name, 'mode': mode}
            results.append(result)
    return results

name = 'MedDG'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
for mode in ['train', 'dev']:
    abs_path = epath.Path(f'../data/MedDG-多轮对话/MedDG_{mode}.json')
    if mode == 'train':
        data = deal_train_file(abs_path, name=name, mode=mode)
    else:
        data = deal_dev_file(abs_path, name=name, mode=mode)
        
    print(f'mode: {mode} length: {len(data)}')
    if isinstance(data, list):
        for d in data:
            writer.write(f'{d}\n')
writer.close()

mode: train length: 17864
mode: dev length: 2747


# data/cMQA-master/answers.csv

In [380]:
def deal_file(question_path, answers_path, name='', mode='train'):
    questions = pd.read_csv(question_path)
    answers = pd.read_csv(answers_path)
    results = []
    for q_id in tqdm(questions['question_id'].unique()):
        try:
            question = questions[questions['question_id'] == q_id]['content'].item()
            answer = answers[answers['question_id'] == q_id]['content'].item()
        except Exception as error:
            print(f'error: {error} q_id: {q_id}')
            continue
        if not question.strip() or not answer.strip():
            continue
        text = f'患者:{question}\n医生:{answer}'
        
        pos_tags = char_pos_tag(text)
        tokens, poss = zip(*pos_tags)
        assert len(tokens) == len(text) == len(poss)
        result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                  'name': name, 'mode': mode}
        results.append(result)
    return results
     
name = 'cMQA'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
question_path = epath.Path(f'../data/cMQA-master/questions.csv')
answers_path = epath.Path(f'../data/cMQA-master/answers.csv')

data = deal_file(question_path, answers_path, name=name, mode='total')
print(f'mode: total length: {len(data)}')
if isinstance(data, str):
    data = [data]
for d in data:
    writer.write(f'{d}\n')
    
writer.close()

 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 263249/268591 [23:07<00:05, 901.26it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 262945
error: can only convert an array of size 1 to a Python scalar q_id: 262946
error: can only convert an array of size 1 to a Python scalar q_id: 262947
error: can only convert an array of size 1 to a Python scalar q_id: 262948
error: can only convert an array of size 1 to a Python scalar q_id: 262949
error: can only convert an array of size 1 to a Python scalar q_id: 262950
error: can only convert an array of size 1 to a Python scalar q_id: 262951
error: can only convert an array of size 1 to a Python scalar q_id: 262952
error: can only convert an array of size 1 to a Python scalar q_id: 262953
error: can only convert an array of size 1 to a Python scalar q_id: 262954
error: can only convert an array of size 1 to a Python scalar q_id: 262955
error: can only convert an array of size 1 to a Python scalar q_id: 262956
error: can only convert an array of size 1 to a Python scalar q_id: 262957
error: can only convert a

 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 263615/268591 [23:07<00:03, 1365.72it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 263325
error: can only convert an array of size 1 to a Python scalar q_id: 263326
error: can only convert an array of size 1 to a Python scalar q_id: 263327
error: can only convert an array of size 1 to a Python scalar q_id: 263328
error: can only convert an array of size 1 to a Python scalar q_id: 263329
error: can only convert an array of size 1 to a Python scalar q_id: 263330
error: can only convert an array of size 1 to a Python scalar q_id: 263331
error: can only convert an array of size 1 to a Python scalar q_id: 263332
error: can only convert an array of size 1 to a Python scalar q_id: 263333
error: can only convert an array of size 1 to a Python scalar q_id: 263334
error: can only convert an array of size 1 to a Python scalar q_id: 263335
error: can only convert an array of size 1 to a Python scalar q_id: 263336
error: can only convert an array of size 1 to a Python scalar q_id: 263337
error: can only convert a

 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 263989/268591 [23:07<00:02, 1619.36it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 263693
error: can only convert an array of size 1 to a Python scalar q_id: 263694
error: can only convert an array of size 1 to a Python scalar q_id: 263695
error: can only convert an array of size 1 to a Python scalar q_id: 263696
error: can only convert an array of size 1 to a Python scalar q_id: 263697
error: can only convert an array of size 1 to a Python scalar q_id: 263698
error: can only convert an array of size 1 to a Python scalar q_id: 263699
error: can only convert an array of size 1 to a Python scalar q_id: 263700
error: can only convert an array of size 1 to a Python scalar q_id: 263701
error: can only convert an array of size 1 to a Python scalar q_id: 263702
error: can only convert an array of size 1 to a Python scalar q_id: 263703
error: can only convert an array of size 1 to a Python scalar q_id: 263704
error: can only convert an array of size 1 to a Python scalar q_id: 263705
error: can only convert a

 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 264349/268591 [23:08<00:02, 1705.48it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 264070
error: can only convert an array of size 1 to a Python scalar q_id: 264071
error: can only convert an array of size 1 to a Python scalar q_id: 264072
error: can only convert an array of size 1 to a Python scalar q_id: 264073
error: can only convert an array of size 1 to a Python scalar q_id: 264074
error: can only convert an array of size 1 to a Python scalar q_id: 264075
error: can only convert an array of size 1 to a Python scalar q_id: 264076
error: can only convert an array of size 1 to a Python scalar q_id: 264077
error: can only convert an array of size 1 to a Python scalar q_id: 264078
error: can only convert an array of size 1 to a Python scalar q_id: 264079
error: can only convert an array of size 1 to a Python scalar q_id: 264080
error: can only convert an array of size 1 to a Python scalar q_id: 264081
error: can only convert an array of size 1 to a Python scalar q_id: 264082
error: can only convert a

 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 264726/268591 [23:08<00:02, 1797.73it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 264434
error: can only convert an array of size 1 to a Python scalar q_id: 264435
error: can only convert an array of size 1 to a Python scalar q_id: 264436
error: can only convert an array of size 1 to a Python scalar q_id: 264437
error: can only convert an array of size 1 to a Python scalar q_id: 264438
error: can only convert an array of size 1 to a Python scalar q_id: 264439
error: can only convert an array of size 1 to a Python scalar q_id: 264440
error: can only convert an array of size 1 to a Python scalar q_id: 264441
error: can only convert an array of size 1 to a Python scalar q_id: 264442
error: can only convert an array of size 1 to a Python scalar q_id: 264443
error: can only convert an array of size 1 to a Python scalar q_id: 264444
error: can only convert an array of size 1 to a Python scalar q_id: 264445
error: can only convert an array of size 1 to a Python scalar q_id: 264446
error: can only convert a

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 265100/268591 [23:08<00:01, 1834.93it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 264813
error: can only convert an array of size 1 to a Python scalar q_id: 264814
error: can only convert an array of size 1 to a Python scalar q_id: 264815
error: can only convert an array of size 1 to a Python scalar q_id: 264816
error: can only convert an array of size 1 to a Python scalar q_id: 264817
error: can only convert an array of size 1 to a Python scalar q_id: 264818
error: can only convert an array of size 1 to a Python scalar q_id: 264819
error: can only convert an array of size 1 to a Python scalar q_id: 264820
error: can only convert an array of size 1 to a Python scalar q_id: 264821
error: can only convert an array of size 1 to a Python scalar q_id: 264822
error: can only convert an array of size 1 to a Python scalar q_id: 264823
error: can only convert an array of size 1 to a Python scalar q_id: 264824
error: can only convert an array of size 1 to a Python scalar q_id: 264825
error: can only convert a

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 265468/268591 [23:08<00:01, 1831.44it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 265189
error: can only convert an array of size 1 to a Python scalar q_id: 265190
error: can only convert an array of size 1 to a Python scalar q_id: 265191
error: can only convert an array of size 1 to a Python scalar q_id: 265192
error: can only convert an array of size 1 to a Python scalar q_id: 265193
error: can only convert an array of size 1 to a Python scalar q_id: 265194
error: can only convert an array of size 1 to a Python scalar q_id: 265195
error: can only convert an array of size 1 to a Python scalar q_id: 265196
error: can only convert an array of size 1 to a Python scalar q_id: 265197
error: can only convert an array of size 1 to a Python scalar q_id: 265198
error: can only convert an array of size 1 to a Python scalar q_id: 265199
error: can only convert an array of size 1 to a Python scalar q_id: 265200
error: can only convert an array of size 1 to a Python scalar q_id: 265201
error: can only convert a

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 265836/268591 [23:08<00:01, 1832.78it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 265558
error: can only convert an array of size 1 to a Python scalar q_id: 265559
error: can only convert an array of size 1 to a Python scalar q_id: 265560
error: can only convert an array of size 1 to a Python scalar q_id: 265561
error: can only convert an array of size 1 to a Python scalar q_id: 265562
error: can only convert an array of size 1 to a Python scalar q_id: 265563
error: can only convert an array of size 1 to a Python scalar q_id: 265564
error: can only convert an array of size 1 to a Python scalar q_id: 265565
error: can only convert an array of size 1 to a Python scalar q_id: 265566
error: can only convert an array of size 1 to a Python scalar q_id: 265567
error: can only convert an array of size 1 to a Python scalar q_id: 265568
error: can only convert an array of size 1 to a Python scalar q_id: 265569
error: can only convert an array of size 1 to a Python scalar q_id: 265570
error: can only convert a

 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 266204/268591 [23:09<00:01, 1824.36it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 265924
error: can only convert an array of size 1 to a Python scalar q_id: 265925
error: can only convert an array of size 1 to a Python scalar q_id: 265926
error: can only convert an array of size 1 to a Python scalar q_id: 265927
error: can only convert an array of size 1 to a Python scalar q_id: 265928
error: can only convert an array of size 1 to a Python scalar q_id: 265929
error: can only convert an array of size 1 to a Python scalar q_id: 265930
error: can only convert an array of size 1 to a Python scalar q_id: 265931
error: can only convert an array of size 1 to a Python scalar q_id: 265932
error: can only convert an array of size 1 to a Python scalar q_id: 265933
error: can only convert an array of size 1 to a Python scalar q_id: 265934
error: can only convert an array of size 1 to a Python scalar q_id: 265935
error: can only convert an array of size 1 to a Python scalar q_id: 265936
error: can only convert a

 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 266571/268591 [23:09<00:01, 1822.35it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 266291
error: can only convert an array of size 1 to a Python scalar q_id: 266292
error: can only convert an array of size 1 to a Python scalar q_id: 266293
error: can only convert an array of size 1 to a Python scalar q_id: 266294
error: can only convert an array of size 1 to a Python scalar q_id: 266295
error: can only convert an array of size 1 to a Python scalar q_id: 266296
error: can only convert an array of size 1 to a Python scalar q_id: 266297
error: can only convert an array of size 1 to a Python scalar q_id: 266298
error: can only convert an array of size 1 to a Python scalar q_id: 266299
error: can only convert an array of size 1 to a Python scalar q_id: 266300
error: can only convert an array of size 1 to a Python scalar q_id: 266301
error: can only convert an array of size 1 to a Python scalar q_id: 266302
error: can only convert an array of size 1 to a Python scalar q_id: 266303
error: can only convert a

 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 266946/268591 [23:09<00:00, 1849.66it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 266661
error: can only convert an array of size 1 to a Python scalar q_id: 266662
error: can only convert an array of size 1 to a Python scalar q_id: 266663
error: can only convert an array of size 1 to a Python scalar q_id: 266664
error: can only convert an array of size 1 to a Python scalar q_id: 266665
error: can only convert an array of size 1 to a Python scalar q_id: 266666
error: can only convert an array of size 1 to a Python scalar q_id: 266667
error: can only convert an array of size 1 to a Python scalar q_id: 266668
error: can only convert an array of size 1 to a Python scalar q_id: 266669
error: can only convert an array of size 1 to a Python scalar q_id: 266670
error: can only convert an array of size 1 to a Python scalar q_id: 266671
error: can only convert an array of size 1 to a Python scalar q_id: 266672
error: can only convert an array of size 1 to a Python scalar q_id: 266673
error: can only convert a

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 267320/268591 [23:09<00:00, 1853.60it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 267038
error: can only convert an array of size 1 to a Python scalar q_id: 267039
error: can only convert an array of size 1 to a Python scalar q_id: 267040
error: can only convert an array of size 1 to a Python scalar q_id: 267041
error: can only convert an array of size 1 to a Python scalar q_id: 267042
error: can only convert an array of size 1 to a Python scalar q_id: 267043
error: can only convert an array of size 1 to a Python scalar q_id: 267044
error: can only convert an array of size 1 to a Python scalar q_id: 267045
error: can only convert an array of size 1 to a Python scalar q_id: 267046
error: can only convert an array of size 1 to a Python scalar q_id: 267047
error: can only convert an array of size 1 to a Python scalar q_id: 267048
error: can only convert an array of size 1 to a Python scalar q_id: 267049
error: can only convert an array of size 1 to a Python scalar q_id: 267050
error: can only convert a

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 267692/268591 [23:09<00:00, 1838.08it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 267408
error: can only convert an array of size 1 to a Python scalar q_id: 267409
error: can only convert an array of size 1 to a Python scalar q_id: 267410
error: can only convert an array of size 1 to a Python scalar q_id: 267411
error: can only convert an array of size 1 to a Python scalar q_id: 267412
error: can only convert an array of size 1 to a Python scalar q_id: 267413
error: can only convert an array of size 1 to a Python scalar q_id: 267414
error: can only convert an array of size 1 to a Python scalar q_id: 267415
error: can only convert an array of size 1 to a Python scalar q_id: 267416
error: can only convert an array of size 1 to a Python scalar q_id: 267417
error: can only convert an array of size 1 to a Python scalar q_id: 267418
error: can only convert an array of size 1 to a Python scalar q_id: 267419
error: can only convert an array of size 1 to a Python scalar q_id: 267420
error: can only convert a

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 268066/268591 [23:10<00:00, 1851.91it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 267781
error: can only convert an array of size 1 to a Python scalar q_id: 267782
error: can only convert an array of size 1 to a Python scalar q_id: 267783
error: can only convert an array of size 1 to a Python scalar q_id: 267784
error: can only convert an array of size 1 to a Python scalar q_id: 267785
error: can only convert an array of size 1 to a Python scalar q_id: 267786
error: can only convert an array of size 1 to a Python scalar q_id: 267787
error: can only convert an array of size 1 to a Python scalar q_id: 267788
error: can only convert an array of size 1 to a Python scalar q_id: 267789
error: can only convert an array of size 1 to a Python scalar q_id: 267790
error: can only convert an array of size 1 to a Python scalar q_id: 267791
error: can only convert an array of size 1 to a Python scalar q_id: 267792
error: can only convert an array of size 1 to a Python scalar q_id: 267793
error: can only convert a

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 268439/268591 [23:10<00:00, 1844.04it/s]

error: can only convert an array of size 1 to a Python scalar q_id: 268150
error: can only convert an array of size 1 to a Python scalar q_id: 268151
error: can only convert an array of size 1 to a Python scalar q_id: 268152
error: can only convert an array of size 1 to a Python scalar q_id: 268153
error: can only convert an array of size 1 to a Python scalar q_id: 268154
error: can only convert an array of size 1 to a Python scalar q_id: 268155
error: can only convert an array of size 1 to a Python scalar q_id: 268156
error: can only convert an array of size 1 to a Python scalar q_id: 268157
error: can only convert an array of size 1 to a Python scalar q_id: 268158
error: can only convert an array of size 1 to a Python scalar q_id: 268159
error: can only convert an array of size 1 to a Python scalar q_id: 268160
error: can only convert an array of size 1 to a Python scalar q_id: 268161
error: can only convert an array of size 1 to a Python scalar q_id: 268162
error: can only convert a

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 268591/268591 [23:10<00:00, 193.18it/s]


error: can only convert an array of size 1 to a Python scalar q_id: 268516
error: can only convert an array of size 1 to a Python scalar q_id: 268517
error: can only convert an array of size 1 to a Python scalar q_id: 268518
error: can only convert an array of size 1 to a Python scalar q_id: 268519
error: can only convert an array of size 1 to a Python scalar q_id: 268520
error: can only convert an array of size 1 to a Python scalar q_id: 268521
error: can only convert an array of size 1 to a Python scalar q_id: 268522
error: can only convert an array of size 1 to a Python scalar q_id: 268523
error: can only convert an array of size 1 to a Python scalar q_id: 268524
error: can only convert an array of size 1 to a Python scalar q_id: 268525
error: can only convert an array of size 1 to a Python scalar q_id: 268526
error: can only convert an array of size 1 to a Python scalar q_id: 268527
error: can only convert an array of size 1 to a Python scalar q_id: 268528
error: can only convert a

# data/medical-books-master

In [383]:
def find_pdfs(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files


def deal_file(path, name='', mode='train'):
    doc = fitz.open(path)
    lenXREF = doc.xrefLength()
    print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
    length = 0
    results = []
    lines = []
    for i, page in enumerate(doc):
        p = page.get_text().strip()
        for line in p.split('\n'):
            line = line.replace('. .', '')
            line = line.replace('�', '')
            line = line.replace('  ', '')
            if not line.strip(): continue
            length += len(line)
            if length > 1000:
                text = '\n'.join(lines)
                pos_tags = char_pos_tag(text)
                tokens, poss = zip(*pos_tags)
                assert len(tokens) == len(text) == len(poss)
                result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                          'name': name, 'mode': mode}
                results.append(result)
                length = len(line)
                lines = []
            lines.append(line)
    if len(lines):
        text = '\n'.join(lines)
        pos_tags = char_pos_tag(text)
        tokens, poss = zip(*pos_tags)
        assert len(tokens) == len(text) == len(poss)
        result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                  'name': name, 'mode': mode}
        results.append(result)
    return results


pdf_dir = epath.Path("../data/medical-books-master")
pdf_files = find_pdfs(pdf_dir)

name = 'books.pdf' 
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')

for pdf_file in pdf_files[1:]:
    data = deal_file(pdf_file, name=pdf_file, mode='total')
    print(f'length: {len(data)}')
    if isinstance(data, str):
        data = [data]
    for d in data:
        writer.write(f'{d}\n')


文件名:../data/medical-books-master/临床心电图详解与诊断/main.pdf, 页数: 613, 对象: 6947
length: 435
文件名:../data/medical-books-master/X线读片指南/main.pdf, 页数: 403, 对象: 2852
length: 244
文件名:../data/medical-books-master/ICU主治医师手册/main.pdf, 页数: 692, 对象: 7558
length: 793
文件名:../data/medical-books-master/内科治疗指南/main.pdf, 页数: 480, 对象: 3365
length: 444
文件名:../data/medical-books-master/临床药物治疗学/main.pdf, 页数: 336, 对象: 4559
length: 310
文件名:../data/medical-books-master/急诊内科学/main.pdf, 页数: 1904, 对象: 23445
length: 1862
文件名:../data/medical-books-master/药理学/main.pdf, 页数: 36, 对象: 333
length: 17
文件名:../data/medical-books-master/腹部影像解剖图谱/main.pdf, 页数: 157, 对象: 874
length: 10
文件名:../data/medical-books-master/内科疾病鉴别诊断学/main.pdf, 页数: 1268, 对象: 15583
length: 1250
文件名:../data/medical-books-master/免疫学/main.pdf, 页数: 245, 对象: 2060
length: 222
文件名:../data/medical-books-master/精神病学/main.pdf, 页数: 417, 对象: 4038
length: 447
文件名:../data/medical-books-master/病理学/main.pdf, 页数: 355, 对象: 5047
length: 278


In [386]:
def deal_file(docx_path, name='', mode='train'):
    doc = Document(docx_path)
    results = []
    length = 0
    lines = []
    for paragraph in doc.paragraphs:
        for line in paragraph.text.split('\n'):
            line = line.replace('. .', '')
            line = line.replace('�', '')
            line = line.replace('  ', '')
            if not line.strip(): continue
            length += len(line)
            if length > 1000:
                text = '\n'.join(lines)
                pos_tags = char_pos_tag(text)
                tokens, poss = zip(*pos_tags)
                assert len(tokens) == len(text) == len(poss)
                result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                          'name': name, 'mode': mode}
                results.append(result)
                length = len(line)
                lines = []
            lines.append(line)
    if len(lines):
        text = '\n'.join(lines)
        pos_tags = char_pos_tag(text)
        tokens, poss = zip(*pos_tags)
        assert len(tokens) == len(text) == len(poss)
        result = {'text': text, 'tokens': tokens, 'poss': poss, 'ners': [], 'length': len(text), 
                  'name': name, 'mode': mode}
        results.append(result)
    return results


path = '../data/医疗书word/14748064.docx'
text = deal_file(path, name=path, mode='total')

name = 'books.docx'
write_dir = epath.Path('data')
os.makedirs(write_dir, exist_ok=True)
writer_path = write_dir / f'{name}.jsonl'
writer = writer_path.open('w')
read_dir = epath.Path('../data/医疗书word/')
files = os.listdir(read_dir)
for file in files:
    abs_path = read_dir / file
    data = deal_file(abs_path, name=name, mode='total')
    print(f'file: {file} length: {len(data)}')
    if isinstance(data, str):
        data = [data]
    for d in data:
        writer.write(f'{d}\n')

writer.close()    

file: 14748064.docx length: 362
file: 糖尿病中医防治指南解读_12333379.docx length: 717
file: 仝小林经方新用十六讲.docx length: 284
file: 维新医集  仝小林中医新论.docx length: 129
file: 脾瘅新论  代谢综合征的中医认识及治疗_14568528.docx length: 346
file: 糖尿病并发症中医诊疗学_14421290.docx length: 428
file: 方药量效学_13279812.docx length: 420
file: 糖尿病并发症中医诊疗学.docx length: 426
file: 名老中医糖尿病辨治枢要.docx length: 193
file: 方药量效关系名医汇讲.docx length: 136
file: 糖尿病中医药临床循证实践指南_14056749(1).docx length: 247
file: 糖尿病中医药临床循证实践指南_14056749.docx length: 247
