In [181]:
import os
import sys
import json
import pandas as pd
from lxml import etree
from tqdm import tqdm

In [184]:
# Location of the annotated discharge summaries and one sample XML/JSON pair
# used below to inspect the data format.
dataset = '../dataset/Эпикризы_финал_11_10_2023/'
json_name = dataset + 'Эпикриз_211358327_v1_result.json'
xml_name = dataset + 'Эпикриз_211358327_v1.xml'

# The labels contain Cyrillic text; read them as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open(json_name, 'r', encoding='utf-8') as f:
    test_labels = json.load(f)

# Show the first label to see which fields are available.
test_labels[0]

{'id': 'P_Ubr_uO2isWwO6FxeG7d',
 'end': 339,
 'code': '',
 'name': 'гипергликемия - 12 ммоль/л',
 'start': 313,
 'xPath': '/ClinicalDocument/component/structuredBody/component[4]/section/text',
 'decorCode': 'symptom'}

In [186]:
# Parse the sample document and print the paths of its first 15 nodes to get
# a feel for how lxml addresses elements in this file.
tree = etree.parse(xml_name)
root = tree.getroot()

for _, element in zip(range(15), root.iter()):
    print(tree.getpath(element))

/*
/*/comment()[1]
/*/comment()[2]
/*/comment()[3]
/*/comment()[4]
/*/comment()[5]
/*/comment()[6]
/*/*[1]
/*/comment()[7]
/*/*[2]
/*/comment()[8]
/*/comment()[9]
/*/comment()[10]
/*/*[3]
/*/comment()[11]


In [141]:
def get_text(tree, symptom, start, end):
    """
    Find the text of the XML element that contains an annotated symptom.

    Inputs:
      tree: xml tree (any object exposing an lxml-style ``xpath`` method)
      symptom: text of symptom
      start: offset of the symptom's first character in the left-stripped
        element text
      end: offset just past the symptom's last character
    Return
      text: left-stripped text of the element holding the symptom, or None
        when no suitable element is found
    """
    # Pass the symptom as an XPath variable instead of interpolating it into
    # the expression, so quotes inside the symptom cannot break the query.
    nodes = tree.xpath('//*[contains(text(), $symptom)]', symptom=symptom)

    if len(nodes) == 1:
        # A unique match is accepted without checking the offsets. Strip it the
        # same way as in the multi-match branch so the returned text stays
        # aligned with the offsets.
        text = nodes[0].text
        return None if text is None else text.lstrip()

    for node in nodes:
        # ``text()`` can match a tail text node, in which case ``node.text``
        # itself is None and there is nothing to compare against.
        if node.text is None:
            continue
        text = node.text.lstrip()
        if len(text) < end:
            continue
        if text[start:end] == symptom:
            return text

    return None
    

In [153]:
def proccess_json(json_name, xml_name):
    """
    Convert the labels of one document into Label Studio tasks.

    Inputs:
    - json_name: path to the JSON file with the labels; each label is read
      through its 'name', 'start' and 'end' keys
    - xml_name: path to the XML document the labels refer to
    Return:
    - result: list of Label Studio task dicts, one per distinct element text,
      each carrying every annotation found for that text
    """
    tree = etree.parse(xml_name)
    # The labels contain Cyrillic text; read them as UTF-8 explicitly instead
    # of relying on the platform's default encoding.
    with open(json_name, 'r', encoding='utf-8') as f:
        labels = json.load(f)

    result = {}
    for label in labels:
        symptom = label['name']
        start = label['start']
        end = label['end']
        text = get_text(tree, symptom, start, end)
        if text is None:
            # The symptom could not be located in the XML. Skip it rather than
            # emit a task whose text is null.
            continue
        # Labels that share the same source text are grouped into one task.
        task = result.setdefault(text, {
            "annotations": [{"result": []}],
            "data": {"text": text}
        })
        annotation = {
            "value": {
                "start": start,
                "end": end,
                "text": symptom,
                "labels": [
                    "Симптом"
                ]
            },
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "origin": "manual"
        }
        task['annotations'][0]['result'].append(annotation)

    return list(result.values())

In [175]:
def proccess_dataset(dataset):
    """
    Convert every XML document in a directory into Label Studio tasks.

    Inputs:
    - dataset: path to a directory holding ``<name>.xml`` documents next to
      their ``<name>_result.json`` label files
    Return:
    - result: list with the tasks of all documents, in file-name order
    """
    result = []
    # Sort the listing so the output order does not depend on the file system.
    for path in tqdm(sorted(os.listdir(dataset))):
        # Only XML documents drive the conversion; checking the full extension
        # avoids picking up other files whose name merely ends in "l".
        if not path.endswith('.xml'):
            continue
        xml_name = os.path.join(dataset, path)
        json_name = os.path.splitext(xml_name)[0] + '_result.json'
        result.extend(proccess_json(json_name, xml_name))
    return result

In [176]:
# Convert every document found under `dataset` and collect the resulting tasks.
result = proccess_dataset(dataset)

100%|██████████| 196/196 [00:00<00:00, 204.74it/s]


In [179]:
# ensure_ascii=False writes Cyrillic characters verbatim, so the file must be
# opened as UTF-8 explicitly; the platform default may not be able to encode
# them. Plain 'w' is enough because the file is never read back here.
with open('../dataset/final_labels.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False)