In [27]:
import json
import os
from tqdm import tqdm

In [3]:
def read_file(file_path):
    """Чтение содержимого файла."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [17]:
import json

def parse_ann_file(ann_content):
    """Парсинг .ann файла для извлечения аннотаций, событий и перекрытий."""
    entities = {}
    events = {}
    relations = {}
    overlaps = []

    for line in ann_content.strip().split('\n'):
        cleaned_line = line.replace('\u2005', ' ')
        parts = cleaned_line.split('\t')[:3]
        
        if parts[0].startswith('T'):
            try:
                entity_id, entity_info, entity_text = parts
            except ValueError:
                print(f"Ошибка при обработке строки: {line}")
                continue
            entity_type, *position_parts = entity_info.split(' ')

            positions = []
            all_position = []
            for part in position_parts:
                f = part.split(';')
                all_position.extend(f)
            for i in range(0, len(all_position) - 1, 2):
                positions.append((all_position[i], all_position[i+1]))
            entities[entity_id] = {
                'type': entity_type,
                'positions': positions,
                'text': entity_text
            }
            
        elif parts[0].startswith('R'):
            relation_id, relation_info = parts[0], parts[1]
            relation_type, args = relation_info.split(' ')[0], relation_info.split(' ')[1:]
            if relation_type not in relations:
                relations[relation_type] = []
            arg_sum = []
            for arg in args:
                arg_sum.append(arg.split(':')[1])
            relations[relation_type].append(arg_sum)
            
        elif parts[0].startswith('E'):
            event_id, event_info = parts[0], parts[1]
            event_type, args = event_info.split(':')[0], event_info.split(':')[1:][0].split(' ')[0]
            if event_type not in events:
                events[event_type] = []
            events[event_type].append(args)
            
        elif parts[0].startswith('*'):
            overlap_info = parts[1].split(' ')
            overlap_type, event_ids = overlap_info[0], overlap_info[1:]
            overlaps.append({'type': overlap_type, 'events': event_ids})

    return entities, events, relations, overlaps

def create_json_from_files2(txt_file_path, ann_file_path):
    txt_content = read_file(txt_file_path)
    ann_content = read_file(ann_file_path)

    entities, events, relations, overlaps = parse_ann_file(ann_content)
    words = []
    for entity_id, entity_data in entities.items():
        words.append({
            'text': entity_data['text'],
            'positions': entity_data['positions'],
            'tags': [entity_data['type']],
            'id': entity_id
        })
    
    json_data = {
        'title': txt_content,
        'full_text': txt_content,
        'words': words,
        'events': events,
        'relations': relations,
        'overlaps': overlaps
    }
    return json.dumps(json_data, indent=2, ensure_ascii=False)

In [24]:
import json

def parse_ann_file(ann_content):
    entities = {}
    events = {}
    relations = {}
    overlaps = []

    for line in ann_content.strip().split('\n'):
        cleaned_line = line.replace('\u2005', ' ')
        parts = cleaned_line.split('\t')[:3]

        if parts[0].startswith('T'):
            entity_id, entity_info, entity_text = parts
            entity_type, positions = entity_info.split(' ')[0], ' '.join(entity_info.split(' ')[1:])
            entities[entity_id] = {'type': entity_type, 'positions': positions, 'text': entity_text}
        elif parts[0].startswith('R'):
            relation_id, relation_info = parts[0], parts[1]
            relation_type, arg1, arg2 = relation_info.split(' ')[0], *relation_info.split(' ')[1:]
            arg1_id, arg1_entity = arg1.split(':')
            arg2_id, arg2_entity = arg2.split(':')
            relations[relation_id] = {'type': relation_type, 'arg1': (arg1_id, arg1_entity), 'arg2': (arg2_id, arg2_entity)}
        elif parts[0].startswith('E'):
            event_id, event_info = parts[0], parts[1]
            event_info_parts = event_info.split(':')
            if len(event_info_parts) > 1:
                event_type = event_info_parts[0]
                args = event_info_parts[1].split(' ')
                if len(args) > 1:
                    entity_id = args[0]
                    if event_type not in events:
                        events[event_type] = []
                    events[event_type].append({'id': event_id, 'entity_id': entity_id})
                else:
                    print(f"Некорректный формат аргументов события: {event_info}")
            else:
                print(f"Некорректный формат события: {event_info}")

        elif parts[0].startswith('*'):
            overlap_id, overlap_info = parts[0], parts[1]
            overlap_type, *entity_ids = overlap_info.split(' ')
            overlaps.append({'id': overlap_id, 'type': overlap_type, 'entity_ids': entity_ids})

    return entities, events, relations, overlaps

def create_json_from_files(txt_file_path, ann_file_path):
    def read_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    txt_content = read_file(txt_file_path)
    ann_content = read_file(ann_file_path)
    
    filename = os.path.basename(txt_file_path)

    entities, events, relations, overlaps = parse_ann_file(ann_content)

    json_data = {
        'title': filename,
        'full_text': txt_content,
        'entities': list(entities.values()),
        'events': events,
        'relations': list(relations.values()),
        'overlaps': overlaps
    }
    return json.dumps(json_data, indent=2, ensure_ascii=False)


In [25]:
txt_file_path = "/home/jupyter/datasphere/project/maccrobat/18/25661749.txt"
ann_file_path = "/home/jupyter/datasphere/project/maccrobat/18/25661749.ann"

json_output = create_json_from_files(txt_file_path, ann_file_path)

In [26]:
output_file_path = "json_output_new.json"
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(json_output)

## Dataset 18

In [29]:
def process_files_in_directory(directory):
    cases = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            base = os.path.splitext(filename)[0]
            print(filename)
            txt_file_path = os.path.join(directory, filename)
            ann_file_path = os.path.join(directory, base + ".ann")
            if os.path.exists(ann_file_path):
                txt_content = read_file(txt_file_path)
                ann_content = read_file(ann_file_path)
                case_json = create_json_from_files(txt_file_path, ann_file_path)
                cases.append(json.loads(case_json))
    return cases

directory_path = "/home/jupyter/datasphere/project/maccrobat/18"
cases_json = process_files_in_directory(directory_path)

15939911.txt
16778410.txt
18236639.txt
18258107.txt
17803823.txt
18416479.txt
18561524.txt
18666334.txt
18787726.txt
18815636.txt
19009665.txt
19214295.txt
19307547.txt
19610147.txt
19816630.txt
19860006.txt
19860007.txt
19860925.txt
20146086.txt
20671919.txt
20977862.txt
21067996.txt
21129213.txt
Некорректный формат аргументов события: Clinical_event:T97
21254744.txt
21308977.txt
21477357.txt
21505579.txt
21527041.txt
21672201.txt
Некорректный формат аргументов события: Diagnostic_procedure:T110
21720478.txt
21923918.txt
22218279.txt
22514576.txt
22515939.txt
22520024.txt
22665582.txt
22719160.txt
22781096.txt
22791498.txt
22814979.txt
23033875.txt
23035161.txt
23076693.txt
23077697.txt
23124805.txt
23155491.txt
23242090.txt
23312850.txt
23468586.txt
23678274.txt
23864579.txt
23897372.txt
24043987.txt
24161539.txt
24294397.txt
24518095.txt
24526194.txt
24654246.txt
24781756.txt
24898994.txt
24957905.txt
25023062.txt
25024632.txt
25139918.txt
25155594.txt
25210224.txt
25246819.txt
2529

In [30]:
def save_cases_to_json(cases, output_file_path):
    """
    Сохранение списка случаев в JSON-файл.
    
    :param cases: Список случаев для сохранения.
    :param output_file_path: Путь к файлу для сохранения.
    """
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(cases, f, ensure_ascii=False, indent=2)
        
output_json_path = "/home/jupyter/datasphere/project/all_cases_output_with_title.json"

save_cases_to_json(cases_json, output_json_path)

## Dataset 20

In [31]:
directory_path = "/home/jupyter/datasphere/project/maccrobat/20"
cases_json = process_files_in_directory(directory_path)

15939911.txt
16778410.txt
17803823.txt
18236639.txt
18258107.txt
18416479.txt
18561524.txt
18787726.txt
18815636.txt
19009665.txt
19214295.txt
19307547.txt
19610147.txt
19816630.txt
19860006.txt
19860007.txt
19860925.txt
20146086.txt
18666334.txt
20671919.txt
20977862.txt
21067996.txt
21129213.txt
21254744.txt
21308977.txt
21477357.txt
21505579.txt
21527041.txt
21672201.txt
21720478.txt
21923918.txt
22218279.txt
22514576.txt
22515939.txt
22520024.txt
22665582.txt
22719160.txt
22781096.txt
22791498.txt
22814979.txt
23033875.txt
23035161.txt
23076693.txt
23077697.txt
23124805.txt
23155491.txt
23242090.txt
23312850.txt
23468586.txt
23678274.txt
23864579.txt
23897372.txt
24043987.txt
24161539.txt
24294397.txt
24518095.txt
24526194.txt
24654246.txt
24781756.txt
24898994.txt
24957905.txt
25023062.txt
25024632.txt
25139918.txt
25155594.txt
25210224.txt
25246819.txt
25293719.txt
25295501.txt
25370695.txt
25410034.txt
25410883.txt
25572898.txt
25661749.txt
25721834.txt
25743872.txt
25759562.txt

In [32]:
output_json_path = "/home/jupyter/datasphere/project/all_cases_output_with_title_20.json"

save_cases_to_json(cases_json, output_json_path)