In [None]:
import json
import urllib
import urllib.parse
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Collect every annotation export matching "project-42-at-*.json" in the
# working directory. The paths are sorted and the last one is used as the
# latest export (presumably the suffix is a sortable timestamp - verify).
available_annotations = sorted(Path(".").glob("project-42-at-*.json"))
if not available_annotations:
    # Fail with an actionable message instead of a bare IndexError on [-1].
    raise FileNotFoundError(
        "No annotation export matching 'project-42-at-*.json' found in "
        f"{Path('.').resolve()}"
    )
latest_annotation = available_annotations[-1]
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
ls_tasks = json.loads(latest_annotation.read_text(encoding='utf-8'))

def extract_annotations_form_task(task: list[dict]):
    """Flatten the labelled spans of exported annotation tasks into a DataFrame.

    Args:
        task: List of task dicts (the singular parameter name is kept for
            backward compatibility). Each task must provide ``id``,
            ``data["text"]`` (a URL whose ``d`` query parameter holds the path
            of the annotated file) and ``annotations``; each annotation must
            provide ``id`` and a ``result`` list.

    Returns:
        DataFrame with one row per result whose ``from_name`` is ``'label'``
        and ``to_name`` is ``'text'``. Columns: ``task_id``, ``annotation_id``,
        ``result_id``, ``file_name``, ``data_split``, ``data_source``,
        ``start_char``, ``end_char``, ``label``. The columns are present even
        when no span is found.

    Raises:
        KeyError: If a task lacks one of the required keys or its URL has no
            ``d`` query parameter.
    """
    columns = [
        'task_id', 'annotation_id', 'result_id',
        'file_name', 'data_split', 'data_source',
        'start_char', 'end_char', 'label',
    ]
    annotations = []
    # Iterate over the argument, not over a notebook-level global.
    for ls_task in task:
        url = urllib.parse.urlparse(ls_task["data"]["text"])
        query = urllib.parse.parse_qs(url.query)
        # parse_qs already percent-decodes once; the extra unquote is kept so
        # that doubly encoded paths are still resolved as before.
        data_path = Path(urllib.parse.unquote(query['d'][0]))
        for annotation in ls_task["annotations"]:
            for result in annotation["result"]:
                # Only span labels attached to the text control are relevant.
                if result['from_name'] != 'label' or result['to_name'] != 'text':
                    continue
                value = result['value']
                annotations.append({
                    'task_id': ls_task["id"],
                    'annotation_id': annotation["id"],
                    'result_id': result["id"],
                    # Path layout is <data_source>/<data_split>/<file_name>.
                    'file_name': data_path.name,
                    'data_split': data_path.parent.name,
                    'data_source': data_path.parent.parent.name,
                    'start_char': value['start'],
                    'end_char': value['end'],
                    # Only the first label of a result is kept.
                    'label': value['labels'][0],
                })
    return pd.DataFrame(annotations, columns=columns)
    

def load_data_zip(path_to_data: Path):
    """Read every file stored in a zip archive as UTF-8 text.

    Args:
        path_to_data: Path of the zip archive (anything accepted by
            ``zipfile.ZipFile``).

    Returns:
        DataFrame with one row per archived file and the columns
        ``file_name``, ``data_split`` (name of the parent directory),
        ``data_source`` (name of the grandparent directory) and ``text``.
        The columns are present even for an archive without files.

    Raises:
        UnicodeDecodeError: If a member is not valid UTF-8.
    """
    data = []
    with zipfile.ZipFile(path_to_data, 'r') as zip_ref:
        for member in zip_ref.infolist():
            # Archives may list directories as separate entries; reading them
            # would yield empty "documents", so they are skipped.
            if member.is_dir():
                continue
            file_path = Path(member.filename)
            data.append({
                'file_name': file_path.name,
                'data_split': file_path.parent.name,
                'data_source': file_path.parent.parent.name,
                'text': zip_ref.read(member).decode('utf-8'),
            })
    return pd.DataFrame(data, columns=['file_name', 'data_split', 'data_source', 'text'])


def convert_char_positions_to_word_positions(result_id: str, start_char: int, end_char: int, word_mapping: np.ndarray):
    """Translate a character span into word indices via a lookup array.

    Args:
        result_id: Identifier copied unchanged into the returned dict.
        start_char: Index of the first character of the span.
        end_char: End of the span; treated as exclusive, i.e. the last
            character looked up is ``end_char - 1``.
        word_mapping: Array indexed by character position that yields the
            word index of that character.

    Returns:
        Dict with the keys ``result_id``, ``start_word`` and ``end_word``.
    """
    first_word = word_mapping[start_char]
    last_word = word_mapping[end_char - 1]
    return dict(result_id=result_id, start_word=first_word, end_word=last_word)


def get_word_mapping(text: str, per_line: bool = False):
    """Tokenise ``text`` and map every character to the index of its token.

    Whitespace separates tokens, and each listed punctuation character is a
    token of its own. A whitespace character is mapped to the index of the
    token preceding it (``-1`` if there is none yet).

    Args:
        text: Text to tokenise.
        per_line: If true, token numbering restarts after every ``'\\n'``;
            the returned token string is still one continuous sequence.

    Returns:
        Tuple ``(tokens, mapping)``: the tokens joined by single spaces, and
        an integer array with one token index per character of ``text``.
    """
    punctuation = frozenset(".,!?:;-()[]{}<>\"'`’‘“”„‟‛‹›«»—=")
    tokens = []
    char_to_token = []
    current = -1
    inside_word = False
    for ch in text:
        if per_line and ch == '\n':
            # The line break still belongs to the last token of its line;
            # numbering starts over for the next line.
            char_to_token.append(current)
            current = -1
            inside_word = False
            continue
        if ch.isspace():
            char_to_token.append(current)
            inside_word = False
            continue
        is_punctuation = ch in punctuation
        if inside_word and not is_punctuation:
            # Continue the word that is currently being built.
            char_to_token.append(current)
            tokens[-1] += ch
        else:
            # Either a punctuation token or the first character of a new word.
            current += 1
            char_to_token.append(current)
            tokens.append(ch)
            inside_word = not is_punctuation
    word_mapping = np.array(char_to_token, dtype=int)
    assert len(word_mapping) == len(text)
    return ' '.join(tokens), word_mapping


# Load the raw documents and pre-compute, per document, the space-joined token
# string ('word_text') and the character -> token index array ('word_mapping').
data = load_data_zip(Path("NER_xmrkva04.zip"))
data[['word_text', 'word_mapping']] = data['text'].apply(lambda x: pd.Series(get_word_mapping(x, per_line=False)))
data = data.set_index(['file_name', 'data_split', 'data_source'])
ls_annotations = extract_annotations_form_task(ls_tasks)

# Convert every annotated character span to token positions. The result is
# aligned with ls_annotations by row index rather than merged on 'result_id':
# a merge multiplies rows whenever an id occurs more than once, and fails when
# there are no annotations at all.
converted_start_ends = pd.DataFrame(
    [
        convert_char_positions_to_word_positions(
            row['result_id'], row['start_char'], row['end_char'],
            data.loc[(row['file_name'], row['data_split'], row['data_source'])]['word_mapping'],
        )
        for _, row in ls_annotations.iterrows()
    ],
    index=ls_annotations.index,
    columns=['result_id', 'start_word', 'end_word'],
)
ls_annotations = ls_annotations.join(converted_start_ends[['start_word', 'end_word']])
ls_annotations

In [2]:
# Report how many distinct tasks contributed at least one extracted annotation.
task_ids = ls_annotations['task_id']
unique_task_count = task_ids.nunique()
print(f"Number of unique tasks: {unique_task_count}")

Number of unique tasks: 3075


In [3]:
def to_char_html(text: str, annotations: pd.DataFrame):
    """Render ``text`` as HTML with every annotated character span in red.

    Args:
        text: The raw text. It is inserted as-is, without HTML escaping.
        annotations: Frame with the integer columns ``start_char`` (first
            character of a span) and ``end_char`` (exclusive end of a span).

    Returns:
        ``text`` with an opening ``<span>`` before each span start and a
        closing ``</span>`` after each span's last character.
    """
    start_char_tag = '<span style="color: red">'
    end_char_tag = '</span>'
    # One slot per character plus a trailing slot: end_char is exclusive, so a
    # span reaching the end of the text closes at index len(text).
    tag_list = [[] for _ in range(len(text) + 1)]
    for _, row in annotations.iterrows():
        tag_list[row['start_char']].append(start_char_tag)
        # Closing tags go first so a span ending here closes before a span
        # starting at the same position opens.
        tag_list[row['end_char']].insert(0, end_char_tag)
    pieces = [''.join(tags) + char for tags, char in zip(tag_list, text)]
    pieces.append(''.join(tag_list[-1]))
    return ''.join(pieces)

def to_word_html(text: str, annotations: pd.DataFrame):
    """Render space-separated tokens as HTML with annotated token spans in red.

    Args:
        text: Tokens joined by single spaces.
        annotations: Frame with the integer columns ``start_word`` (first
            token of a span) and ``end_word`` (last token of a span,
            inclusive).

    Returns:
        The tokens joined by spaces, with an opening ``<span>`` in front of
        each span's first token and a closing ``</span>`` in front of the
        token following the span. The trailing tag slot is always joined in,
        so the result ends with a space followed by any closing tags.
    """
    tokens = text.split(' ')
    open_tag = '<span style="color: red">'
    close_tag = '</span>'
    # One slot per token plus one extra for spans that end on the last token.
    slots = [[] for _ in range(len(tokens) + 1)]
    for _, span in annotations.iterrows():
        slots[span['start_word']].append(open_tag)
        slots[span['end_word'] + 1].insert(0, close_tag)
    rendered = [''.join(tags) + token for tags, token in zip(slots, tokens)]
    rendered.append(''.join(slots[-1]))
    return ' '.join(rendered)

In [None]:
# Raw annotation label -> entity tag used in headings and in the export.
entities_map = {
    'per' : 'PER'
}

# Number of (document, label) groups rendered below; keeps the output small.
MAX_PREVIEWS = 5

preview_groups = (
    ls_annotations[ls_annotations['label'].isin(entities_map.keys())]
    .sort_values(by=['file_name'])
    .groupby(['file_name', 'data_split', 'data_source', 'label'])
)
for c, ((file_name, data_split, data_source, label), annotations_df) in enumerate(preview_groups):
    if c >= MAX_PREVIEWS:
        break
    data_row = data.loc[(file_name, data_split, data_source)]
    # Show the same spans twice - on the raw characters and on the tokenised
    # text - so the char -> word conversion can be checked by eye.
    display(HTML(f"<h4>{entities_map[label]} - Chars</h4>" +
            to_char_html(data_row['text'], annotations_df).replace('\n', '<br>')))
    display(HTML(f"<h4>{entities_map[label]} - Words</h4>" +
            to_word_html(data_row['word_text'], annotations_df).replace('\n', '<br>')))

In [None]:
# Build one record per (file, 'per') group. Files without any 'per'
# annotation produce no group and therefore no record in this variant.
gpt_ner_fmt = []

per_id = 1
per_groups = (
    ls_annotations[ls_annotations['label'] == 'per']
    .sort_values(by=['file_name'])
    .groupby(['file_name', 'data_split', 'data_source', 'label'])
)
# enumerate() numbers the documents so that every record gets its own qas_id
# instead of all of them sharing "0.1".
for doc_id, ((file_name, data_split, data_source, label), annotations_df) in enumerate(per_groups):
    data_row = data.loc[(file_name, data_split, data_source)]
    gpt_ner_fmt.append({
        'context': data_row['word_text'],
        "end_position" : [int(x) for x in annotations_df['end_word']],
        "entity_label" : entities_map[label],
        # Groups are never empty here, so this is always False.
        "impossible" : len(annotations_df) == 0,
        "qas_id" : f"{doc_id}.{per_id}",
        "query": "person entities are named persons or family.",
        "span_position": [f'{int(x["start_word"])};{int(x["end_word"])}' for _, x in annotations_df[['start_word', 'end_word']].iterrows()],
        "start_position" : [int(x) for x in annotations_df['start_word']],
        "data_source" : data_source,
        "file_name" : file_name,
        "anotator_id" : data_split
    })

gpt_ner_fmt

In [None]:
gpt_ner_fmt = []

doc_id = 0

# One record per file present in ls_annotations, whether or not that file has
# any 'per' span; files without one are flagged as "impossible".
annotations_by_file = (
    ls_annotations
    .sort_values(by=['file_name'])
    .groupby(['file_name', 'data_split', 'data_source'])
)
for (file_name, data_split, data_source), file_annotations in annotations_by_file:
    data_row = data.loc[(file_name, data_split, data_source)]

    # Keep only the person annotations of this file.
    per_annotations = file_annotations[file_annotations['label'] == 'per']
    start_words = [int(x) for x in per_annotations['start_word']]
    end_words = [int(x) for x in per_annotations['end_word']]

    gpt_ner_fmt.append({
        'context': data_row['word_text'],
        "end_position": end_words,
        "entity_label": entities_map['per'],
        "impossible": len(per_annotations) == 0,
        "qas_id": f"{doc_id}.1",
        "query": "person entities are named persons or family.",
        "span_position": [f'{first};{last}' for first, last in zip(start_words, end_words)],
        "start_position": start_words,
        "data_source": data_source,
        "file_name": file_name,
        "anotator_id": data_split
    })

    doc_id += 1

gpt_ner_fmt

In [None]:
# Persist the records as pretty-printed UTF-8 JSON; non-ASCII characters are
# written verbatim instead of as \u escapes.
output_path = Path('Historical-NER-Dataset_gpt_ner_fmt_FULL_42.json')
with output_path.open('w', encoding='utf-8') as out_file:
    json.dump(gpt_ner_fmt, out_file, indent=2, ensure_ascii=False)