In [2]:
# This notebook merges two JSON files containing historical NER data.
# Generate the input files first by running the 99_Parse_ls_ner_labels.ipynb notebook.


import json
# Paths of the two historical NER exports that are combined below.
file1 = 'Historical-NER-Dataset_gpt_ner_fmt_FULL_60.json'
file2 = 'Historical-NER-Dataset_gpt_ner_fmt_FULL_42.json'


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


data1 = _read_json(file1)
data2 = _read_json(file2)

# Concatenate the content of both files, file1 first.
# NOTE(review): `+` presumably joins two top-level JSON arrays (lists) — confirm.
merged_data = data1 + data2

In [None]:
# Split the merged data into one list per annotator.
# The id is read from the key 'anotator_id' (single "n" — that is the spelling
# looked up in the entries) and is matched as a string; both the plain form
# ('1', '2', '3') and the zero-padded form ('01', '02', '03') are accepted.

annotator_1_data = []
annotator_2_data = []
annotator_3_data = []

# Maps every accepted id spelling to the list collecting that annotator's entries.
_annotator_buckets = {
    '1': annotator_1_data, '01': annotator_1_data,
    '2': annotator_2_data, '02': annotator_2_data,
    '3': annotator_3_data, '03': annotator_3_data,
}

# Bookkeeping for entries that land in no group, so they are reported
# instead of disappearing silently.
missing_annotator_id_count = 0
unknown_annotator_ids = {}  # repr(id) -> number of skipped entries

for entry in merged_data:
    if 'anotator_id' not in entry:
        missing_annotator_id_count += 1
        continue

    annotator_id = entry['anotator_id']

    # Only string ids can match; the isinstance guard also keeps unhashable
    # values (e.g. lists) away from the dict lookup.
    bucket = _annotator_buckets.get(annotator_id) if isinstance(annotator_id, str) else None

    # Compare against None explicitly: an empty bucket list is falsy.
    if bucket is None:
        id_repr = repr(annotator_id)
        unknown_annotator_ids[id_repr] = unknown_annotator_ids.get(id_repr, 0) + 1
    else:
        bucket.append(entry)

# Stay quiet when every entry was assigned; otherwise say what was skipped.
if missing_annotator_id_count:
    print(f"Warning: skipped {missing_annotator_id_count} entries without an 'anotator_id' key")
if unknown_annotator_ids:
    print(f"Warning: skipped entries with unrecognised annotator ids (id -> count): {unknown_annotator_ids}")

# Collect the distinct 'context' values seen for each annotator, then keep
# only those that occur for all three of them.
contexts_annotator_1 = {item['context'] for item in annotator_1_data}
contexts_annotator_2 = {item['context'] for item in annotator_2_data}
contexts_annotator_3 = {item['context'] for item in annotator_3_data}
common_contexts = contexts_annotator_1 & contexts_annotator_2 & contexts_annotator_3

# Report the per-annotator totals and the size of the overlap.
print(f"Total unique contexts in annotator 1 data: {len(contexts_annotator_1)}")
print(f"Total unique contexts in annotator 2 data: {len(contexts_annotator_2)}")
print(f"Total unique contexts in annotator 3 data: {len(contexts_annotator_3)}")
print(f"Number of contexts common to all three annotators: {len(common_contexts)}")


# Restrict every annotator's entries to the contexts shared by all three.
def _only_common(entries):
    """Return, in original order, the entries whose 'context' is in ``common_contexts``."""
    return list(filter(lambda item: item['context'] in common_contexts, entries))


filtered_annotator_1_data = _only_common(annotator_1_data)
filtered_annotator_2_data = _only_common(annotator_2_data)
filtered_annotator_3_data = _only_common(annotator_3_data)

# Show how many entries each annotator keeps after filtering.
print(f"Filtered annotator 1 data: {len(filtered_annotator_1_data)} entries")
print(f"Filtered annotator 2 data: {len(filtered_annotator_2_data)} entries")
print(f"Filtered annotator 3 data: {len(filtered_annotator_3_data)} entries")




Total unique contexts in annotator 1 data: 1356
Total unique contexts in annotator 2 data: 1134
Total unique contexts in annotator 3 data: 1441
Number of contexts common to all three annotators: 121
Filtered annotator 1 data: 121 entries
Filtered annotator 2 data: 121 entries
Filtered annotator 3 data: 121 entries
last data for annotator 3:
{'context': 'členů Kinter Marcus , dr . , Raj - hrad 3 Knížek Jan B . , Libáň 33 Knüpfer Alfred , Broumov 11 Kober I . L . , Praha 34 Kopřiva J . F . , Pelhřimov 9 Kormout Lud . , Ústí nad Orlicí 23 Kotzura Karel , Klatovy 12 Kostka Č . , Zahrádka 3 Kračelík , Uh . Hradiště 5 Krause F . , Jílemnice 65 Kriegler F . , Jaroměř 13 Krušina J . , Červený Ko - 6 stelec Křeček J . , Česká Skalice 25 Křepelka J . , Končanice 4 Kučera J . , Napajedly 3 Kuchta T . , Orlov 11 Kupka J . , Třebechovice 18 Kvasnička J . V . , Třeboň 32 Lahodný , Popovice 3 Lácha M . J . , Netolice 20 3 Leger F . , Lužany Leicht B . , Budohostice 12 Liblinský J . , Královice 13 Lin

In [6]:
# Write each annotator's filtered entries to its own JSON file
# (UTF-8, non-ASCII characters kept as-is, two-space indentation).
_outputs = (
    ('common_annotator_1_data.json', filtered_annotator_1_data),
    ('common_annotator_2_data.json', filtered_annotator_2_data),
    ('common_annotator_3_data.json', filtered_annotator_3_data),
)

for _out_path, _entries in _outputs:
    with open(_out_path, 'w', encoding='utf-8') as f:
        json.dump(_entries, f, ensure_ascii=False, indent=2)