In [None]:
# Merge two JSON files containing historical NER data.
# The input files come from the 99_Parse_ls_ner_labels.ipynb notebook; run that notebook first.


import json
# Paths of the two JSON exports to be merged.
file1 = 'Historical-NER-Dataset_gpt_ner_fmt_FULL_60.json'
file2 = 'Historical-NER-Dataset_gpt_ner_fmt_FULL_42.json'

# Parse both files in order (file1 first, then file2).
loaded = []
for path in (file1, file2):
    with open(path, 'r', encoding='utf-8') as handle:
        loaded.append(json.load(handle))
data1, data2 = loaded

# Concatenate the two payloads, file1 records first.
# NOTE(review): `+` presumably relies on each file holding a top-level JSON array - confirm.
merged_data = data1 + data2


In [None]:
# Remove duplicate records, keyed on the "context" field (the first occurrence is kept).
# Records whose "context" is missing or empty are dropped as well; they are counted
# separately so that they are not reported as duplicates.
# NOTE(review): only "context" is compared - records that share a context but differ in
# other fields (e.g. "entity_label") are treated as duplicates too; confirm this is intended.

# Contexts already seen (a set: only membership matters)
unique_contexts = set()
deduplicated_data = []
missing_context_count = 0

# Process records
for record in merged_data:
    context = record.get("context")
    if not context:
        missing_context_count += 1
        continue
    if context not in unique_contexts:
        unique_contexts.add(context)
        deduplicated_data.append(record)

duplicate_count = len(merged_data) - len(deduplicated_data) - missing_context_count
print(f"Removed {duplicate_count} duplicate entries")
if missing_context_count:
    # Only printed when it applies, so the usual output is unchanged.
    print(f"Dropped {missing_context_count} entries with missing or empty context")
print(f"Total entries after deduplication: {len(deduplicated_data)}")

Removed 385 duplicate entries
Total entries after deduplication: 4014


In [22]:
# Drop records whose context is too long.
# Length is measured in whitespace-separated tokens of the "context" string;
# a record without a "context" key counts as zero words and is kept.
max_words = 40

filtered_data = [
    rec
    for rec in deduplicated_data
    if len(rec.get("context", "").split()) <= max_words
]
# Everything that did not pass the length check was removed.
removed_count = len(deduplicated_data) - len(filtered_data)

print(f"Removed {removed_count} records with context longer than {max_words} words")
print(f"Total entries after filtering long contexts: {len(filtered_data)}")

# Show the first surviving record as a sanity check.
print(filtered_data[0])

Removed 2919 records with context longer than 40 words
Total entries after filtering long contexts: 1095
{'context': 'kteří ve šlechetné dobročinnosti spolku hluchoněmých sv . Františka Saleského příspěvky věnovali :', 'end_position': [8, 9], 'entity_label': 'PER', 'impossible': False, 'qas_id': '0.1', 'query': 'person entities are named persons or family.', 'span_position': ['6;8', '9;9'], 'start_position': [6, 9], 'data_source': 'NER_02', 'file_name': '00000001__uuid:09ae4994-0ad9-4071-9f81-7df50a08c84a__r003.txt', 'anotator_id': '02'}


In [None]:
# Split the data into two parts - test_part_amount records for test and the rest for training
# and renumber the "qas_id" field in each part

def renumber_qas_ids(data):
    """Return the records of ``data`` with sequential ``qas_id`` values.

    The record at position ``i`` gets ``qas_id == f"{i}.1"``; every other
    field is carried over unchanged and keeps its position in the dict.

    The input records are not modified: each output record is a shallow copy.
    This matters because slices of a list share their dict objects with the
    list they were taken from, so renumbering in place would silently rewrite
    the ids in the upstream lists too.

    Args:
        data: Iterable of dict records.

    Returns:
        A new list of new dicts, in the same order as ``data``.
    """
    # NOTE(review): the ".1" suffix is hard-coded; any different suffix in an
    # existing qas_id is discarded - confirm that is acceptable for this dataset.
    return [{**record, "qas_id": f"{i}.1"} for i, record in enumerate(data)]

# Number of leading records that go into the test split.
test_part_amount = 114

# The head of the filtered list becomes the test split and the remainder the
# training split; each split gets its own qas_id numbering starting at 0.
test_part = renumber_qas_ids(filtered_data[:test_part_amount])
train_part = renumber_qas_ids(filtered_data[test_part_amount:])

print(f"First part: {len(test_part)} records")
print(f"Second part: {len(train_part)} records")

# Show the last test record to check the renumbering.
print("First part sample:")
print(test_part[-1])




First part: 114 records
Second part: 981 records
First part sample:
{'context': 'o K Soloturnu wogjn přiklopotá , zatraubiw dj ta slowa : č Wzali gste Ludwjka do brány , hněwu se třeste Cjsa -', 'end_position': [14], 'entity_label': 'PER', 'impossible': False, 'qas_id': '113.1', 'query': 'person entities are named persons or family.', 'span_position': ['14;14'], 'start_position': [14], 'data_source': 'NER_02', 'file_name': '00000344__uuid:d64ebace-2e95-422d-a3b7-ccb014fb933e__r004.txt', 'anotator_id': '02'}


In [24]:
# Write both splits to disk as pretty-printed UTF-8 JSON.
# The split size in the file names is taken from test_part_amount rather than being
# typed in again, so the names cannot go stale when the split size is changed.
output_file_test = f'mrc-ner.test.{test_part_amount}.filtered'
output_file_train = f'mrc-ner.train.{test_part_amount}.filtered'

# ensure_ascii=False keeps non-ASCII characters readable in the output files.
with open(output_file_test, 'w', encoding='utf-8') as f:
    json.dump(test_part, f, ensure_ascii=False, indent=2)

with open(output_file_train, 'w', encoding='utf-8') as f:
    json.dump(train_part, f, ensure_ascii=False, indent=2)