Script that merges annotations made in Lawnotation.org. All input files must use the same labelset: the merged file keeps the labelset of the first input file.

In [3]:
import os

In [4]:
# Specify the input folder and the output file.
# input_folder: directory that is scanned for the annotation JSON files to merge.
# output_file: path the merged JSON is written to; later cells re-read and
#              overwrite this same file in place.
input_folder = r'./input'
output_file = r'./output/merged_annotations.json'

# Show the contents of the input folder (rendered as the cell output) as a quick sanity check.
os.listdir(input_folder)

['annotations_gpt-5.1_2025-11-20_15-13-39_fixed.json']

In [5]:
# Optional annotator remapping, applied while merging.
# The mapping is a single flat dict {old_annotator: new_annotator}; the same
# mapping is applied to every input file (per-file mappings are not supported).
# Keys are compared against the 'annotator' value stored in each assignment.
# Example:
# annotator_mapping = {1: 2, 2: 1}  # swap annotators 1 and 2
annotator_mapping = {}

# The merge itself is run further below with:
# merge_json_files(load_json_files(input_folder), annotator_mapping)

In [6]:
# # Provide an annotator mapping of the form {old_annotator: new_annotator}, such as {"1": 2, "2": 3}, or leave it empty if no mapping is needed.
# ## Keys are compared against the 'annotator' value stored in each assignment. With the mapping {"1": 2, "2": 3}, an assignment by annotator 1 is changed to annotator 2 and an assignment by annotator 2 is changed to annotator 3, in every input file.
# annotator_mapping = {}  # if desired, enter the annotator mapping here, e.g. {"1": 2, "3": 2}

# # Main block where the user provides input and output paths
# if __name__ == "__main__":

#     # Load all JSON files from the input folder
#     json_files = load_json_files(input_folder)

#     # Merge the files using the provided annotator mapping
#     merged_data = merge_json_files(json_files, annotator_mapping)

#     # Save the merged data to the specified output file
#     with open(output_file, "w", encoding="utf-8") as out_file:
#         json.dump(merged_data, out_file, indent=4, ensure_ascii=False)

#     print(f"Merged JSON saved as {output_file}")

In [7]:
import json
import glob
import os

# Function to list all JSON files in a folder
def load_json_files(folder_path):
    """Return the paths of all ``*.json`` files directly inside ``folder_path``.

    The paths are returned sorted. ``glob.glob`` yields matches in arbitrary
    order, and the first file becomes the base of the merge (its metadata and
    labelset are kept), so an unsorted list would make the merged output
    depend on the file system.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        A sorted list of path strings; empty if nothing matches.
    """
    return sorted(glob.glob(os.path.join(folder_path, "*.json")))

# Function to merge multiple JSON files
def merge_json_files(json_files, annotator_mapping, merged_name="merged annotations", merged_desc="Merged annotations"):
    """Merge several annotation JSON files into a single data structure.

    The first file is used as the base (its top-level fields are kept); every
    further file is folded into it with ``merge_annotations``. Finally the
    ``name`` and ``desc`` fields of the result are overwritten.

    Args:
        json_files: Iterable of paths to the JSON files to merge.
        annotator_mapping: Dict ``{old_annotator: new_annotator}`` applied to
            the ``annotator`` field of every assignment in every file. A key
            matches either the annotator value itself or its string form, so
            mappings loaded from JSON (``{"1": 2}``) also work for integer
            annotator ids. ``None`` or an empty dict means "no remapping".
        merged_name: Value stored under ``name`` in the result.
        merged_desc: Value stored under ``desc`` in the result.

    Returns:
        The merged data (the parsed content of the first file, extended).

    Raises:
        ValueError: If ``json_files`` is empty.
    """
    json_files = list(json_files)
    if not json_files:
        # Without this guard the function would fail later with an opaque
        # "'NoneType' object does not support item assignment".
        raise ValueError("No JSON files to merge: json_files is empty.")

    mapping = annotator_mapping or {}
    merged_data = None

    # Loop through each file to merge
    for file in json_files:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Apply annotator mapping (if any)
        for doc in data['documents']:
            for assignment in doc['assignments']:
                annotator = assignment['annotator']
                if annotator in mapping:
                    assignment['annotator'] = mapping[annotator]
                elif str(annotator) in mapping:
                    # JSON object keys are always strings, so a mapping read
                    # from JSON has "1" where the data has 1.
                    assignment['annotator'] = mapping[str(annotator)]

        # Initialize merged_data with the first file
        if merged_data is None:
            merged_data = data
        else:
            merged_data = merge_annotations(merged_data, data)

    # Update merged metadata (name and desc)
    merged_data['name'] = merged_name
    merged_data['desc'] = merged_desc

    return merged_data

# Fold the documents of one annotation file into another
def merge_annotations(data1, data2):
    """Merge the documents of ``data2`` into ``data1`` and refresh the counts.

    Documents are matched by their ``name``. For a document present in both,
    the assignments of ``data2`` are appended to the ones of ``data1``; a
    document only present in ``data2`` is added as a whole. ``data1`` is
    modified in place and returned.

    Args:
        data1: Base data; must contain ``documents`` and ``counts``.
        data2: Data whose documents are merged into ``data1``.

    Returns:
        ``data1`` with merged documents and recomputed ``counts``.
    """
    by_name = {}
    for existing in data1['documents']:
        by_name[existing['name']] = existing

    for incoming in data2['documents']:
        key = incoming['name']
        if key not in by_name:
            # Document unknown so far: take it over as a whole
            by_name[key] = incoming
            continue
        # Known document: append the additional assignments
        by_name[key]['assignments'].extend(incoming['assignments'])

    documents = list(by_name.values())
    data1['documents'] = documents

    # Recompute the counts section from the merged documents
    counts = data1['counts']
    counts['documents'] = len(documents)

    all_assignments = [entry for document in documents for entry in document['assignments']]
    counts['assignments'] = len(all_assignments)
    counts['annotators'] = len({entry['annotator'] for entry in all_assignments})
    counts['annotations'] = sum(len(entry['annotations']) for entry in all_assignments)
    counts['relations'] = sum(len(entry.get('relations', [])) for entry in all_assignments)

    return data1


In [8]:
if __name__ == "__main__":

    # Load all JSON files from the input folder
    json_files = load_json_files(input_folder)
    if not json_files:
        # Fail early with a clear message instead of an obscure error inside the merge
        raise FileNotFoundError(f"No .json files found in {input_folder!r}")

    # Merge the files using the provided annotator mapping
    merged_data = merge_json_files(json_files, annotator_mapping)

    # Make sure the output directory exists; open(..., "w") does not create it
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the merged data to the specified output file
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(merged_data, out_file, indent=4, ensure_ascii=False)

    print(f"Merged JSON saved as {output_file}")

Merged JSON saved as ./output/merged_annotations.json


In [9]:
# With LLM annotations, non-integer confidence scores may be assigned (e.g., 0.8).
# This causes issues when importing the data in Lawnotation, so every rating is
# normalised here to one of the integers 0-5.

# Load the merged JSON file
with open(output_file, "r", encoding="utf-8") as f:
    merged_data = json.load(f)

# Update confidence scores that are not 0, 1, 2, 3, 4, or 5
valid_confidence_scores = {0, 1, 2, 3, 4, 5}

for doc in merged_data["documents"]:
    for assignment in doc["assignments"]:
        for annotation in assignment["annotations"]:
            rating = annotation["confidence_rating"]
            # bool is a subclass of int (True == 1), and non-numeric values
            # (None, strings, lists) are never valid ratings
            is_number = isinstance(rating, (int, float)) and not isinstance(rating, bool)
            if is_number and rating in valid_confidence_scores:
                # 4.0 == 4 passes the membership test; store a real int so
                # the file contains 4 rather than 4.0
                annotation["confidence_rating"] = int(rating)
            else:
                annotation["confidence_rating"] = 0  # Set invalid scores to 0

# Save the updated JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Updated confidence scores and saved to {output_file}")

Updated confidence scores and saved to ./output/merged_annotations.json


In [10]:
from adapters import Gpt2LawnotationAll

# Strip label whitespace and convert the annotations of a merged file in place
def clean_labels(file_path):
    """Clean up an annotation JSON file and write it back to the same path.

    Steps, in order:
      1. Strip surrounding whitespace from every label name in the labelset.
      2. For every assignment, pass its annotations through
         ``Gpt2LawnotationAll(doc['full_text']).convert(...)`` and store the two
         returned values as ``annotations`` and ``repetitions``.
         NOTE(review): presumably the adapter fixes the annotation indices
         against the document text - confirm in the adapters module.
      3. Number the assignments consecutively (``order``), starting at 1 and
         continuing across documents.
      4. Strip surrounding whitespace from the label of every converted annotation.

    Args:
        file_path: Path of the JSON file; it is read and then overwritten.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    # Label names in the labelset
    for label_entry in payload['labelset']['labels']:
        label_entry['name'] = label_entry['name'].strip()

    # All (document, assignment) pairs in file order; the running position
    # becomes the assignment's order
    pairs = (
        (document, assignment)
        for document in payload['documents']
        for assignment in document['assignments']
    )
    for position, (document, assignment) in enumerate(pairs, start=1):
        converter = Gpt2LawnotationAll(document['full_text'])
        converted, repeats = converter.convert(assignment['annotations'])
        assignment['annotations'] = converted
        assignment['repetitions'] = repeats
        assignment['order'] = position
        # Labels of the converted annotations
        for item in assignment['annotations']:
            item['label'] = item['label'].strip()

    # Overwrite the input file with the cleaned data
    with open(file_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=4, ensure_ascii=False)

    print(f"File {file_path} has been cleaned and saved.")

# Clean the merged file in place: clean_labels reads output_file and overwrites it.
clean_labels(output_file)

File ./output/merged_annotations.json has been cleaned and saved.
