In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/original-lner-dataset/fold2.json
/kaggle/input/original-lner-dataset/fold1.json
/kaggle/input/original-lner-dataset/fold3.json


In [4]:
import json
import os

# Numeric label id -> entity tag name; the position in this list is the id.
_LABEL_NAMES = [
    "APP",
    "RESP",
    "A.COUNSEL",
    "R.COUNSEL",
    "JUDGE",
    "WIT",
    "AUTH",
    "COURT",
    "STAT",
    "PREC",
    "DATE",
    "CASENO",
]
label_mapping = dict(enumerate(_LABEL_NAMES))

def _convert_spans(text, spans, label_map):
    """Convert raw span dicts into ``[start, end, label_name, snippet]`` rows sorted by start."""
    rows = []
    for span in spans:
        start, end = span['start'], span['end']
        label_num = span['label']
        # Ids absent from the table fall back to their string form, so no span is dropped.
        label_name = label_map.get(label_num, str(label_num))
        rows.append([start, end, label_name, text[start:end]])
    # list.sort is stable: spans sharing a start offset keep their input order.
    rows.sort(key=lambda row: row[0])
    return rows


def process_json_file(input_path, output_path, label_map=None):
    """
    Convert one annotated JSON file to the text-label output format.

    The input file must hold a JSON array of document objects, each with an
    ``id``, a ``text`` string and (optionally) a ``spans`` list whose items
    carry ``start``, ``end`` and ``label``. For every span the label id is
    translated to its name and the covered substring ``text[start:end]`` is
    extracted. The result is written to ``output_path`` as::

        {"version": "1.0.0",
         "data": [{"id": ..., "text": ..., "labels": [[start, end, name, snippet], ...]}]}

    with each document's labels ordered by start offset.

    Args:
        input_path: Path of the source JSON file (read as UTF-8).
        output_path: Path of the JSON file to write (UTF-8, non-ASCII kept as-is).
        label_map: Optional dict mapping label ids to label names. When omitted,
            the module-level ``label_mapping`` is used.

    Returns:
        None. A one-line summary is printed after the file is written.

    Raises:
        KeyError: If a document lacks ``id``/``text`` or a span lacks
            ``start``/``end``/``label``.
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    if label_map is None:
        # Resolved at call time so the notebook-level table remains the default.
        label_map = label_mapping

    with open(input_path, 'r', encoding='utf-8') as f_in:
        documents = json.load(f_in)

    transformed_docs = [
        {
            "id": doc['id'],
            "text": doc['text'],
            # A document without annotations (missing or null "spans") gets an
            # empty label list instead of aborting the whole file.
            "labels": _convert_spans(doc['text'], doc.get('spans') or [], label_map),
        }
        for doc in documents
    ]

    output_data = {
        "version": "1.0.0",
        "data": transformed_docs
    }

    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(output_data, f_out, ensure_ascii=False, indent=4)

    print(f"Processed {len(documents)} documents from {input_path} -> {output_path}")


# The three dataset folds shipped with the Kaggle input dataset.
input_files = [
    f"/kaggle/input/original-lner-dataset/fold{fold}.json"
    for fold in (1, 2, 3)
]

# Convert each fold, writing the result under /kaggle/working/ with the same file name.
for source_path in input_files:
    target_path = "/kaggle/working/" + os.path.basename(source_path)
    process_json_file(input_path=source_path, output_path=target_path)

Processed 35 documents from /kaggle/input/original-lner-dataset/fold1.json -> /kaggle/working/fold1.json
Processed 35 documents from /kaggle/input/original-lner-dataset/fold2.json -> /kaggle/working/fold2.json
Processed 35 documents from /kaggle/input/original-lner-dataset/fold3.json -> /kaggle/working/fold3.json
