# **DisplaCy visualiser tool for the metadata annotations**

In [2]:
# Imports
import json
from spacy import displacy
import re
import os

Currently the annotations structure is as follows:

```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index of the last character of the entity. (not inclusive)
label = str # Label attached to the entity. 

{
    "classes": ["TEMP", "SOFT", "STIME", "MOL", "FFM"],
    "annotations": [
        [
            "text_to_fill_in",
            {
                "entities": [
                    [start_index, end_index, label],
                ]
            }
        ]
    ]
}

# Example
{
    "classes": ["TEMP", "SOFT", "STIME", "MOL", "FFM"],
    "annotations": [
        [
            "POPC Ulmschneider OPLS Verlet Group",
            {
                "entities": [
                    [0, 4, "MOL"],
                    [18, 22, "FFM"]
                ]
            }
        ]
    ]
}

```


However, the spaCy  format for `ent` is as follows:

```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index of the last character of the entity. (not inclusive)
label = str # Label attached to the entity. 

{
  "text": "text_to_fill_in.",
  "ents": [{ "start": start_index, "end": end_index, "label": label }]
}

# Example:
{
  "text": "POPC molecules.",
  "ents": [{ "start": 0, "end": 4, "label": "MOL" }]
}

```
This is what it looks like working with the spaCy format:

In [3]:
text = "Marie Curie was born in Warsaw in 1867 and later founded the Radium Institute in Paris"
ents = [
    {"start": 0, "end": 11, "label": "PERS"},
    {"start": 24, "end": 30, "label": "LOC"},
    {"start": 34, "end": 38, "label": "DATE"},
    {"start": 61, "end": 77, "label": "ORG"},
    {"start": 81, "end": 86, "label": "LOC"},
]

data = {"text": text, "ents": ents}

colors = {
    "PERS": "#a8ff65",
    "LOC": "orange",
    "DATE": "turquoise",
    "ORG": "#7361ff",
}
options = {"colors": colors}

displacy.render(data, style="ent", manual=True, options=options)

However we don't have this kind of format. 
We need to therefore convert the ground truth format to something spaCy-compatible.

Let's do this for ground truth texts:

In [4]:
def convert_annotations(file_path):
    """
    Convert annotations from the custom format to spaCy's displaCy format.

    Parameters:
        file_path (str): Path to the JSON file containing "classes" and "annotations" keys.

    Returns:
        List[dict]: A list of dictionaries in the spaCy format.
    """
    with open(file_path, "r") as file:
        data = json.load(file)

    converted = []
    for item in data["annotations"]:
        # Each item is structured as [text, {"entities": [[start, end, label], ...]}]
        text, annotation_info = item
        entities = annotation_info.get("entities", [])
        # Convert each entity to the spaCy dictionary format
        ents = [
            {"start": start, "end": end, "label": label}
            for start, end, label in entities
        ]
        converted.append({"text": text, "ents": ents})
    return converted

In [5]:
# Here we take one example of ground-truth annotation.
data = "../annotations/zenodo_3780463.json"

# Convert the annotations.
converted_data = convert_annotations(data)
original_text = converted_data[0]["text"]

# Print the original text, but keep the format with the \n characters.
# print(repr(original_text))

colors = {
    "TEMP": "#ffb3ba",
    "SOFT": "#ffffba",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

# You now have a list of annotation dictionaries. For visualization with displaCy,
# you can serve a single annotated text or multiple.
displacy.render(converted_data, style="ent", manual=True, options=options)

Let's say we have a ground-truth annotation that we want to use as an example in a prompt.
We'll first display the input text so that we can read it properly, then we use spaCy to visualise the ground-truth annotation, and lastly, we modify the annotation format in order to get a text that we can use in prompts.

In this following case, we want the ground-truth annotation modified to an annotation using XML-style tags.

In [6]:
def modify_annotation(text_path: str):
    # Convert the annotations.
    converted_data = convert_annotations(data)
    original_text = converted_data[0]["text"]

    # Print the original text, but keep the format with the \n characters.
    print(repr(original_text), "\n")

    colors = {
        "TEMP": "#ffb3ba",
        "SOFT": "#ffffba",
        "STIME": "#baffc9",
        "MOL": "#bae1ff",
        "FFM": "#cdb4db",
    }
    options = {"colors": colors}

    # You now have a list of annotation dictionaries. For visualization with displaCy,
    # you can serve a single annotated text or multiple.
    displacy.render(converted_data, style="ent", manual=True, options=options)

    # 1. Sort entities by start offset (ascending)
    ents = sorted(converted_data[0]["ents"], key=lambda e: e["start"])
    pieces = []
    last_idx = 0

    # 2. For each entity, append:
    #    – the text from the end of the previous entity (or start of string)
    #    – the wrapped entity
    for e in ents:
        start, end, label = e["start"], e["end"], e["label"]
        # plain text before this entity
        pieces.append(original_text[last_idx:start])
        # the entity wrapped in tags
        entity_text = original_text[start:end]
        pieces.append(f"<{label}>{entity_text}</{label}>")
        last_idx = end

    # 3. Append any trailing text after the last entity
    pieces.append(original_text[last_idx:])
    return repr("".join(pieces))

In [16]:
data = "../annotations/zenodo_4300706.json"

# Top we will have the input text
# Middle we will have the spaCy visualisation
# Last we will have the modified annotation using XML-style tags
print(modify_annotation(data))

'256 DPPC Molecules bilayer in pure Water, simulated at 288K (gel) or 358K (fluid)\nPublication: MLLPA: A Machine Learning-assisted Python module to study phase-specific events in lipid membranes Published on: 08 April 2021 Journal: J Comp Chem, 2021, DOI: 10.1002/jcc.26508 Description: Simulation files used to train our Python module to identify the thermodynamic phase of individual lipid molecules in a bilayer, as well as the simulation files analysed by the machine learning models. More information on the module can be found on its website. The training files are named dppc gel.gro and dppc fluid.gro. They respectively correspond to the final frame of the systems simulated at 288K and 358K. All other files are the files analysed by the module. System composition: DPPC molecules: 256 with 130 atoms each Water molecules: 42,492 with 3 atoms each Simulation box dimensions (approx.): 9 x 9 x 20 nm Simulation details: Software: Gromacs (v. 2020) Forcefield: Charmm36 (v. June 2015) - Wate

'256 <MOL>DPPC</MOL> Molecules bilayer in pure <MOL>Water</MOL>, simulated at <TEMP>288K</TEMP> (gel) or <TEMP>358K</TEMP> (fluid)\nPublication: MLLPA: A Machine Learning-assisted Python module to study phase-specific events in lipid membranes Published on: 08 April 2021 Journal: J Comp Chem, 2021, DOI: 10.1002/jcc.26508 Description: Simulation files used to train our Python module to identify the thermodynamic phase of individual lipid molecules in a bilayer, as well as the simulation files analysed by the machine learning models. More information on the module can be found on its website. The training files are named <MOL>dppc</MOL> gel.gro and <MOL>dppc</MOL> fluid.gro. They respectively correspond to the final frame of the systems simulated at <TEMP>288K</TEMP> and <TEMP>358K.</TEMP> All other files are the files analysed by the module. System composition: <MOL>DPPC</MOL> molecules: 256 with 130 atoms each <MOL>Water</MOL> molecules: 42,492 with 3 atoms each Simulation box dimensio

### **LLM annotations compared to GT annotations**
---

Here we give the functions to adapt the following formats into the spaCy format:
- XML-style labels
- JSON output
- Index based

In [22]:
def convert_xml_to_spacy_format(annotated_text):
    with open(annotated_text, "r") as file:
        annotated_text = file.read()

    pattern = re.compile(r"<(MOL|STIME|TEMP|SOFTNAME|SOFTVERS|FFM)>(.+?)<\1>")
    ents = []
    clean_text = ""
    last_idx = 0  # Position in the annotated text

    for match in pattern.finditer(annotated_text):
        tag = match.group(1)
        entity_text = match.group(2)

        # Text between last entity and current
        pre_entity_text = annotated_text[last_idx : match.start()]
        clean_text += pre_entity_text
        entity_start = len(clean_text)
        clean_text += entity_text
        entity_end = len(clean_text)

        ents.append({"start": entity_start, "end": entity_end, "label": tag})

        last_idx = match.end()

    # Add any remaining text after last tag
    clean_text += annotated_text[last_idx:]

    return [{"text": clean_text, "ents": ents}]

In [26]:
def visualize_xml_vs_gt(data: list) -> None:
    """
    Visualize the annotated text using displaCy.

    Parameters:
        data (list): List of paths to the annotated text files.
    """
    # Convert the annotations.
    converted_real_data = convert_annotations(data[0])  # Real annotation
    converted_ai_data = convert_xml_to_spacy_format(data[1])  # AI annotated

    colors = {
        "TEMP": "#ffb3ba",
        "SOFTNAME": "#ffffba",
        "SOFTVERS": "#orange",
        "STIME": "#baffc9",
        "MOL": "#bae1ff",
        "FFM": "#cdb4db",
    }
    options = {"colors": colors}

    print("\nREAL ANNOTATION:")
    displacy.render(converted_real_data, style="ent", manual=True, options=options)
    print("\AI XML ANNOTATION:")
    displacy.render(converted_ai_data, style="ent", manual=True, options=options)

In [None]:
# ai_path = "../llm_outputs/annotations_2025-04-22_17-31-26/few_shot/gpt-4.1-2025-04-14/zenodo_7007107.json"
# ai_path = "../llm_outputs/annotations_2025-04-22_17-31-26/few_shot/gpt-4.1-2025-04-14/zenodo_1219494.json"
# ai_path = "../llm_outputs/annotations_2025-04-22_17-31-26/one_shot/gpt-4.1-2025-04-14/zenodo_7323535.json"
# ai_path = "../output_llm_annotations/zero_shot_prompt_template/deepseek-r1-distill-llama-70b/figshare_20300547.json"

# zenodo_6478270
# zenodo_3778112


# o3-2025-04-16
# o3-mini-2025-01-31
# o4-mini-2025-04-16
ai_path = "../llm_outputs/2025-04-28_17-32-24/annotations/few_shot/2025-05-27_14-50-19/zenodo_3780463.json"
real_path = "../annotations/zenodo_3780463.json"

# ai_path = "../testing/ai_text.json"
# real_path = "../testing/real_text.json"

converted_real_data = convert_annotations(real_path)  # Real annotation


def convert_ai_xml(annotated_text):
    with open(annotated_text, "r") as file:
        annotated_text = json.load(file)

    annotated_text = annotated_text["response"]

    pattern = re.compile(r"<(MOL|STIME|TEMP|SOFTNAME|SOFTVERS|FFM)>(.+?)</\1>")
    ents = []
    clean_text = ""
    last_idx = 0  # Position in the annotated text

    for match in pattern.finditer(annotated_text):
        tag = match.group(1)
        entity_text = match.group(2)

        # Text between last entity and current
        pre_entity_text = annotated_text[last_idx : match.start()]
        clean_text += pre_entity_text
        entity_start = len(clean_text)
        clean_text += entity_text
        entity_end = len(clean_text)

        ents.append({"start": entity_start, "end": entity_end, "label": tag})

        last_idx = match.end()

    # Add any remaining text after last tag
    clean_text += annotated_text[last_idx:]
    ents = []
    return [{"text": clean_text, "ents": ents}]


converted_ai_text = convert_ai_xml(ai_path)

colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    "SOFTVERS": "#orange",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

displacy.render(converted_real_data, style="ent", manual=True, options=options)

displacy.render(converted_ai_text, style="ent", manual=True, options=options)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [19/Jun/2025 13:49:17] "GET / HTTP/1.1" 200 662


Shutting down server on port 5000.


In [None]:
real_data = "../annotations/zenodo_3950029.json"
xml_ai_data = "../ai_annotations/chatgpt_zenodo_3950029.txt"

data = [real_data, xml_ai_data]

visualize_xml_vs_gt(data)


REAL ANNOTATION:



AI ANNOTATION:


### **REAL ANNOTATION**

In [None]:
real_data = "../annotations/zenodo_1346073.json"

# Convert the annotations.
converted_data = convert_annotations(real_data)

colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    "SOFTVERS": "#orange",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

# You now have a list of annotation dictionaries. For visualization with displaCy,
# you can serve a single annotated text or multiple.
displacy.render(converted_data, style="ent", manual=True, options=options)

### **A.I. ANNOTATIONS**

In [None]:
ai_data = "../ai_annotations/llama_3_1_b_annotation.txt"

converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    "SOFTVERS": "#orange",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


### **All ground-truth annotations visualised using spaCy**
---

In [7]:
folder = "../annotations"

for filename in os.listdir(folder):
    # if ends with json and contains one '_' only
    if filename.endswith(".json") and filename.count("_") == 1:
        file_path = os.path.join(folder, filename)
        converted_data = convert_annotations(file_path)
        colors = {
            "TEMP": "#ffb3ba",
            "SOFTNAME": "#ffffba",
            "SOFTVERS": "#orange",
            "STIME": "#baffc9",
            "MOL": "#bae1ff",
            "FFM": "#cdb4db",
        }
        options = {"colors": colors}
        print(f"Visualizing {filename}...")
        displacy.render(converted_data, style="ent", manual=True, options=options)
        print("\n=================================================================\n")

Visualizing figshare_22213635.json...




Visualizing figshare_4757161.json...




Visualizing figshare_21263177.json...




Visualizing zenodo_6582985.json...




Visualizing zenodo_6478270.json...




Visualizing figshare_7783568.json...




Visualizing figshare_20300547.json...




Visualizing zenodo_4805388.json...




Visualizing zenodo_51754.json...




Visualizing figshare_20009556.json...




Visualizing figshare_13499346.json...




Visualizing figshare_2178850.json...




Visualizing zenodo_44660.json...




Visualizing zenodo_1219494.json...




Visualizing figshare_19082291.json...




Visualizing zenodo_2644158.json...




Visualizing figshare_2414671.json...




Visualizing figshare_14156828.json...




Visualizing zenodo_53887.json...




Visualizing zenodo_7037843.json...




Visualizing zenodo_5784447.json...




Visualizing zenodo_5680225.json...




Visualizing figshare_8046437.json...




Visualizing figshare_21528986.json...




Visualizing zenodo_7007107.json...




Visualizing figshare_14511885.json...




Visualizing zenodo_4106413.json...




Visualizing figshare_11475366.json...




Visualizing zenodo_1293813.json...




Visualizing figshare_4748350.json...




Visualizing zenodo_7323535.json...




Visualizing zenodo_3779223.json...




Visualizing figshare_21304804.json...




Visualizing figshare_14994624.json...




Visualizing zenodo_6755131.json...




Visualizing figshare_16712908.json...




Visualizing figshare_21806690.json...




Visualizing figshare_8009588.json...




Visualizing zenodo_3780463.json...




Visualizing figshare_2527738.json...




Visualizing figshare_12374771.json...




Visualizing figshare_16715821.json...




Visualizing figshare_13234529.json...




Visualizing figshare_3457004.json...




Visualizing figshare_5106286.json...




Visualizing zenodo_3667662.json...




Visualizing figshare_10269416.json...




Visualizing figshare_9684887.json...




Visualizing zenodo_3592499.json...




Visualizing figshare_5508259.json...




Visualizing figshare_21980305.json...




Visualizing figshare_5872566.json...




Visualizing zenodo_7234728.json...




Visualizing figshare_2032860.json...




Visualizing figshare_13836577.json...




Visualizing figshare_11356691.json...




Visualizing figshare_16698491.json...




Visualizing zenodo_4730896.json...




Visualizing zenodo_13814.json...




Visualizing figshare_2106349.json...




Visualizing zenodo_6010416.json...




Visualizing figshare_2020311.json...




Visualizing zenodo_838635.json...




Visualizing zenodo_1346073.json...




Visualizing zenodo_3228177.json...




Visualizing figshare_13022698.json...




Visualizing figshare_17430576.json...




Visualizing figshare_20485059.json...




Visualizing zenodo_3236015.json...




Visualizing figshare_20334621.json...




Visualizing figshare_3556404.json...




Visualizing figshare_13116890.json...




Visualizing figshare_12046578.json...




Visualizing figshare_20473913.json...




Visualizing zenodo_53151.json...




Visualizing zenodo_6974777.json...




Visualizing zenodo_55565.json...




Visualizing zenodo_6456372.json...




Visualizing figshare_2154835.json...




Visualizing zenodo_4625961.json...




Visualizing figshare_14932766.json...




Visualizing figshare_20485188.json...




Visualizing figshare_19115904.json...




Visualizing figshare_11704443.json...




Visualizing zenodo_5642866.json...




Visualizing figshare_20477204.json...




Visualizing figshare_14919822.json...




Visualizing zenodo_1034816.json...




Visualizing zenodo_6144286.json...




Visualizing figshare_19533246.json...




Visualizing figshare_22151355.json...




Visualizing zenodo_6381427.json...




Visualizing figshare_5532214.json...




Visualizing zenodo_4646862.json...




Visualizing figshare_5975071.json...




Visualizing figshare_19858240.json...




Visualizing figshare_2182180.json...




Visualizing figshare_20118543.json...




Visualizing figshare_9985349.json...




Visualizing zenodo_3865919.json...




Visualizing figshare_1586671.json...




Visualizing figshare_7073366.json...




Visualizing figshare_14602641.json...




Visualizing figshare_13416098.json...




Visualizing zenodo_3522090.json...




Visualizing zenodo_6580992.json...




Visualizing figshare_12272015.json...




Visualizing figshare_9945170.json...




Visualizing figshare_9788852.json...




Visualizing figshare_12218135.json...




Visualizing zenodo_5357152.json...




Visualizing figshare_7924394.json...




Visualizing zenodo_4522359.json...




Visualizing zenodo_6503359.json...




Visualizing zenodo_51760.json...




Visualizing zenodo_3901180.json...




Visualizing zenodo_3977034.json...




Visualizing figshare_2224288.json...




Visualizing zenodo_7301759.json...




Visualizing zenodo_5772140.json...




Visualizing zenodo_3885771.json...




Visualizing figshare_3120376.json...




Visualizing figshare_7998374.json...




Visualizing zenodo_5226209.json...




Visualizing figshare_3983526.json...




Visualizing figshare_20485017.json...




Visualizing zenodo_7305466.json...




Visualizing zenodo_51750.json...




Visualizing zenodo_4196842.json...




Visualizing zenodo_4971801.json...




Visualizing figshare_7865144.json...




Visualizing zenodo_3688506.json...




Visualizing figshare_21263347.json...




Visualizing figshare_21285264.json...




Visualizing zenodo_7107608.json...




Visualizing zenodo_6980700.json...




Visualizing figshare_12168798.json...




Visualizing zenodo_4012224.json...




Visualizing figshare_8292209.json...




Visualizing figshare_6494975.json...




Visualizing zenodo_5021365.json...




Visualizing zenodo_3665677.json...




Visualizing zenodo_6879091.json...




Visualizing figshare_14498668.json...




Visualizing zenodo_5025392.json...




Visualizing osf_a43z2.json...




Visualizing zenodo_5079480.json...




Visualizing figshare_11808522.json...




Visualizing figshare_800228.json...




Visualizing zenodo_51185.json...




Visualizing zenodo_6349893.json...




Visualizing zenodo_6521230.json...




Visualizing figshare_11808396.json...




Visualizing figshare_9956084.json...




Visualizing figshare_4541035.json...




Visualizing figshare_5247880.json...




Visualizing zenodo_1009607.json...




Visualizing figshare_3426170.json...




Visualizing zenodo_6541983.json...




Visualizing figshare_19775703.json...




Visualizing figshare_5190502.json...




Visualizing zenodo_7013022.json...




Visualizing zenodo_6791151.json...




Visualizing zenodo_5592299.json...




Visualizing zenodo_13853.json...




Visualizing zenodo_6970327.json...




Visualizing zenodo_3778112.json...




Visualizing zenodo_34415.json...




Visualizing figshare_3494177.json...




Visualizing zenodo_5570754.json...




Visualizing figshare_9744458.json...




Visualizing zenodo_3950029.json...




Visualizing figshare_2470129.json...




Visualizing figshare_17698630.json...




Visualizing zenodo_4993181.json...




Visualizing zenodo_6817824.json...




Visualizing figshare_5642866.json...




Visualizing zenodo_44622.json...




Visualizing figshare_6960098.json...




Visualizing zenodo_7112198.json...




Visualizing zenodo_3786821.json...




Visualizing figshare_15019653.json...




Visualizing figshare_11771604.json...




Visualizing zenodo_4300706.json...




Visualizing figshare_6394133.json...




Visualizing zenodo_4245236.json...




Visualizing figshare_21217898.json...




Visualizing figshare_12661589.json...




Visualizing zenodo_4336730.json...




Visualizing figshare_21726720.json...




Visualizing figshare_7291835.json...




Visualizing figshare_2224255.json...




Visualizing figshare_7952567.json...




Visualizing zenodo_14592.json...




Visualizing figshare_15054454.json...




Visualizing figshare_16604601.json...




Visualizing zenodo_5756605.json...




Visualizing zenodo_14591.json...




Visualizing zenodo_3361945.json...




Visualizing figshare_14428978.json...




Visualizing figshare_5875734.json...




Visualizing figshare_19102457.json...




Visualizing zenodo_7533587.json...




Visualizing figshare_19673947.json...




Visualizing zenodo_6797842.json...




Visualizing zenodo_6380887.json...




Visualizing zenodo_7192724.json...




Visualizing figshare_20017637.json...




Visualizing figshare_19657472.json...




Visualizing figshare_21940502.json...




Visualizing zenodo_3362889.json...




Visualizing zenodo_3975394.json...




Visualizing figshare_19534516.json...




Visualizing figshare_16410502.json...




Visualizing figshare_13387534.json...




Visualizing zenodo_6868243.json...




Visualizing zenodo_1144128.json...




Visualizing zenodo_3813275.json...




Visualizing figshare_7279100.json...




Visualizing figshare_13352375.json...




Visualizing figshare_4806544.json...




Visualizing figshare_19652739.json...




Visualizing figshare_12986024.json...




Visualizing zenodo_167336.json...




Visualizing figshare_11764158.json...




Visualizing figshare_2134639.json...




Visualizing figshare_14791452.json...




Visualizing figshare_19656195.json...




Visualizing figshare_10287393.json...




Visualizing figshare_8258741.json...




Visualizing figshare_20812257.json...




Visualizing zenodo_6498021.json...




Visualizing figshare_19405967.json...




Visualizing figshare_6965759.json...




Visualizing zenodo_7523635.json...




Visualizing zenodo_2653735.json...




Visualizing zenodo_5018433.json...




Visualizing figshare_19621175.json...




Visualizing figshare_17021821.json...




Visualizing figshare_19971077.json...




Visualizing zenodo_5060102.json...




Visualizing zenodo_7672195.json...




Visualizing figshare_14766557.json...




Visualizing zenodo_4445375.json...




Visualizing figshare_22203473.json...




Visualizing figshare_1146536.json...




Visualizing figshare_2257141.json...




Visualizing figshare_2332498.json...




Visualizing zenodo_1009027.json...




Visualizing figshare_20376757.json...




Visualizing zenodo_7066551.json...




Visualizing zenodo_2586559.json...




Visualizing figshare_9941741.json...




Visualizing figshare_19949518.json...




Visualizing figshare_6955175.json...




Visualizing figshare_21568583.json...




Visualizing zenodo_6616150.json...




Visualizing figshare_21806756.json...




Visualizing zenodo_247386.json...




Visualizing zenodo_3613573.json...




Visualizing zenodo_3634769.json...




Visualizing figshare_2615170.json...




Visualizing figshare_11343020.json...




Visualizing figshare_20359967.json...




Visualizing figshare_3940377.json...




Visualizing zenodo_6957888.json...




Visualizing zenodo_30894.json...




Visualizing figshare_13135976.json...




Visualizing figshare_2481151.json...




Visualizing figshare_2271034.json...




Visualizing figshare_2227162.json...




Visualizing figshare_7586417.json...




Visualizing zenodo_5362218.json...




Visualizing figshare_13378605.json...




Visualizing figshare_11808321.json...




Visualizing zenodo_5912821.json...




Visualizing zenodo_15550.json...




Visualizing zenodo_3760965.json...




Visualizing zenodo_57131.json...




Visualizing zenodo_14594.json...




Visualizing figshare_21175362.json...




Visualizing figshare_7370684.json...




Visualizing figshare_13301090.json...




Visualizing figshare_4743637.json...




Visualizing figshare_11697906.json...




Visualizing figshare_5651101.json...




Visualizing figshare_16772812.json...




Visualizing figshare_9905063.json...




Visualizing figshare_12064266.json...




Visualizing figshare_14669430.json...




Visualizing zenodo_3780206.json...




Visualizing figshare_9985175.json...




Visualizing zenodo_7024651.json...




Visualizing zenodo_5573728.json...




Visualizing zenodo_5153261.json...




Visualizing zenodo_3623150.json...




Visualizing figshare_7607222.json...




Visualizing zenodo_4673703.json...




Visualizing figshare_7857659.json...




Visualizing zenodo_6870476.json...




Visualizing figshare_11569452.json...




Visualizing zenodo_7572990.json...




Visualizing zenodo_51748.json...




Visualizing figshare_7691966.json...




Visualizing zenodo_2542912.json...




Visualizing figshare_1545562.json...




Visualizing figshare_1381865.json...




Visualizing zenodo_3988469.json...




Visualizing zenodo_7273800.json...




Visualizing figshare_6874760.json...




Visualizing figshare_6272471.json...




Visualizing figshare_11688372.json...




Visualizing zenodo_6592231.json...




Visualizing zenodo_1198171.json...




Visualizing zenodo_6973476.json...




Visualizing zenodo_1198158.json...




Visualizing figshare_7069778.json...




Visualizing figshare_6270413.json...




Visualizing figshare_21158960.json...




Visualizing figshare_3893244.json...




Visualizing zenodo_3951861.json...




Visualizing figshare_15059757.json...




Visualizing zenodo_6988344.json...




Visualizing figshare_18095690.json...




Visualizing zenodo_1488094.json...




Visualizing figshare_2267653.json...




Visualizing zenodo_7440399.json...




Visualizing figshare_5546158.json...




Visualizing figshare_21121417.json...




Visualizing figshare_3470993.json...




Visualizing figshare_2018634.json...




Visualizing zenodo_13392.json...




Visualizing figshare_11569377.json...




Visualizing figshare_8291933.json...




Visualizing figshare_16574000.json...




Visualizing figshare_19780550.json...




Visualizing figshare_6575438.json...




Visualizing figshare_14225919.json...




Visualizing zenodo_4962493.json...




Visualizing zenodo_3248612.json...




Visualizing figshare_7239968.json...




Visualizing figshare_2267512.json...




Visualizing zenodo_1118682.json...




Visualizing zenodo_3610470.json...




Visualizing zenodo_3540691.json...




Visualizing zenodo_3878218.json...




Visualizing zenodo_3908769.json...




Visualizing figshare_17207477.json...




Visualizing zenodo_259443.json...




Visualizing figshare_17319679.json...




Visualizing zenodo_8431.json...




Visualizing figshare_12821617.json...




Visualizing zenodo_3567651.json...




Visualizing zenodo_4451274.json...




Visualizing figshare_20080728.json...




Visualizing zenodo_51747.json...




Visualizing zenodo_53212.json...




Visualizing figshare_2532337.json...




Visualizing zenodo_238943.json...




Visualizing zenodo_7327525.json...




Visualizing zenodo_3862992.json...




Visualizing figshare_121241.json...




Visualizing figshare_13655516.json...




Visualizing figshare_2276194.json...




Visualizing figshare_7882718.json...




Visualizing figshare_3467309.json...




Visualizing zenodo_3696970.json...




Visualizing figshare_3207154.json...




Visualizing zenodo_1198454.json...




Visualizing zenodo_1167532.json...




Visualizing figshare_5496910.json...




Visualizing figshare_2478628.json...




Visualizing figshare_13164486.json...




Visualizing zenodo_1293762.json...




Visualizing figshare_12987701.json...




Visualizing figshare_12312368.json...




Visualizing figshare_12120159.json...




Visualizing zenodo_30904.json...




Visualizing figshare_9805616.json...




Visualizing osf_82n73.json...






In [13]:
from pathlib import Path
from typing import Dict, Any, List


def convert_label_annotations(file_path: str | Path) -> Dict[str, Any]:
    """
    Convert one custom-format JSON file to the structure required by
    spaCy's displaCy manual visualiser.

    Returns
    -------
    {
        "text": "<full text>",
        "ents": [ {"start": 0, "end": 5, "label": "TAG"}, ... ]
    }
    """
    file_path = Path(file_path)

    with file_path.open(encoding="utf-8") as f:
        data = json.load(f)

    ents: List[Dict[str, Any]] = [
        {"start": int(s), "end": int(e), "label": tag} for s, e, tag in data["label"]
    ]

    return {"text": data["text"], "ents": ents}

In [17]:
original_annotations = "../annotations/"
converted_annotations = "../testing/annotations_attempts/"

for filename in os.listdir(original_annotations):
    # if ends with json and contains one '_' only
    if filename.endswith(".json") and filename.count("_") == 1:
        file_path = os.path.join(original_annotations, filename)
        converted_file_path = os.path.join(converted_annotations, filename)

        converted_original_data = convert_annotations(file_path)
        converted_label_data = convert_label_annotations(converted_file_path)

        colors = {
            "TEMP": "#ffb3ba",
            "SOFTNAME": "#ffffba",
            "SOFTVERS": "#orange",
            "STIME": "#baffc9",
            "MOL": "#bae1ff",
            "FFM": "#cdb4db",
        }
        options = {"colors": colors}
        displacy.render(
            converted_original_data, style="ent", manual=True, options=options
        )
        print("----")
        displacy.render(converted_label_data, style="ent", manual=True, options=options)
        print("\n=================================================================\n")

NameError: name 'os' is not defined