# Correct and visualize new annotations 🛠️

### 🎯 Objectives

- Select “interesting” annotations containing at least one entity from each class as a small subset of the molecular dynamics text dataset.
- Have Essmay and Pierre manually review and correct them.
- Inspect annotations with 0 entities to ensure nothing was missed.

-------------------------------
## Package version

In [68]:
%load_ext watermark
%watermark
%watermark --packages json,pandas,spacy

Last updated: 2025-11-25T14:51:22.530582+01:00

Python implementation: CPython
Python version       : 3.13.7
IPython version      : 8.13.2

Compiler    : GCC 14.3.0
OS          : Linux
Release     : 6.14.0-35-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 32
Architecture: 64bit

json  : 2.0.9
pandas: 2.2.3
spacy : 3.8.7



----------------

In [None]:
# Import libraries
import json
import pandas as pd

from spacy import displacy

In [None]:
# Constants
# Tab-separated table of per-file entity counts, loaded below with pandas.
TSV_PATH = "../results/all_annotations_entities_count.tsv"
# Folder containing the annotation JSON files that the helpers below read and rewrite.
ANNOT_FOLDER = "../annotations/v2"

In [8]:
# Load entities count
df = pd.read_csv(TSV_PATH, sep="\t")
df

Unnamed: 0,filename,length,NB_TEMP,NB_SOFTNAME,NB_SOFTVERS,NB_STIME,NB_MOL,NB_FFM
0,figshare_14511885.json,2075,0,3,0,1,17,0
1,figshare_5642866.json,1280,0,0,0,2,10,0
2,figshare_8292209.json,1114,0,0,0,0,7,0
3,zenodo_3248612.json,576,3,1,1,1,14,2
4,zenodo_6980700.json,639,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...
375,figshare_1586671.json,2267,0,0,0,0,3,0
376,figshare_12661589.json,2139,2,0,0,0,17,0
377,figshare_21285264.json,749,0,0,0,0,8,0
378,figshare_7924394.json,1174,0,0,0,1,8,0


In [9]:
def filter_full_annotations(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the rows whose relevant entity counts are all strictly positive.

    The relevant columns are those whose name starts with "NB_", except
    "NB_SOFTVERS", which is allowed to be zero.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing entity counts per file.

    Returns
    -------
    pd.DataFrame
        Rows of `df` where every relevant NB_* column is > 0, with the index
        reset. A one-line summary of the filtering is printed as well.
    """
    required_cols = [
        name
        for name in df.columns
        if name.startswith("NB_") and name != "NB_SOFTVERS"
    ]
    # Boolean mask: True for rows where each required count is above zero.
    has_all_classes = df[required_cols].gt(0).all(axis=1)
    kept = df.loc[has_all_classes].reset_index(drop=True)
    print(f"Filtered dataset size: {len(kept)} / {len(df)} annotated texts that contains at least one entity of each relevant class !")
    return kept

filtered_df = filter_full_annotations(df)
filtered_df

Filtered dataset size: 43 / 380 annotated texts that contains at least one entity of each relevant class !


Unnamed: 0,filename,length,NB_TEMP,NB_SOFTNAME,NB_SOFTVERS,NB_STIME,NB_MOL,NB_FFM
0,zenodo_3248612.json,576,3,1,1,1,14,2
1,zenodo_838635.json,1624,1,4,1,1,9,5
2,zenodo_14594.json,646,1,2,2,1,4,2
3,zenodo_6755131.json,1691,1,4,0,1,12,3
4,zenodo_15550.json,526,1,2,2,1,4,2
5,zenodo_13853.json,561,1,2,2,1,4,2
6,zenodo_4300706.json,1174,4,1,1,1,7,2
7,zenodo_1118682.json,667,1,2,2,1,9,4
8,zenodo_6349893.json,931,1,3,0,1,2,3
9,zenodo_14591.json,647,1,2,2,1,4,2


Let's visualize their entities to spot any errors and correct them, in order to build a good ground-truth dataset of annotated molecular dynamics texts :)

In [36]:
def remove_entity_annotation_file(file_name: str, entities_to_remove: list) -> None:
    """
    Remove specific entities from a formatted annotation JSON file.

    The file is read from ANNOT_FOLDER, every entity whose (label, text) pair
    appears in `entities_to_remove` is dropped, and the file is rewritten in
    place (UTF-8, 4-space indentation).

    Parameters
    ----------
    file_name : str
        Name of the JSON file located in the formatted annotations directory.
    entities_to_remove : list
        A list of tuples of the form (label, text) specifying which entities
        should be removed. Example: [("MOL", "water"), ("TEMP", "37°C")]
    """
    file_path = f"{ANNOT_FOLDER}/{file_name}"

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Matching is on the exact (label, text) pair, so all occurrences of a
    # matching entity are removed regardless of their start/end offsets.
    data["entities"] = [
        ent for ent in data["entities"]
        if (ent["label"], ent["text"]) not in entities_to_remove
    ]

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def find_entity_positions(raw_text: str, entity_text: str) -> list[tuple[int, int]]:
    """Find all occurrences of an entity text inside the raw annotation text.

    This function scans the raw text left to right and returns every
    (start, end) character index pair where the entity text appears. Matches
    are plain, case-sensitive substring matches and never overlap: the search
    resumes right after the end of the previous match.

    Parameters
    ----------
    raw_text : str
        The full text in which to search for occurrences.
    entity_text : str
        The substring corresponding to the entity that should be located.

    Returns
    -------
    list[tuple[int, int]]
        A list of (start, end) positions for each occurrence of the entity text.
        Returns an empty list if the text is not found or if `entity_text`
        is empty.
    """
    # An empty needle matches at the cursor without ever advancing it, which
    # would loop forever; treat it as "not found".
    if not entity_text:
        return []

    positions = []
    width = len(entity_text)

    start = raw_text.find(entity_text)
    while start != -1:
        end = start + width
        positions.append((start, end))
        # Resume after this match so successive occurrences do not overlap.
        start = raw_text.find(entity_text, end)

    return positions


def add_entity_annotation_file(file_name: str, new_entities: list) -> None:
    """Add new entities to an existing formatted annotation file.

    This function loads an annotation file from ANNOT_FOLDER, finds all
    occurrences of each new entity text inside the raw text, and appends the
    corresponding entity dictionaries to the "entities" list. It supports
    inserting multiple labels and multiple occurrences per label. The file is
    then rewritten in place.

    Every occurrence returned by `find_entity_positions` is annotated, so a
    short text (e.g. a bare number) may also be tagged in places where it is
    not the intended entity; review the result after adding such entities.

    Parameters
    ----------
    file_name : str
        Name of the formatted annotation JSON file.
    new_entities : list
        A list of (label, text) tuples representing the entities to insert.
        Example: [("MOL", "water"), ("TEMP", "37°C")]

    Returns
    -------
    None
        The function updates and rewrites the JSON file.
    """
    file_path = f"{ANNOT_FOLDER}/{file_name}"
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    raw_text = data["raw_text"]
    entities = data["entities"]
    for label, text in new_entities:
        # An empty text cannot denote an entity, so skip it rather than
        # search for an empty substring.
        if not text:
            continue

        for start, end in find_entity_positions(raw_text, text):
            entity_dict = {
                "label": label,
                "text": text,
                "start": start,
                "end": end
            }

            # Skip exact duplicates so re-running a cell does not add the
            # same entity twice.
            if entity_dict not in entities:
                entities.append(entity_dict)

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def convert_annotations(file_path):
    """
    Convert an annotation file to the format used for displaCy manual rendering.

    Parameters:
        file_path (str): Path to a JSON file containing a "raw_text" string
            and an "entities" list whose items have "start", "end" and
            "label" keys.

    Returns:
        List[dict]: A single-element list holding one dictionary with the
            keys "text" (the raw text) and "ents" (a list of
            {"start", "end", "label"} dictionaries).
    """
    # The annotation files are written as UTF-8 with ensure_ascii=False, so
    # read them with the same encoding instead of the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Keep only the fields needed for rendering; the entity "text" is dropped.
    ents = [
        {"start": item["start"], "end": item["end"], "label": item["label"]}
        for item in data["entities"]
    ]
    return [{"text": data["raw_text"], "ents": ents}]


def correct_and_vizualize(file_name:str, add_ent: list = None, remove_ent: list = None):
    """
    Apply optional corrections to an annotation file, then render its entities.

    Parameters
    ----------
    file_name : str
        Name of the annotation JSON file inside ANNOT_FOLDER.
    add_ent : list, optional
        (label, text) tuples forwarded to `add_entity_annotation_file`.
        Nothing is added when this is None or empty.
    remove_ent : list, optional
        (label, text) tuples forwarded to `remove_entity_annotation_file`.
        Nothing is removed when this is None or empty.
    """
    # Additions run before removals, so a (label, text) pair present in both
    # lists ends up removed.
    if add_ent:
        add_entity_annotation_file(file_name, add_ent)
    if remove_ent:
        remove_entity_annotation_file(file_name, remove_ent)

    separator = "=" * 80
    # One background color per entity label for the displaCy rendering.
    render_options = {
        "colors": {
            "TEMP": "#ffb3ba",
            "SOFTNAME": "#ffffba",
            "SOFTVERS": "#ffffe4",
            "STIME": "#baffc9",
            "MOL": "#bae1ff",
            "FFM": "#cdb4db",
        }
    }

    print(separator)
    print(f"üßê VISUALIZATION OF ENTITIES ({file_name})")
    print(separator)
    docs = convert_annotations(f"{ANNOT_FOLDER}/{file_name}")
    displacy.render(docs, style="ent", manual=True, options=render_options)
    print("")

In [37]:
# zenodo_3248612.json
correct_and_vizualize("zenodo_3248612.json", [("TEMP", "310K")], [("TEMP", "310K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_3248612.json)





In [39]:
correct_and_vizualize("zenodo_6755131.json", [("SOFTNAME", "Amber"), ("SOFTVERS", "2018"), ("MOL", "Fe (II)")], [("SOFTNAME", "PLUMED"), ("SOFTNAME", "PROPKA"), ("SOFTNAME", "Robetta"), ("SOFTNAME", "SHAKE"), ("FFM", "Amber2018"), ("MOL", "Fe")])

üßê VISUALIZATION OF ENTITIES (zenodo_6755131.json)





In [40]:
correct_and_vizualize("zenodo_4300706.json", [("TEMP", "358K")], [("TEMP", "358K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_4300706.json)





In [41]:
correct_and_vizualize("zenodo_1118682.json", None, [("FFM", "ECC-ions.")])

üßê VISUALIZATION OF ENTITIES (zenodo_1118682.json)





In [42]:
correct_and_vizualize("zenodo_7323535.json", [("TEMP", "300 K")], [("TEMP", "300 K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_7323535.json)





In [43]:
correct_and_vizualize("figshare_4806544.json", [("MOL", "3g5u" ), ("MOL", "4m1m" ), ("MOL", "4ksb" ), ("MOL", "POPC" ), ("MOL", "CLR" ), ("MOL", "3G5U" ), ("MOL", "4KSB" ), ("MOL", "4M1M" )])

üßê VISUALIZATION OF ENTITIES (figshare_4806544.json)





In [45]:
correct_and_vizualize("zenodo_1293762.json", [("FFM", "43A1-S3")], [("STIME", "100 ns")])

üßê VISUALIZATION OF ENTITIES (zenodo_1293762.json)





In [46]:
correct_and_vizualize("figshare_8046437.json", [("SOFTNAME", "MDAnalysis")], [("SOFTNAME", "LINCS")])

üßê VISUALIZATION OF ENTITIES (figshare_8046437.json)





In [47]:
correct_and_vizualize("zenodo_51754.json", [("TEMP", "323 K" )], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" )])

üßê VISUALIZATION OF ENTITIES (zenodo_51754.json)





In [48]:
correct_and_vizualize("zenodo_5060102.json", [("TEMP", "310 K" )], [("SOFTNAME", "ANI2x" ), ("TEMP", "310 K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_5060102.json)





In [49]:
correct_and_vizualize("zenodo_3950029.json", [("TEMP", "358K")], [("TEMP", "358K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_3950029.json)





In [51]:
correct_and_vizualize("figshare_4757161.json", [("MOL", "hydrogen"), ("SOFTVERS", "16"), ("SOFTNAME", "Amber")], [("SOFTNAME", "SHAKE"), ("SOFTNAME", "Amber16"), ("MOL", "Enzyme")])

üßê VISUALIZATION OF ENTITIES (figshare_4757161.json)





Here, version 16 of Amber is added, but this also tags the `16` in `"with C 16 alkyl chain attached"`, because every occurrence of the text is annotated.

⚠️ Don't forget to remove this entity from `figshare_4757161.json`:

{
    "label": "SOFTVERS",
    "text": "16",
    "start": 1664,
    "end": 1666
},

In [53]:
correct_and_vizualize("zenodo_7007107.json", [("MOL", "phosphoenolpyruvate synthase"), ("MOL", "lysines")], [("STIME", "4 ns"), ("STIME", "80 ns")])

üßê VISUALIZATION OF ENTITIES (zenodo_7007107.json)





In [54]:
correct_and_vizualize("zenodo_1198171.json", [("MOL", "calcium" )], None)

üßê VISUALIZATION OF ENTITIES (zenodo_1198171.json)





In [55]:
correct_and_vizualize("zenodo_3988469.json", [("MOL", "ARB" ), ("SOFTNAME", "AMBER"), ("SOFTVERS", "16")], [("SOFTNAME", "SHAKE"), ("SOFTNAME", "AMBER16")] )

üßê VISUALIZATION OF ENTITIES (zenodo_3988469.json)





In [56]:
correct_and_vizualize("zenodo_53151.json", [("TEMP", "323 K")], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" )])

üßê VISUALIZATION OF ENTITIES (zenodo_53151.json)





In [57]:
correct_and_vizualize("zenodo_51747.json", [("TEMP", "323 K" )], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" )])

üßê VISUALIZATION OF ENTITIES (zenodo_51747.json)





In [58]:
correct_and_vizualize("zenodo_51760.json", [("TEMP", "323 K")], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" )])

üßê VISUALIZATION OF ENTITIES (zenodo_51760.json)





In [59]:
correct_and_vizualize("zenodo_259443.json", [("TEMP", "310 K" ), ("MOL", "CaCl" )], [("TEMP", "310 K."), ("MOL", "CaCl_2" )])

üßê VISUALIZATION OF ENTITIES (zenodo_259443.json)





In [60]:
correct_and_vizualize("zenodo_2653735.json", [("TEMP", "333 K" )], [("TEMP", "333 K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_2653735.json)





In [61]:
correct_and_vizualize("zenodo_53212.json", [("TEMP", "323 K" ), ("STIME", "190 ns")], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" ), ("STIME", "190 ns.")])

üßê VISUALIZATION OF ENTITIES (zenodo_53212.json)





In [62]:
correct_and_vizualize("zenodo_51750.json", [("TEMP", "323 K" ), ("STIME", "110 ns")], [("TEMP", "323 K."), ("SOFTNAME", "LINCS" ), ("SOFTNAME", "SETTLE" ), ("STIME", "110 ns.")])

üßê VISUALIZATION OF ENTITIES (zenodo_51750.json)





In [63]:
correct_and_vizualize("zenodo_1167532.json", [("TEMP", "298 K" )], [("TEMP", "298 K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_1167532.json)





In [64]:
correct_and_vizualize("zenodo_3975394.json", None, None)

üßê VISUALIZATION OF ENTITIES (zenodo_3975394.json)





In [65]:
correct_and_vizualize("zenodo_3975394.json", [("MOL", "DAR"), ("MOL", "darunavir"), ("MOL", "IND"), ("MOL", "indinavir"), ("MOL", "NEL"), ("MOL", "nelfinavir"), ("MOL", "RIT"), ("MOL", "ritonavir"), ("MOL", "SAQ"), ("MOL", "saquinavir"), ("MOL", "TPR"), ("MOL", "tipranavir")], [("TEMP", "298 K.")])

üßê VISUALIZATION OF ENTITIES (zenodo_3975394.json)





We can now also retrieve the texts with zero annotated entities (i.e., with very limited information for the LLM) to observe the LLM's behavior and see whether it tends to invent or hallucinate entities.

In [None]:
def retrieve_zero_entity_texts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieve all texts with zero entities.

    The per-row total is the sum of every column whose name starts with
    "NB_". The input DataFrame is left untouched; the total is only added to
    the returned DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing entity counts per file.

    Returns
    -------
    pd.DataFrame
        DataFrame containing only texts with 0 entities, with an extra
        "TOTAL_ENTITIES" column and a reset index. The number of retrieved
        texts is also printed.
    """
    # Sum entity counts across all columns starting with "NB_"
    count_cols = [col for col in df.columns if col.startswith("NB_")]

    # assign() returns a new DataFrame, so the caller's `df` does not gain a
    # TOTAL_ENTITIES column as a hidden side effect.
    with_totals = df.assign(TOTAL_ENTITIES=df[count_cols].sum(axis=1))
    zero_entity_df = with_totals[with_totals["TOTAL_ENTITIES"] == 0].reset_index(drop=True)
    print(f"Retrieved {len(zero_entity_df)} texts with 0 entities.")

    return zero_entity_df

low_entity_df = retrieve_zero_entity_texts(df)
low_entity_df

Retrieved 20 texts with 0 entities.


Unnamed: 0,filename,length,NB_TEMP,NB_SOFTNAME,NB_SOFTVERS,NB_STIME,NB_MOL,NB_FFM,TOTAL_ENTITIES
0,figshare_2224288.json,1160,0,0,0,0,0,0,0
1,zenodo_3540691.json,648,0,0,0,0,0,0,0
2,figshare_21304804.json,1382,0,0,0,0,0,0,0
3,figshare_7069778.json,1886,0,0,0,0,0,0,0
4,figshare_11356691.json,1304,0,0,0,0,0,0,0
5,figshare_19971077.json,1500,0,0,0,0,0,0,0
6,zenodo_6580992.json,655,0,0,0,0,0,0,0
7,figshare_14669430.json,2015,0,0,0,0,0,0,0
8,figshare_13164486.json,1445,0,0,0,0,0,0,0
9,figshare_9945170.json,1157,0,0,0,0,0,0,0


In [67]:
def vizualize_json_entities(list_file_names:list):
    """
    Render the entities of several annotation files, one after another.

    Parameters
    ----------
    list_file_names : list
        Names of annotation JSON files inside ANNOT_FOLDER. Each file is
        converted with `convert_annotations` and rendered with displaCy
        under a banner showing its name.
    """
    separator = "=" * 80
    # One background color per entity label for the displaCy rendering.
    render_options = {
        "colors": {
            "TEMP": "#ffb3ba",
            "SOFTNAME": "#ffffba",
            "SOFTVERS": "#ffffe4",
            "STIME": "#baffc9",
            "MOL": "#bae1ff",
            "FFM": "#cdb4db",
        }
    }

    for name in list_file_names:
        print(separator)
        print(f"üßê VISUALIZATION OF ENTITIES ({name})")
        print(separator)
        docs = convert_annotations(f"{ANNOT_FOLDER}/{name}")
        displacy.render(docs, style="ent", manual=True, options=render_options)
        print("")

annotation_file_names = low_entity_df["filename"]
vizualize_json_entities(annotation_file_names)

üßê VISUALIZATION OF ENTITIES (figshare_2224288.json)



üßê VISUALIZATION OF ENTITIES (zenodo_3540691.json)



üßê VISUALIZATION OF ENTITIES (figshare_21304804.json)



üßê VISUALIZATION OF ENTITIES (figshare_7069778.json)



üßê VISUALIZATION OF ENTITIES (figshare_11356691.json)



üßê VISUALIZATION OF ENTITIES (figshare_19971077.json)



üßê VISUALIZATION OF ENTITIES (zenodo_6580992.json)



üßê VISUALIZATION OF ENTITIES (figshare_14669430.json)



üßê VISUALIZATION OF ENTITIES (figshare_13164486.json)



üßê VISUALIZATION OF ENTITIES (figshare_9945170.json)



üßê VISUALIZATION OF ENTITIES (figshare_20118543.json)



üßê VISUALIZATION OF ENTITIES (figshare_21158960.json)



üßê VISUALIZATION OF ENTITIES (figshare_1545562.json)



üßê VISUALIZATION OF ENTITIES (figshare_19775703.json)



üßê VISUALIZATION OF ENTITIES (figshare_10269416.json)



üßê VISUALIZATION OF ENTITIES (figshare_5872566.json)



üßê VISUALIZATION OF ENTITIES (figshare_6494975.json)



üßê VISUALIZATION OF ENTITIES (figshare_15059757.json)



üßê VISUALIZATION OF ENTITIES (figshare_15019653.json)



üßê VISUALIZATION OF ENTITIES (figshare_7882718.json)



