# **Testing NER of MD-related simulation descriptions with GLiNER** 📑

GLiNER2 unifies Named Entity Recognition, Text Classification, Structured Data Extraction, and Relation Extraction into a single 205M parameter model. It provides efficient CPU-based inference without requiring complex pipelines or external API dependencies.

## Load local model

In [1]:
from gliner2 import GLiNER2

# Load the pretrained GLiNER2 checkpoint once; the extraction cells below
# all reuse this single `extractor` instance.
extractor = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
# Print the model; its configuration and module tree appear in the cell output.
print(extractor)

  from .autonotebook import tqdm as notebook_tqdm


üß† Model Configuration
Encoder model      : microsoft/deberta-v3-base
Counting layer     : count_lstm_v2
Token pooling      : first
GLiNER2(
  (encoder): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128011, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
 

## Setup

In [14]:
import operator

import pandas as pd
from spacy import displacy


def _format_ratio(numerator: int, denominator: int) -> str:
    """Format ``numerator/denominator = xx.xx%``; print ``n/a`` when the denominator is 0."""
    if denominator == 0:
        return f"{numerator}/{denominator} = n/a"
    return f"{numerator}/{denominator} = {numerator/denominator:.2%}"


def compare_entities(groundtruth: dict, response: dict) -> None:
    """
    Compare groundtruth entities with predicted entities and print a summary table.

    A groundtruth entity counts as correctly predicted when a prediction with
    the same label and the same text exists. A stricter count additionally
    requires identical ``start`` and ``end`` offsets. Groundtruth entities are
    deduplicated on ``(label, text)`` before the comparison (the first
    occurrence is kept); predictions are not deduplicated.

    Parameters
    ----------
    groundtruth : dict
        Groundtruth entities in the format
        ``{"entities": [{"label": ..., "text": ..., "start": ..., "end": ...}, ...]}``.
    response : dict
        Predicted entities in the format
        ``{"entities": {"LABEL": [{"text": ..., "start": ..., "end": ...}, ...], ...}}``.
        Extra keys on a predicted entity (e.g. ``"confidence"``) are ignored.

    Returns
    -------
    None
        The table and the summary are printed to stdout. When the response
        contains no prediction, the ratios are printed as ``0/0 = n/a``.
    """
    gt_entities = groundtruth["entities"]
    pred_entities = response["entities"]

    # Flatten predicted entities into a list of dicts
    pred_list = [
        {"label": label, "text": ent["text"], "start": ent["start"], "end": ent["end"]}
        for label, ents in pred_entities.items()
        for ent in ents
    ]

    # Remove duplicates in groundtruth (keep the first occurrence of each
    # (label, text) pair so the table is non-redundant)
    gt_keys = set()
    gt_unique = []
    for e in gt_entities:
        key = (e["label"], e["text"])
        if key not in gt_keys:
            gt_keys.add(key)
            gt_unique.append(e)

    # Sort by label and text
    gt_sorted = sorted(gt_unique, key=operator.itemgetter("label", "text"))
    pred_sorted = sorted(pred_list, key=operator.itemgetter("label", "text"))

    # Index the predictions once so each groundtruth row is a set lookup
    # instead of several scans over the whole prediction list.
    pred_keys = {(p["label"], p["text"]) for p in pred_sorted}
    pred_keys_with_pos = {
        (p["label"], p["text"], p["start"], p["end"]) for p in pred_sorted
    }

    # Build table for groundtruth matches
    table_gt = []
    true_no_pos = 0
    true_with_pos = 0
    for gt in gt_sorted:
        matched = (gt["label"], gt["text"]) in pred_keys
        if matched:
            true_no_pos += 1
            # Offsets are only read for entities that already match on
            # label and text.
            if (gt["label"], gt["text"], gt["start"], gt["end"]) in pred_keys_with_pos:
                true_with_pos += 1
        table_gt.append(
            {
                "label_gt": gt["label"],
                "text_gt": gt["text"],
                "label_predicted": gt["label"] if matched else "",
                "text_predicted": gt["text"] if matched else "",
                "true_predicted": "‚úÖ" if matched else "‚ùå",
            }
        )

    # Identify predictions not in groundtruth
    table_extra = [
        {
            "label_gt": "",
            "text_gt": "",
            "label_predicted": p["label"],
            "text_predicted": p["text"],
            "true_predicted": "‚ùå",
        }
        for p in pred_sorted
        if (p["label"], p["text"]) not in gt_keys
    ]

    # Combine tables: groundtruth first, extras after
    table = table_gt + table_extra

    # Calculate statistics
    total_gt = len(gt_sorted)
    total_pred = len(pred_sorted)

    # Display table
    df = pd.DataFrame(table, columns=["label_gt", "text_gt", "label_predicted", "text_predicted", "true_predicted"])
    try:
        rendered = df.to_markdown(index=False)
    except ImportError:
        # `DataFrame.to_markdown` needs the optional `tabulate` package;
        # fall back to a plain-text table rather than failing the comparison.
        rendered = df.to_string(index=False)
    print(rendered)

    # Display summary
    print("\n--- Summary ---")
    print(f"Total entities in groundtruth: {total_gt}")
    print(f"True predicted / total predicted (ignoring positions): {_format_ratio(true_no_pos, total_pred)}")
    print(f"True predicted / total predicted (with positions): {_format_ratio(true_with_pos, total_pred)}")


def _claim_span(consumed: list, start: int, end: int) -> bool:
    """Mark characters ``[start, end)`` as used; return False if any of them already is."""
    if any(consumed[start:end]):
        return False
    for i in range(start, end):
        consumed[i] = True
    return True


def convert_annotations_llm(response: dict, text_to_annotate: str):
    """
    Convert LLM-style or groundtruth entities to spaCy displaCy manual format.

    For the nested (LLM-style) format, each entity is placed at its explicit
    ``start``/``end`` offsets when they are present and lie inside the text,
    and every other case-insensitive occurrence of the entity text is
    highlighted as well. A span is only emitted when none of its characters
    already belongs to a previously emitted span, so the result never
    contains overlapping entities; when two labels claim the same characters,
    the one processed first wins. The groundtruth (list) format is passed
    through using its own offsets.

    Parameters
    ----------
    response : dict
        Either LLM response {'entities': {'MOL':[...], ...}}
        or groundtruth {'entities':[{'label':..., 'text':..., 'start':..., 'end':...}, ...]}
    text_to_annotate : str
        Original text

    Returns
    -------
    List[dict] suitable for displaCy manual rendering: a single
    ``{"text": ..., "ents": [...]}`` item whose ``ents`` are sorted by
    ``(start, end)`` and whose text has newlines replaced by spaces (a
    one-for-one replacement, so character offsets stay valid).

    Raises
    ------
    ValueError
        If ``response`` has no ``"entities"`` key or its value is neither a
        dict nor a list.
    """
    if "entities" not in response:
        raise ValueError("Response must have key 'entities'")

    entities_data = response["entities"]
    ents = []

    # CASE 1 : LLM-style nested dict {'MOL':[...], 'SOFTNAME':[...], ...}
    if isinstance(entities_data, dict):
        # Track which characters are already part of an entity to avoid overlaps
        text_length = len(text_to_annotate)
        consumed = [False] * text_length
        text_lower = text_to_annotate.lower()

        for label, ents_list in entities_data.items():
            for ent in ents_list:
                span_text = ent["text"]
                found_any = False

                # If start/end positions are provided, use them first. Offsets
                # that fall outside the text are ignored and the entity is
                # located by text search below instead.
                if "start" in ent and "end" in ent:
                    start, end = ent["start"], ent["end"]
                    if 0 <= start < end <= text_length:
                        found_any = True
                        if _claim_span(consumed, start, end):
                            ents.append({"start": start, "end": end, "label": label})

                # Then search the text for all other non-overlapping occurrences.
                # An empty entity text is skipped: it would match at every
                # position and produce only zero-length spans.
                if span_text:
                    span_lower = span_text.lower()
                    search_pos = 0
                    while True:
                        start = text_lower.find(span_lower, search_pos)
                        if start == -1:
                            break
                        end = start + len(span_text)

                        # The text is present even when this occurrence is
                        # already covered by another entity, so no
                        # "not found" warning is due.
                        found_any = True
                        if _claim_span(consumed, start, end):
                            ents.append({"start": start, "end": end, "label": label})

                        search_pos = start + 1

                if not found_any:
                    print(f"‚ö†Ô∏è Warning: entity '{span_text}' for label '{label}' not found in text.")

    # CASE 2 : Groundtruth-style list [{'label':..., 'text':..., 'start':..., 'end':...}, ...]
    elif isinstance(entities_data, list):
        for ent in entities_data:
            ents.append({
                "start": ent["start"],
                "end": ent["end"],
                "label": ent["label"]
            })

    else:
        msg = "Unknown entities format"
        raise ValueError(msg)

    # Emit spans in reading order. NOTE(review): displaCy's manual `ent`
    # renderer appears to walk the spans sequentially, so unsorted input can
    # render incorrectly on spaCy versions that do not sort it themselves.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    return [{"text": text_to_annotate.replace("\n", " "), "ents": ents}]


def visualize_llm_annotation(response: dict, text_to_annotate: str):
    """
    Render annotated entities (LLM output or groundtruth) with spaCy's displaCy.

    A banner is printed first, then the annotations are converted with
    ``convert_annotations_llm`` and rendered in displaCy's manual ``ent``
    style using one fixed colour per entity label.

    Parameters
    ----------
    response : dict
        Annotated entities (LLM response or groundtruth)
    text_to_annotate : str
        Original text
    """
    # One highlight colour per entity label.
    label_colors = dict(
        TEMP="#ffb3ba",
        SOFTNAME="#ffffba",
        SOFTVERS="#ffffe4",
        STIME="#baffc9",
        MOL="#bae1ff",
        FFM="#cdb4db",
    )

    separator = "=" * 80
    print(separator)
    print("üßê VISUALIZATION OF ENTITIES")
    print(separator)

    displacy.render(
        convert_annotations_llm(response, text_to_annotate),
        style="ent",
        manual=True,
        options={"colors": label_colors},
    )
    print()

In [3]:
# Entity classes to extract: each key is a label and each value is the
# natural-language description passed to the extractor along with the text.
# NOTE(review): "Celcius" below is a misspelling of "Celsius". It is part of a
# string handed to the model, so correcting it may change the predictions and
# the recorded outputs would need to be regenerated.
entities_class_with_description = {
    "MOL": "Molecule or chemical compound involved in the simulation",
    "SOFTNAME": "Molecular dynamics software used for the simulation",
    "SOFTVERS": "Version of the molecular dynamics software",
    "TEMP": "Simulation temperature, typically expressed in Kelvin or Celcius",
    "FFM": "Force field model used to describe interatomic interactions",
    "STIME": "Total simulation time or duration"
}

## 1. Example from `annotations/v2/zenodo_1198454.json`

In [4]:
# Text and groundtruth for annotations/v2/zenodo_1198454.json
# NOTE: the literal opens with four double quotes, so the string starts with a
# '"' character followed by a newline before "Simulation data ...". The
# groundtruth offsets below already count these two leading characters
# (e.g. 'CHARMM36' is at 22-30, not 20-28), so do not "fix" the opening quotes
# without shifting every offset by 2.
text_to_annotate = """"
Simulation data for CHARMM36 POPC bilayer, 100 lipids/leaflet, 940 mM NaCl, 310K, GROMACS 5.1.4\nSimulations of a POPC bilayer with 940 mM of NaCl. The fifth from the set of 6 simulations. The goal was to study the effect of scaling the CHARMM FF on the ion binding. Done for the NMRlipids project, see https://removed for more information. A POPC bilayer consisting of 200 lipids (100 per leaflet) is simulated in the presence of 940 mM NaCl. The Charmm36 model is employed for lipids, the Charmm compatible variant of the tip3p model for water, and the default Charmm ion parameters (type SOD) for NaCl. NB-Fix used for sodium. The Charmm36 force field parameters were obtained from https://removed The files are in GROMACS format. Trajectory (.xtc) is 370 ns long with data saved every 100 ps. the initial structure (.gro), topology (.top), index file (.ndx), simulation paremeter file (.mdp), binary run input file for GROMACS v. 5.1 > (.tpr) and the energy output file (.edr) are provided.
"""
# Groundtruth entities: one dict per annotated mention, with its label, its
# surface text and its character offsets (`start` inclusive, `end` exclusive)
# into `text_to_annotate`. Repeated mentions are listed once per occurrence.
groundtruth = {
'entities': [{'label': 'FFM', 'text': 'CHARMM36', 'start': 22, 'end': 30},
  {'label': 'MOL', 'text': 'POPC', 'start': 31, 'end': 35},
  {'label': 'MOL', 'text': 'NaCl', 'start': 72, 'end': 76},
  {'label': 'TEMP', 'text': '310K', 'start': 78, 'end': 82},
  {'label': 'SOFTNAME', 'text': 'GROMACS', 'start': 84, 'end': 91},
  {'label': 'SOFTVERS', 'text': '5.1.4', 'start': 92, 'end': 97},
  {'label': 'MOL', 'text': 'POPC', 'start': 115, 'end': 119},
  {'label': 'MOL', 'text': 'NaCl', 'start': 143, 'end': 147},
  {'label': 'FFM', 'text': 'CHARMM', 'start': 238, 'end': 244},
  {'label': 'MOL', 'text': 'POPC', 'start': 344, 'end': 348},
  {'label': 'MOL', 'text': 'NaCl', 'start': 439, 'end': 443},
  {'label': 'FFM', 'text': 'Charmm36', 'start': 449, 'end': 457},
  {'label': 'FFM', 'text': 'Charmm', 'start': 492, 'end': 498},
  {'label': 'FFM', 'text': 'tip3p', 'start': 525, 'end': 530},
  {'label': 'MOL', 'text': 'water', 'start': 541, 'end': 546},
  {'label': 'FFM', 'text': 'Charmm', 'start': 564, 'end': 570},
  {'label': 'MOL', 'text': 'NaCl', 'start': 601, 'end': 605},
  {'label': 'FFM', 'text': 'Charmm36', 'start': 635, 'end': 643},
  {'label': 'SOFTNAME', 'text': 'GROMACS', 'start': 719, 'end': 726},
  {'label': 'STIME', 'text': '370 ns', 'start': 756, 'end': 762},
  {'label': 'SOFTNAME', 'text': 'GROMACS', 'start': 924, 'end': 931},
  {'label': 'SOFTVERS', 'text': 'v. 5.1 >', 'start': 932, 'end': 940}]
}

In [5]:
# Extract entities from the text using the label descriptions defined above.
# With both flags enabled, the result displayed below carries `confidence`,
# `start` and `end` for every predicted entity.
response = extractor.extract_entities(
    text_to_annotate,
    entities_class_with_description,
    include_confidence=True,
    include_spans=True
)
# Bare expression: displays the raw response as the cell output.
response

{'entities': {'MOL': [{'text': 'lipids',
    'confidence': 0.9960448145866394,
    'start': 375,
    'end': 381},
   {'text': 'NaCl',
    'confidence': 0.9771686792373657,
    'start': 143,
    'end': 147},
   {'text': 'sodium',
    'confidence': 0.9296447038650513,
    'start': 623,
    'end': 629},
   {'text': 'water',
    'confidence': 0.9011234641075134,
    'start': 541,
    'end': 546}],
  'SOFTNAME': [{'text': 'Charmm36',
    'confidence': 0.9755001664161682,
    'start': 449,
    'end': 457},
   {'text': 'Charmm',
    'confidence': 0.936134934425354,
    'start': 492,
    'end': 498},
   {'text': 'GROMACS',
    'confidence': 0.5493722558021545,
    'start': 84,
    'end': 91}],
  'SOFTVERS': [{'text': 'GROMACS',
    'confidence': 0.713408350944519,
    'start': 719,
    'end': 726}],
  'TEMP': [{'text': '310K',
    'confidence': 0.9877349734306335,
    'start': 78,
    'end': 82}],
  'FFM': [{'text': 'Charmm',
    'confidence': 0.8914679288864136,
    'start': 492,
    'end': 4

In [6]:
# Print the groundtruth-vs-prediction table and the summary for example 1.
compare_entities(groundtruth, response)

| label_gt   | text_gt   | label_predicted   | text_predicted   | true_predicted   |
|:-----------|:----------|:------------------|:-----------------|:-----------------|
| FFM        | CHARMM    |                   |                  | ‚ùå               |
| FFM        | CHARMM36  | FFM               | CHARMM36         | ‚úÖ               |
| FFM        | Charmm    | FFM               | Charmm           | ‚úÖ               |
| FFM        | Charmm36  |                   |                  | ‚ùå               |
| FFM        | tip3p     |                   |                  | ‚ùå               |
| MOL        | NaCl      | MOL               | NaCl             | ‚úÖ               |
| MOL        | POPC      |                   |                  | ‚ùå               |
| MOL        | water     | MOL               | water            | ‚úÖ               |
| SOFTNAME   | GROMACS   | SOFTNAME          | GROMACS          | ‚úÖ               |
| SOFTVERS   | 5.1.4     |                   |          

In [7]:
# Highlight the predicted entities of example 1 in the text.
print("LLM Response:")
visualize_llm_annotation(response, text_to_annotate)

LLM Response:
üßê VISUALIZATION OF ENTITIES





In [8]:
# Highlight the groundtruth entities of example 1 for visual comparison.
print("Groundtruth:")
visualize_llm_annotation(groundtruth, text_to_annotate)

Groundtruth:
üßê VISUALIZATION OF ENTITIES





## 2. Example from `annotations/v2/zenodo_4300706.json`

In [9]:
# Text and groundtruth for annotations/v2/zenodo_4300706.json
# The text starts directly after the opening quotes, so offsets are counted
# from the first character (e.g. 'DPPC' is at 4-8).
text_to_annotate = """256 DPPC Molecules bilayer in pure Water, simulated at 288K (gel) or 358K (fluid)\nPublication: MLLPA: A Machine Learning-assisted Python module to study phase-specific events in lipid membranes Published on: 08 April 2021 Journal: J Comp Chem, 2021, DOI: 10.1002/jcc.26508 Description: Simulation files used to train our Python module to identify the thermodynamic phase of individual lipid molecules in a bilayer, as well as the simulation files analysed by the machine learning models. More information on the module can be found on its website. The training files are named dppc gel.gro and dppc fluid.gro. They respectively correspond to the final frame of the systems simulated at 288K and 358K. All other files are the files analysed by the module. System composition: DPPC molecules: 256 with 130 atoms each Water molecules: 42,492 with 3 atoms each Simulation box dimensions (approx.): 9 x 9 x 20 nm Simulation details: Software: Gromacs (v. 2020) Forcefield: Charmm36 (v. June 2015) - Water: TIP3P Thermostat: Nose-hoover (0.4ps, 2 groups) Barostat: Parrinello-Rahman semi-isotropic (2.0ps, 1.0 bar on each axis, 4.5e-5 bar-1) Duration: 25 ns (after stabilisation)"""
# Groundtruth entities: one dict per annotated mention, with its label, its
# surface text and its character offsets (`start` inclusive, `end` exclusive)
# into `text_to_annotate`.
groundtruth = {
"entities": [
        {
            "label": "MOL",
            "text": "DPPC",
            "start": 4,
            "end": 8
        },
        {
            "label": "MOL",
            "text": "Water",
            "start": 35,
            "end": 40
        },
        {
            "label": "TEMP",
            "text": "288K",
            "start": 55,
            "end": 59
        },
        {
            "label": "TEMP",
            "text": "358K",
            "start": 69,
            "end": 73
        },
        {
            "label": "MOL",
            "text": "dppc",
            "start": 577,
            "end": 581
        },
        {
            "label": "MOL",
            "text": "dppc",
            "start": 594,
            "end": 598
        },
        {
            "label": "TEMP",
            "text": "288K",
            "start": 686,
            "end": 690
        },
        {
            "label": "MOL",
            "text": "DPPC",
            "start": 775,
            "end": 779
        },
        {
            "label": "MOL",
            "text": "Water",
            "start": 815,
            "end": 820
        },
        {
            "label": "SOFTNAME",
            "text": "Gromacs",
            "start": 938,
            "end": 945
        },
        {
            "label": "SOFTVERS",
            "text": "(v. 2020)",
            "start": 946,
            "end": 955
        },
        {
            "label": "FFM",
            "text": "Charmm36 (v. June 2015)",
            "start": 968,
            "end": 991
        },
        {
            "label": "MOL",
            "text": "Water",
            "start": 994,
            "end": 999
        },
        {
            "label": "FFM",
            "text": "TIP3P",
            "start": 1001,
            "end": 1006
        },
        {
            "label": "STIME",
            "text": "25 ns",
            "start": 1146,
            "end": 1151
        },
        # Listed out of offset order: this span (695-699) comes before
        # several of the entries above in the text.
        {
            "label": "TEMP",
            "text": "358K",
            "start": 695,
            "end": 699
        }
    ]
}

In [10]:
# Extract entities from the second text with the same label descriptions.
# With both flags enabled, the result displayed below carries `confidence`,
# `start` and `end` for every predicted entity.
response = extractor.extract_entities(
    text_to_annotate,
    entities_class_with_description,
    include_confidence=True,
    include_spans=True
)
# Bare expression: displays the raw response as the cell output.
response

{'entities': {'MOL': [{'text': 'DPPC Molecules',
    'confidence': 0.9976617097854614,
    'start': 4,
    'end': 18},
   {'text': 'Water',
    'confidence': 0.9882898330688477,
    'start': 815,
    'end': 820}],
  'SOFTNAME': [{'text': 'Gromacs',
    'confidence': 0.9731000661849976,
    'start': 938,
    'end': 945}],
  'SOFTVERS': [{'text': 'Gromacs',
    'confidence': 0.7306555509567261,
    'start': 938,
    'end': 945}],
  'TEMP': [{'text': '358K',
    'confidence': 0.9999594688415527,
    'start': 695,
    'end': 699},
   {'text': '288K',
    'confidence': 0.999864935874939,
    'start': 686,
    'end': 690}],
  'FFM': [{'text': 'Charmm36',
    'confidence': 0.9886055588722229,
    'start': 968,
    'end': 976}],
  'STIME': [{'text': '25 ns',
    'confidence': 0.9996438026428223,
    'start': 1146,
    'end': 1151}]}}

In [11]:
# Print the groundtruth-vs-prediction table and the summary for example 2.
compare_entities(groundtruth, response)

| label_gt   | text_gt                 | label_predicted   | text_predicted   | true_predicted   |
|:-----------|:------------------------|:------------------|:-----------------|:-----------------|
| FFM        | Charmm36 (v. June 2015) |                   |                  | ‚ùå               |
| FFM        | TIP3P                   |                   |                  | ‚ùå               |
| MOL        | DPPC                    |                   |                  | ‚ùå               |
| MOL        | Water                   | MOL               | Water            | ‚úÖ               |
| MOL        | dppc                    |                   |                  | ‚ùå               |
| SOFTNAME   | Gromacs                 | SOFTNAME          | Gromacs          | ‚úÖ               |
| SOFTVERS   | (v. 2020)               |                   |                  | ‚ùå               |
| STIME      | 25 ns                   | STIME             | 25 ns            | ‚úÖ               |
| 

In [15]:
# Highlight the predicted entities of example 2 in the text.
print("LLM Response:")
visualize_llm_annotation(response, text_to_annotate)

LLM Response:
üßê VISUALIZATION OF ENTITIES





In [16]:
# Highlight the groundtruth entities of example 2 for visual comparison.
print("Groundtruth:")
visualize_llm_annotation(groundtruth, text_to_annotate)

Groundtruth:
üßê VISUALIZATION OF ENTITIES



