# Disclaimer
The GLiNER-based NER implementation in this notebook is provided for educational and demonstration purposes only.

To see a complete implementation tailored for the GutBrainIE NER task, please refer to the baseline system implementation provided by the organizers:

https://github.com/MMartinelli-hub/GutBrainIE_2025_Baseline

## Setup and Installation

In [None]:
# Install GLiNER if not already installed
# !pip install gliner -q

In [None]:
import json
import os
import numpy as np
from gliner import GLiNER
from gliner2 import GLiNER2
from tqdm import tqdm
from collections import Counter
import pandas as pd

# Confirm that the import cell ran to completion
for status_line in ("Setup complete", "GLiNER imported successfully"):
    print(status_line)

## Define Entity Labels

GLiNER can work with natural language descriptions of entity types.

In [None]:
# Entity types, phrased as natural-language descriptions used to prompt the models
ENTITY_LABELS = [
    "anatomical location",
    "animal",
    "bacteria",
    "biomedical technique",
    "chemical",
    "disease, disorder or finding",  # DDF
    "dietary supplement",
    "drug",
    "food",
    "gene",
    "human",
    "microbiome",
    "statistical technique"
]

# Prompt labels whose wording differs from the label used in the annotations
LABEL_MAPPING = {
    "disease, disorder or finding": "DDF"
}

def normalize_label(label):
    """Translate a prompt label into the annotation label.

    Labels without an entry in LABEL_MAPPING are returned unchanged.
    """
    if label in LABEL_MAPPING:
        return LABEL_MAPPING[label]
    return label

print(f"Entity labels ({len(ENTITY_LABELS)}):")
for entity_label in ENTITY_LABELS:
    print("  - " + entity_label)

## Data Loading Functions

In [None]:
def load_ner_data(file_paths):
    """
    Read several JSON files and merge their contents into one dictionary.

    Files are processed in the given order and merged with dict.update, so
    on a key collision the later file wins. A path that does not exist is
    reported with a warning and skipped.
    """
    merged = {}

    for path in file_paths:
        if not os.path.exists(path):
            print(f"Warning: {path} not found")
            continue

        with open(path, 'r', encoding='utf-8') as handle:
            documents = json.load(handle)
        merged.update(documents)
        print(f"Loaded {len(documents)} documents from {os.path.basename(path)}")

    return merged


print("✓ Data loading functions defined")

## Load Dev Data

We'll use the dev set for zero-shot evaluation (no training needed for GLiNER).

In [None]:
# Load the dev split through the shared loader
dev_paths = ["../data/Annotations/Dev/json_format/dev.json"]
dev_data = load_ner_data(dev_paths)

print(f"\nTotal dev documents: {len(dev_data)}")

# Add up the gold entity counts of all documents
entity_counts = [len(doc['entities']) for doc in dev_data.values()]
total_entities = sum(entity_counts)
print(f"Total entities in dev set: {total_entities}")

In [None]:
# Inspect the first document of the dev set
example_pmid = list(dev_data)[0]
example_article = dev_data[example_pmid]

print(f"Example document (PMID: {example_pmid}):")
example_meta = example_article['metadata']
print(f"  Title: {example_meta['title'][:100]}...")
print(f"  Abstract: {example_meta['abstract'][:150]}...")
example_entities = example_article['entities']
print(f"  Number of entities: {len(example_entities)}")
print("\nFirst 3 entities:")
for entity in example_entities[:3]:
    print(f"    - '{entity['text_span']}' [{entity['label']}] in {entity['location']}")

## Load GLiNER Models

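We'll load two zero-shot models for comparison: "GLiNER v1" (here the `numind/NuNER_Zero` checkpoint, loaded through the `gliner` package) and "GLiNER v2" (`fastino/gliner2-base-v1`, loaded through the `gliner2` package).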

In [None]:
# Load the model this notebook calls "GLiNER v1": the NuNER_Zero checkpoint,
# loaded through the GLiNER class
print("Loading GLiNER v1 (base model)...")
# Alternative checkpoint: "urchade/gliner_base"
model_v1_name = "numind/NuNER_Zero"
model_v1 = GLiNER.from_pretrained(model_v1_name)
print("✓ GLiNER v1 loaded")

for detail_line in (
    "\nModel details:",
    f"  Model name: {model_v1_name}",
    "  Type: Zero-shot NER",
):
    print(detail_line)

In [None]:
# Load the GLiNER2 checkpoint used as "GLiNER v2" in this notebook
print("Loading GLiNER v2 (multi-task model)...")
model_v2_name = "fastino/gliner2-base-v1"
model_v2 = GLiNER2.from_pretrained(model_v2_name)
print("✓ GLiNER v2 loaded")

for detail_line in (
    "\nModel details:",
    f"  Model name: {model_v2_name}",
    "  Type: Zero-shot NER (improved)",
):
    print(detail_line)

## Test GLiNER on Sample Text

In [None]:
# Run both models on one sentence to compare their raw output formats
sample_text = "The gut microbiome has been shown to have key implications in the pathogenesis of Parkinson's disease."
separator = "\n" + "="*60

print("Sample text:")
print(f"  {sample_text}")
print(separator)

# v1 returns a flat list of span dicts carrying text, label and score
print("\nGLiNER v1 predictions:")
entities_v1 = model_v1.predict_entities(sample_text, ENTITY_LABELS, threshold=0.3)
for entity in entities_v1:
    print(f"  - '{entity['text']}' [{entity['label']}] (score: {entity['score']:.3f})")

print(separator)

# v2 returns {'entities': {label: [text, ...]}}
print("\nGLiNER v2 predictions:")
entities_v2 = model_v2.extract_entities(sample_text, ENTITY_LABELS, threshold=0.3)
for label, entities in entities_v2['entities'].items():
    if len(entities) > 0:
        # Each text is quoted and followed by a comma, including the last one
        entities_print = "".join(f"'{ent}'," for ent in entities)
        print(f"  - '{label}': {entities_print}")
    else:
        print(f"  - '{label}': []")

## Inference Function

In [None]:
def predict_entities_gliner_v1(model, text, labels, threshold=0.3, location="abstract"):
    """
    Run a GLiNER v1 model on `text` and reshape its spans for evaluation.

    Every span returned by `model.predict_entities` becomes a dict with the
    keys start_idx, end_idx, text_span, label (passed through
    normalize_label), location (the value given by the caller) and score.
    """
    raw_spans = model.predict_entities(text, labels, threshold=threshold)

    # Rename the model's keys to the field names used by the evaluation code
    return [
        {
            'start_idx': span['start'],
            'end_idx': span['end'],
            'text_span': span['text'],
            'label': normalize_label(span['label']),
            'location': location,
            'score': span['score']
        }
        for span in raw_spans
    ]


def predict_entities_gliner_v2(model, text, labels, threshold=0.3, location="abstract"):
    """
    Predict entities using GLiNER v2 model.
    Returns entities in the format expected by evaluation.

    `model.extract_entities` is read as {'entities': {label: [text, ...]}},
    i.e. it yields entity strings without character offsets. Offsets are
    recovered here by locating every non-overlapping occurrence of each
    entity string in `text`, so all mentions of a predicted string are
    emitted, each as its own span.

    Returns a list of dicts with the keys start_idx, end_idx, text_span,
    label (passed through normalize_label), location and score.
    """
    # Get predictions from GLiNER v2
    result = model.extract_entities(text, labels, threshold=threshold)

    # Convert to expected format
    formatted_entities = []
    seen_spans = set()  # guards against the same text being listed twice for a label
    for label, entity_texts in result['entities'].items():
        for entity_text in entity_texts:
            if not entity_text:
                # An empty string matches at every offset and denotes no span
                continue

            search_from = 0
            while True:
                start_idx = text.find(entity_text, search_from)
                if start_idx == -1:
                    break
                end_idx = start_idx + len(entity_text)
                # Resume after this match; searching again from the same
                # offset would return the same hit and never terminate
                search_from = end_idx

                span = (start_idx, end_idx, label)
                if span in seen_spans:
                    continue
                seen_spans.add(span)

                formatted_entities.append({
                    'start_idx': start_idx,
                    'end_idx': end_idx,
                    'text_span': entity_text,
                    'label': normalize_label(label),
                    'location': location,
                    'score': 1.0  # GLiNER v2 doesn't provide scores in the same way
                })
    return formatted_entities

print("✓ Inference functions defined")

## Predict on Dev Set - GLiNER v1

In [None]:
# Predict entities for every dev document with GLiNER v1
print("Running inference with GLiNER v1 on dev set...")
print("Note: This may take several minutes\n")

predictions_v1 = {}
threshold_v1 = 0.3  # Adjust threshold as needed

for pmid, article in tqdm(dev_data.items(), desc="GLiNER v1 inference"):
    all_entities = []

    # Title first, then abstract; each span is tagged with the section it came from
    for section in ('title', 'abstract'):
        section_text = article['metadata'][section]
        all_entities.extend(
            predict_entities_gliner_v1(
                model_v1, section_text, ENTITY_LABELS,
                threshold=threshold_v1, location=section
            )
        )

    predictions_v1[pmid] = {'entities': all_entities}

total_entities_v1 = sum(len(doc['entities']) for doc in predictions_v1.values())
print("\n✓ GLiNER v1 inference completed")
print(f"  Total entities predicted: {total_entities_v1}")
print(f"  Threshold used: {threshold_v1}")

## Save GLiNER v1 Predictions

In [None]:
# Save GLiNER v1 predictions
output_path_v1 = "predictions/gliner_v1_predictions_zero_shot.json"

# open() in write mode does not create missing parent directories
os.makedirs(os.path.dirname(output_path_v1), exist_ok=True)

with open(output_path_v1, "w", encoding="utf-8") as f:
    json.dump(predictions_v1, f, ensure_ascii=False, indent=2)

print(f"GLiNER v1 predictions saved to {output_path_v1}")

## Predict on Dev Set - GLiNER v2

In [None]:
# Predict entities for every dev document with GLiNER v2
print("Running inference with GLiNER v2 on dev set...")
print("Note: This may take several minutes\n")

predictions_v2 = {}
threshold_v2 = 0.3  # Adjust threshold as needed

for pmid, article in tqdm(dev_data.items(), desc="GLiNER v2 inference"):
    all_entities = []

    # Title first, then abstract; each span is tagged with the section it came from
    for section in ('title', 'abstract'):
        section_text = article['metadata'][section]
        all_entities.extend(
            predict_entities_gliner_v2(
                model_v2, section_text, ENTITY_LABELS,
                threshold=threshold_v2, location=section
            )
        )

    predictions_v2[pmid] = {'entities': all_entities}

total_entities_v2 = sum(len(doc['entities']) for doc in predictions_v2.values())
print("\n✓ GLiNER v2 inference completed")
print(f"  Total entities predicted: {total_entities_v2}")
print(f"  Threshold used: {threshold_v2}")

## Save GLiNER v2 Predictions

In [None]:
# Save GLiNER v2 predictions
output_path_v2 = "predictions/gliner_v2_predictions_zero_shot.json"

# open() in write mode does not create missing parent directories
os.makedirs(os.path.dirname(output_path_v2), exist_ok=True)

with open(output_path_v2, "w", encoding="utf-8") as f:
    json.dump(predictions_v2, f, ensure_ascii=False, indent=2)

print(f"GLiNER v2 predictions saved to {output_path_v2}")

## Evaluation Functions

In [None]:
def remove_duplicated_entities(predictions):
    """Drop repeated spans from every document, keeping the first occurrence.

    Two entities are duplicates when they share start index, end index and
    location; the label is not compared. `predictions` is modified in place
    and a summary line is printed when anything was removed.
    """
    dropped = 0
    for pmid in list(predictions.keys()):
        entities = predictions[pmid]["entities"]

        # setdefault keeps the first entity seen for each span, and the dict
        # preserves the order in which spans first appeared
        first_by_span = {}
        for ent in entities:
            first_by_span.setdefault(
                (ent["start_idx"], ent["end_idx"], ent["location"]), ent
            )

        dropped += len(entities) - len(first_by_span)
        predictions[pmid]["entities"] = list(first_by_span.values())

    if dropped > 0:
        print(f"Removed {dropped} duplicated entities from predictions")

def remove_overlapping_entities_eval(predictions):
    """Remove overlapping entities, keeping longest spans."""
    removed_count = 0

    for pmid in list(predictions.keys()):
        original_len = len(predictions[pmid]['entities'])
        
        groups = {'title': [], 'abstract': []}
        for ent in predictions[pmid]['entities']:
            loc = ent["location"]
            groups[loc].append(ent)

        keepers = set()
        for loc in groups:
            group = groups[loc]
            group = sorted(group, key=lambda e: e["start_idx"])

            clusters = []
            cluster = []
            current_end = None

            for ent in group:
                if not cluster:
                    cluster = [ent]
                    current_end = ent["end_idx"]
                else:
                    if ent["start_idx"] < current_end:
                        cluster.append(ent)
                        if ent["end_idx"] > current_end:
                            current_end = ent["end_idx"]
                    else:
                        clusters.append(cluster)
                        cluster = [ent]
                        current_end = ent["end_idx"]
            if cluster:
                clusters.append(cluster)

            for clust in clusters:
                longest = clust[0]
                max_len = longest["end_idx"] - longest["start_idx"]
                for ent in clust[1:]:
                    length = ent["end_idx"] - ent["start_idx"]
                    if length > max_len:
                        longest = ent
                        max_len = length
                keepers.add((longest["start_idx"],
                             longest["end_idx"],
                             longest["location"]))

        deduped = []
        for ent in predictions[pmid]['entities']:
            key = (ent["start_idx"], ent["end_idx"], ent["location"])
            if key in keepers:
                deduped.append(ent)
                keepers.remove(key)

        predictions[pmid]["entities"] = deduped
        removed_count += (original_len - len(deduped))

    if removed_count > 0:
        print(f"Removed {removed_count} overlapping entities")

def evaluate_ner(predictions, ground_truth):
    """Score NER predictions against gold annotations using exact matching.

    A prediction is a true positive when its (start_idx, end_idx, location,
    text_span, label) tuple equals a gold entity of the same document.
    Duplicated and overlapping predicted spans are removed before scoring;
    this clean-up runs on an internal copy, so the caller's `predictions`
    are left untouched.

    Args:
        predictions: mapping pmid -> {"entities": [entity dict, ...]}.
        ground_truth: mapping pmid -> article dict with an "entities" list.

    Returns:
        Tuple (macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1). Macro values are averaged
        over the labels that occur in the ground truth; all six values are
        0.0 when the ground truth contains no entities.
    """
    # The clean-up helpers rebind each document's "entities" list. Callers
    # typically pass a shallow dict copy, which shares the per-document dicts,
    # so copy two levels deep to keep their data intact.
    predictions = {
        pmid: {**doc, 'entities': list(doc['entities'])}
        for pmid, doc in predictions.items()
    }

    # Remove duplicated and overlapping entities
    remove_duplicated_entities(predictions)
    remove_overlapping_entities_eval(predictions)

    LEGAL_ENTITY_LABELS = [
        "anatomical location", "animal", "bacteria", "biomedical technique",
        "chemical", "DDF", "dietary supplement", "drug", "food", "gene",
        "human", "microbiome", "statistical technique"
    ]

    # Gold entries per document, kept as sets for constant-time membership tests
    ground_truth_NER = dict()
    count_annotated_entities_per_label = {}

    for pmid, article in ground_truth.items():
        gold_entries = ground_truth_NER.setdefault(pmid, set())
        for entity in article['entities']:
            start_idx = int(entity["start_idx"])
            end_idx = int(entity["end_idx"])
            location = str(entity["location"])
            text_span = str(entity["text_span"])
            label = str(entity["label"])

            gold_entries.add((start_idx, end_idx, location, text_span, label))

            # Every annotation counts, including repeated identical entries
            count_annotated_entities_per_label[label] = (
                count_annotated_entities_per_label.get(label, 0) + 1
            )

    count_predicted_entities_per_label = {label: 0 for label in count_annotated_entities_per_label}
    count_true_positives_per_label = {label: 0 for label in count_annotated_entities_per_label}

    for pmid in predictions.keys():
        entities = predictions[pmid]['entities']

        for entity in entities:
            start_idx = int(entity["start_idx"])
            end_idx = int(entity["end_idx"])
            location = str(entity["location"])
            text_span = str(entity["text_span"])
            label = str(entity["label"])

            if label not in LEGAL_ENTITY_LABELS:
                continue

            # Only labels that occur in the gold data are tallied as predicted
            if label in count_predicted_entities_per_label:
                count_predicted_entities_per_label[label] += 1

            entry = (start_idx, end_idx, location, text_span, label)
            if pmid in ground_truth_NER and entry in ground_truth_NER[pmid]:
                count_true_positives_per_label[label] += 1

    count_annotated_entities = sum(count_annotated_entities_per_label.values())
    count_predicted_entities = sum(count_predicted_entities_per_label.values())
    count_true_positives = sum(count_true_positives_per_label.values())

    # The 1e-10 terms keep the ratios defined when a denominator is zero
    micro_precision = count_true_positives / (count_predicted_entities + 1e-10)
    micro_recall = count_true_positives / (count_annotated_entities + 1e-10)
    micro_f1 = 2 * ((micro_precision * micro_recall) / (micro_precision + micro_recall + 1e-10))

    n = len(count_annotated_entities_per_label)
    if n == 0:
        # No gold labels: there is nothing to average over
        return 0.0, 0.0, 0.0, micro_precision, micro_recall, micro_f1

    precision, recall, f1 = 0, 0, 0
    for label in count_annotated_entities_per_label:
        current_precision = count_true_positives_per_label[label] / (count_predicted_entities_per_label[label] + 1e-10)
        current_recall = count_true_positives_per_label[label] / (count_annotated_entities_per_label[label] + 1e-10)

        precision += current_precision
        recall += current_recall
        f1 += 2 * ((current_precision * current_recall) / (current_precision + current_recall + 1e-10))

    precision = precision / n
    recall = recall / n
    f1 = f1 / n

    return precision, recall, f1, micro_precision, micro_recall, micro_f1


print("✓ Evaluation functions defined")

## Evaluate GLiNER v1 Performance

In [None]:
# Evaluate GLiNER v1
import json
print("Evaluating GLiNER v1...\n")

# Prefer the eval-format predictions file when it is present. No cell shown in
# this notebook writes it, so otherwise score the in-memory predictions from
# the inference cell, as the GLiNER v2 evaluation does.
eval_predictions_path_v1 = "predictions/gliner_v1_predictions_zero_shot_eval_format.json"
if os.path.exists(eval_predictions_path_v1):
    # The context manager closes the handle once the JSON has been parsed
    with open(eval_predictions_path_v1, "r", encoding="utf-8") as f:
        predictions_v1 = json.load(f)

precision_v1, recall_v1, f1_v1, micro_precision_v1, micro_recall_v1, micro_f1_v1 = evaluate_ner(
    predictions_v1.copy(), dev_data
)

print("="*60)
print("GLiNER v1 RESULTS")
print("="*60)
print("\nMacro-averaged Metrics:")
print(f"  Macro-Precision: {precision_v1:.4f}")
print(f"  Macro-Recall:    {recall_v1:.4f}")
print(f"  Macro-F1 Score:  {f1_v1:.4f}")

print("\nMicro-averaged Metrics:")
print(f"  Micro-Precision: {micro_precision_v1:.4f}")
print(f"  Micro-Recall:    {micro_recall_v1:.4f}")
print(f"  Micro-F1 Score:  {micro_f1_v1:.4f}")
print("="*60)

## Evaluate GLiNER v2 Performance

In [None]:
# Score the in-memory GLiNER v2 predictions against the dev annotations
print("Evaluating GLiNER v2...\n")
results_v2 = evaluate_ner(predictions_v2.copy(), dev_data)
(precision_v2, recall_v2, f1_v2,
 micro_precision_v2, micro_recall_v2, micro_f1_v2) = results_v2

rule_v2 = "="*60
print(rule_v2)
print("GLiNER v2 RESULTS")
print(rule_v2)

# Macro: unweighted mean over labels
print("\nMacro-averaged Metrics:")
print(f"  Macro-Precision: {precision_v2:.4f}")
print(f"  Macro-Recall:    {recall_v2:.4f}")
print(f"  Macro-F1 Score:  {f1_v2:.4f}")

# Micro: computed from counts pooled across labels
print("\nMicro-averaged Metrics:")
print(f"  Micro-Precision: {micro_precision_v2:.4f}")
print(f"  Micro-Recall:    {micro_recall_v2:.4f}")
print(f"  Micro-F1 Score:  {micro_f1_v2:.4f}")
print(rule_v2)

## Compare GLiNER v1 vs v2

In [None]:
# Side-by-side table of both models; "Improvement" is v2 minus v1
metric_names = [
    'Macro-Precision', 'Macro-Recall', 'Macro-F1',
    'Micro-Precision', 'Micro-Recall', 'Micro-F1'
]
v1_scores = [precision_v1, recall_v1, f1_v1,
             micro_precision_v1, micro_recall_v1, micro_f1_v1]
v2_scores = [precision_v2, recall_v2, f1_v2,
             micro_precision_v2, micro_recall_v2, micro_f1_v2]

# All values are pre-formatted to four decimals so the table prints uniformly
comparison_data = {
    'Metric': metric_names,
    'GLiNER v1': [f"{score:.4f}" for score in v1_scores],
    'GLiNER v2': [f"{score:.4f}" for score in v2_scores],
    'Improvement': [
        f"{(after - before):.4f}" for before, after in zip(v1_scores, v2_scores)
    ],
}

df_comparison = pd.DataFrame(comparison_data)

comparison_rule = "="*60
print("\n" + comparison_rule)
print("GLiNER v1 vs v2 COMPARISON")
print(comparison_rule)
print(df_comparison.to_string(index=False))
print(comparison_rule)

## Example Predictions - GLiNER v1

In [None]:
# Walk through the first three dev documents and compare GLiNER v1 output with gold
print("Example Predictions - GLiNER v1:\n")

sample_pmids = list(dev_data)[:3]
# Fields that must all agree for a prediction to count as correct
span_fields = ('start_idx', 'end_idx', 'location', 'text_span', 'label')

for pmid in sample_pmids:
    article = dev_data[pmid]
    pred = predictions_v1[pmid]

    print(f"Document PMID: {pmid}")
    print(f"Title: {article['metadata']['title'][:80]}...")
    print(f"\nGold entities: {len(article['entities'])}")
    print(f"Predicted entities: {len(pred['entities'])}")

    # Show first few predicted entities
    print("\nSample predictions:")
    for entity in pred['entities'][:5]:
        if 'score' in entity:
            score_str = f" (score: {entity['score']:.2f})"
        else:
            score_str = ""
        print(f"  - '{entity['text_span']}' [{entity['label']}]{score_str} in {entity['location']}")

    # Exact-match statistics on the full span tuples
    gold_set = {
        tuple(entity[field] for field in span_fields)
        for entity in article['entities']
    }
    pred_set = {
        tuple(entity[field] for field in span_fields)
        for entity in pred['entities']
    }

    correct = len(gold_set.intersection(pred_set))
    missed = len(gold_set.difference(pred_set))
    wrong = len(pred_set.difference(gold_set))

    print(f"\n✓ Correct: {correct}")
    print(f"✗ Missed: {missed}")
    print(f"✗ Wrong: {wrong}")
    print("-" * 80)
    print()

## Example Predictions - GLiNER v2

In [None]:
# Walk through the first three dev documents and compare GLiNER v2 output with gold
print("Example Predictions - GLiNER v2:\n")

sample_pmids = list(dev_data)[:3]
# Fields that must all agree for a prediction to count as correct
span_fields = ('start_idx', 'end_idx', 'location', 'text_span', 'label')

for pmid in sample_pmids:
    article = dev_data[pmid]
    pred = predictions_v2[pmid]

    print(f"Document PMID: {pmid}")
    print(f"Title: {article['metadata']['title'][:80]}...")
    print(f"\nGold entities: {len(article['entities'])}")
    print(f"Predicted entities: {len(pred['entities'])}")

    # Show first few predicted entities
    print("\nSample predictions:")
    for entity in pred['entities'][:5]:
        if 'score' in entity:
            score_str = f" (score: {entity['score']:.2f})"
        else:
            score_str = ""
        print(f"  - '{entity['text_span']}' [{entity['label']}]{score_str} in {entity['location']}")

    # Exact-match statistics on the full span tuples
    gold_set = {
        tuple(entity[field] for field in span_fields)
        for entity in article['entities']
    }
    pred_set = {
        tuple(entity[field] for field in span_fields)
        for entity in pred['entities']
    }

    correct = len(gold_set.intersection(pred_set))
    missed = len(gold_set.difference(pred_set))
    wrong = len(pred_set.difference(gold_set))

    print(f"\n✓ Correct: {correct}")
    print(f"✗ Missed: {missed}")
    print(f"✗ Wrong: {wrong}")
    print("-" * 80)
    print()

## Entity Distribution Analysis

In [None]:
# Per-label entity counts in the gold standard
gold_label_counts = Counter(
    entity['label']
    for article in dev_data.values()
    for entity in article['entities']
)

# Per-label entity counts in the GLiNER v1 predictions
pred_v1_label_counts = Counter(
    entity['label']
    for pred in predictions_v1.values()
    for entity in pred['entities']
)

# Per-label entity counts in the GLiNER v2 predictions
pred_v2_label_counts = Counter(
    entity['label']
    for pred in predictions_v2.values()
    for entity in pred['entities']
)

print("Entity Distribution by Label:")
print("="*80)
print(f"{'Label':<25} {'Gold':<12} {'GLiNER v1':<12} {'GLiNER v2':<12}")
print("-"*80)

# One row per label seen in any source; a Counter yields 0 for absent labels
all_labels = set().union(gold_label_counts, pred_v1_label_counts, pred_v2_label_counts)
for label in sorted(all_labels):
    gold_n = gold_label_counts[label]
    v1_n = pred_v1_label_counts[label]
    v2_n = pred_v2_label_counts[label]
    print(f"{label:<25} {gold_n:<12} {v1_n:<12} {v2_n:<12}")

print("-"*80)
gold_total = sum(gold_label_counts.values())
v1_total = sum(pred_v1_label_counts.values())
v2_total = sum(pred_v2_label_counts.values())
print(f"{'TOTAL':<25} {gold_total:<12} {v1_total:<12} {v2_total:<12}")

## Threshold Tuning (Optional)

The GLiNER confidence threshold can be adjusted to trade precision against recall.

In [None]:
# Sweep the GLiNER v2 confidence threshold on a subset of the dev set
print("Testing different confidence thresholds for GLiNER v2...\n")

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
threshold_results = []

# At most the first 50 documents, to keep the sweep fast
subset_size = min(50, len(dev_data))
subset_pmids = list(dev_data)[:subset_size]
subset_data = {pmid: dev_data[pmid] for pmid in subset_pmids}

for threshold in thresholds:
    print(f"Testing threshold {threshold}...")

    predictions_test = {}
    for pmid, article in subset_data.items():
        all_entities = []
        # Title first, then abstract; each span is tagged with its section
        for section in ('title', 'abstract'):
            section_text = article['metadata'][section]
            all_entities.extend(
                predict_entities_gliner_v2(
                    model_v2, section_text, ENTITY_LABELS,
                    threshold=threshold, location=section
                )
            )
        predictions_test[pmid] = {'entities': all_entities}

    # Evaluate
    p, r, f1, mp, mr, mf1 = evaluate_ner(predictions_test.copy(), subset_data)

    threshold_results.append(dict(
        threshold=threshold,
        macro_f1=f1,
        micro_f1=mf1,
        macro_precision=p,
        macro_recall=r,
    ))

# Display results
df_thresholds = pd.DataFrame(threshold_results)
print("\n" + "="*60)
print("THRESHOLD TUNING RESULTS (on subset)")
print("="*60)
print(df_thresholds.to_string(index=False))
best_row = df_thresholds['micro_f1'].idxmax()
print("\nBest threshold by Micro-F1:", df_thresholds.loc[best_row, 'threshold'])

## Summary and Conclusions

### Key Observations:

1. **Zero-shot Performance**: GLiNER can perform NER without any training on the target dataset
2. **Version Comparison**: GLiNER v2 typically shows improved performance over v1
3. **Threshold Impact**: The confidence threshold significantly affects precision/recall trade-off
4. **Advantages**:
   - No training required
   - Works with arbitrary entity labels
   - Fast inference
   - Easy to deploy

5. **Limitations**:
   - May not match fine-tuned BERT performance
   - Domain-specific terminology might be challenging
   - Threshold tuning needed for optimal results

### Next Steps:
- Fine-tune GLiNER on training data for better performance
- Experiment with different entity label formulations
- Combine with other models in an ensemble
- Try domain-specific GLiNER variants if available