# Vanilla Term Extraction Baseline

Simple word-boundary matching baseline:
- Extract all unique terms from training data (lowercased)
- Match terms in dev sentences using word boundaries
- No machine learning, just exact substring matching

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

In [2]:
import pandas as pd
import json
import os

data_path = "../data"
train_data_path = os.path.join(data_path, "subtask_a_train.json")

with open(train_data_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)

terms = set()
for entry in train_data['data']:
    for term in entry['term_list']:
        terms.add(term.lower())

terms = sorted(terms)
print(len(terms))

713


## Load Training Data and Extract Terms

In [3]:
import re
from tqdm import tqdm

dev_data_path = "subtask_a_dev.json"

with open(os.path.join(data_path, dev_data_path), "r", encoding="utf-8") as f:
    dev_data = json.load(f)

dump_pred = {"data": []}

for entry in tqdm(dev_data['data'], desc="Processing dev data", total=len(dev_data['data'])):
    append_dict = {
        "document_id": entry['document_id'],
        "paragraph_id": entry['paragraph_id'],
        "sentence_id": entry['sentence_id'],
        "term_list": []
    }
    sentence_text = entry['sentence_text'].lower()
    for term in terms:
        # Use word boundary matching to find complete words only
        pattern = r'\b' + re.escape(term) + r'\b'
        if re.search(pattern, sentence_text):
            append_dict['term_list'].append(term)

    dump_pred['data'].append(append_dict)

with open("predictions/subtask_a_dev_vanilla_preds.json", "w", encoding="utf-8") as f:
    json.dump(dump_pred, f, ensure_ascii=False, indent=4)

Processing dev data:   0%|          | 0/577 [00:00<?, ?it/s]

Processing dev data: 100%|██████████| 577/577 [00:13<00:00, 42.49it/s]


## Predict on Dev Set

## Evaluation Functions

Using the official competition evaluation metrics.

In [4]:
def micro_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Precision, Recall, and F1 score 
    based on individual term matching (micro-average).
    """
    total_true_positives = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for gold, system in zip(gold_standard, system_output):
        gold_set = set(gold)
        system_set = set(system)
        
        true_positives = len(gold_set.intersection(system_set))
        false_positives = len(system_set - gold_set)
        false_negatives = len(gold_set - system_set)
        
        total_true_positives += true_positives
        total_false_positives += false_positives
        total_false_negatives += false_negatives
    
    precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
    recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1, total_true_positives, total_false_positives, total_false_negatives


def type_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Type Precision, Type Recall, and Type F1 score
    based on the set of unique terms extracted at least once across the entire dataset.
    """
    all_gold_terms = set()
    for item_terms in gold_standard:
        all_gold_terms.update(item_terms)
    
    all_system_terms = set()
    for item_terms in system_output:
        all_system_terms.update(item_terms)
    
    type_true_positives = len(all_gold_terms.intersection(all_system_terms))
    type_false_positives = len(all_system_terms - all_gold_terms)
    type_false_negatives = len(all_gold_terms - all_system_terms)
    
    type_precision = type_true_positives / (type_true_positives + type_false_positives) if (type_true_positives + type_false_positives) > 0 else 0
    type_recall = type_true_positives / (type_true_positives + type_false_negatives) if (type_true_positives + type_false_negatives) > 0 else 0
    type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall) if (type_precision + type_recall) > 0 else 0
    
    return type_precision, type_recall, type_f1

## Evaluate Performance

In [5]:
# Prepare gold standard and predictions for evaluation
gold_standard = []
system_output = []

# Build mapping from dev_data
dev_gold_map = {}
for entry in dev_data['data']:
    key = (entry['document_id'], entry['paragraph_id'], entry['sentence_id'])
    if key not in dev_gold_map:
        dev_gold_map[key] = entry['term_list']

# Build mapping from predictions
pred_map = {}
for entry in dump_pred['data']:
    key = (entry['document_id'], entry['paragraph_id'], entry['sentence_id'])
    pred_map[key] = entry['term_list']

# Align gold and predictions
for key in sorted(dev_gold_map.keys()):
    gold_standard.append(dev_gold_map[key])
    system_output.append(pred_map.get(key, []))

# Calculate metrics
precision, recall, f1, tp, fp, fn = micro_f1_score(gold_standard, system_output)
type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)

print("="*60)
print("VANILLA BASELINE RESULTS")
print("="*60)
print("\nMicro-averaged Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")

print("\nType-level Metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")
print("="*60)

VANILLA BASELINE RESULTS

Micro-averaged Metrics:
  Precision: 0.4280
  Recall:    0.7384
  F1 Score:  0.5419
  TP=333, FP=445, FN=118

Type-level Metrics:
  Type Precision: 0.6570
  Type Recall:    0.5620
  Type F1 Score:  0.6058


## Example Predictions

In [6]:
# Show a few example predictions
print("Example Predictions:\n")
for i in range(min(30, len(system_output))):
    if len(gold_standard[i]) > 0:  # Only show examples with gold terms
        entry = dev_data['data'][i]
        print(f"Sentence: {entry['sentence_text']}")
        print(f"Gold terms: {gold_standard[i][:5]}")  # Show first 5
        print(f"Predicted terms: {system_output[i][:5]}")  # Show first 5
        
        correct = set(gold_standard[i]) & set(system_output[i])
        missed = set(gold_standard[i]) - set(system_output[i])
        wrong = set(system_output[i]) - set(gold_standard[i])
        
        print(f"✓ Correct: {len(correct)}")
        print(f"✗ Missed: {len(missed)}")
        print(f"✗ Wrong: {len(wrong)}")
        print("-"*80)
        print()

Example Predictions:

Sentence: - giornali; - la carta per alimenti;
Gold terms: ['servizio di gestione integrata dei rifiuti']
Predicted terms: ['rifiuti']
✓ Correct: 0
✗ Missed: 1
✗ Wrong: 1
--------------------------------------------------------------------------------

Sentence: Indumenti usati, accessori, lenzuola, coperte, asciugamani, tende, tovaglie e tutto ciò che è realizzato in tessuto.
Gold terms: ['servizio di igiene urbana']
Predicted terms: ['igiene urbana', 'servizio di igiene urbana']
✓ Correct: 1
✗ Missed: 0
✗ Wrong: 1
--------------------------------------------------------------------------------

Sentence: OLI E GRASSI VEGETALI ESAUSTI
Gold terms: ['conferire']
Predicted terms: ['conferire']
✓ Correct: 1
✗ Missed: 0
✗ Wrong: 0
--------------------------------------------------------------------------------

Sentence: - abbandono del luogo solo dopo essersi assicurati che le fiamme siano state completamente spente;
Gold terms: ['parte variabile']
Predicted terms: [