# BERT Token Classification for Italian Term Extraction

This notebook demonstrates a BERT-based approach to term extraction:
- Uses BIO tagging scheme (Beginning-Inside-Outside)
- Fine-tunes Italian BERT model for token classification
- Trains on labeled data to recognize term boundaries

Dataset: EVALITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

## Setup and Imports

In [None]:
import json
import os
import time

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Seed the PyTorch and NumPy global RNGs with the same value for reproducibility
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

# Environment summary so a reader can tell what the recorded outputs ran on
print("Setup complete")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


Setup complete
PyTorch version: 2.7.1+cu118
CUDA available: True


In [2]:
# BIO tagging scheme: a label's position in label_list is its class id
label_list = ['O', 'B-TERM', 'I-TERM']
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Labels: {label_list}")
print(f"Label to ID: {label2id}")

# Checkpoint to fine-tune and the directory the fine-tuned model is written to
model_name = "dbmdz/bert-base-italian-uncased"
output_model_dir = "models/bert_token_classification"

print(f"\nModel: {model_name}")
print(f"Output directory: {output_model_dir}")

Labels: ['O', 'B-TERM', 'I-TERM']
Label to ID: {'O': 0, 'B-TERM': 1, 'I-TERM': 2}

Model: dbmdz/bert-base-italian-uncased
Output directory: models/bert_token_classification


## Data Loading and Processing

In [3]:
def load_jsonl(path: str):
    """Read `path` as a single JSON document, falling back to JSON Lines.

    The stripped file content is first parsed as one JSON value. Only if that
    raises `json.JSONDecodeError` is each non-blank line parsed as its own
    JSON value and the results returned as a list. An empty or
    whitespace-only file yields an empty list.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        raw = handle.read().strip()

    if not raw:
        return []

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Not one JSON document: treat every non-blank line as a record
        stripped_lines = (line.strip() for line in raw.splitlines())
        return [json.loads(line) for line in stripped_lines if line]


def build_sentence_gold_map(records):
    """Group dataset rows by sentence and collect each sentence's distinct terms.

    `records` is either an iterable of row dicts or a dict holding the rows
    under a 'data' key. Rows sharing the same
    (document_id, paragraph_id, sentence_id) are merged into one entry whose
    'sentence_text' comes from the first such row. Terms are read from a row's
    'term_list' when that is a list, otherwise from its 'term' field; falsy
    terms are dropped and duplicates are kept only once, in first-seen order.

    Returns a list with one dict per sentence, in first-seen order.
    """
    is_wrapped = isinstance(records, dict) and 'data' in records
    rows = records['data'] if is_wrapped else records

    by_sentence = {}
    for row in rows:
        sentence_key = (
            row.get('document_id'),
            row.get('paragraph_id'),
            row.get('sentence_id'),
        )
        entry = by_sentence.get(sentence_key)
        if entry is None:
            entry = {
                'document_id': row.get('document_id'),
                'paragraph_id': row.get('paragraph_id'),
                'sentence_id': row.get('sentence_id'),
                'sentence_text': row.get('sentence_text', ''),
                'terms': []
            }
            by_sentence[sentence_key] = entry

        # A list-valued 'term_list' takes precedence over the single 'term' field
        term_list = row.get('term_list')
        candidates = term_list if isinstance(term_list, list) else [row.get('term')]

        collected = entry['terms']
        for candidate in candidates:
            if candidate and candidate not in collected:
                collected.append(candidate)

    return list(by_sentence.values())


print("✓ Data loading functions defined")

✓ Data loading functions defined


In [5]:
# Read each split from disk and collapse its rows into one entry per sentence
train_data = load_jsonl('../data/subtask_a_train.json')
train_sentences = build_sentence_gold_map(train_data)

dev_data = load_jsonl('../data/subtask_a_dev.json')
dev_sentences = build_sentence_gold_map(dev_data)

print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")

# Show one training sentence together with its gold terms
example_sentence = train_sentences[6]
print(f"\nExample sentence:")
print(f"  Text: {example_sentence['sentence_text']}")
print(f"  Terms: {example_sentence['terms']}")

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: AFFIDAMENTO DEL “SERVIZIO DI SPAZZAMENTO, RACCOLTA, TRASPORTO E SMALTIMENTO/RECUPERO DEI RIFIUTI URBANI ED ASSIMILATI E SERVIZI COMPLEMENTARI DELLA CITTA' DI AGROPOLI” VALEVOLE PER UN QUINQUENNIO
  Terms: ['raccolta', 'recupero', 'servizio di raccolta', 'servizio di spazzamento', 'smaltimento', 'trasporto']


## Evaluation Metrics

Using the official evaluation metrics from the competition.

In [6]:
def micro_f1_score(gold_standard, system_output):
    """
    Micro-averaged Precision, Recall and F1 over per-item term sets.

    `gold_standard` and `system_output` are parallel sequences of term
    collections. Each pair is compared as sets, and the true-positive,
    false-positive and false-negative counts are summed over all pairs before
    the ratios are computed. A ratio whose denominator is zero is reported as 0.

    Returns (precision, recall, f1, true_positives, false_positives,
    false_negatives).
    """
    tp_total = fp_total = fn_total = 0

    for gold_terms, system_terms in zip(gold_standard, system_output):
        gold_set, system_set = set(gold_terms), set(system_terms)
        tp_total += len(gold_set & system_set)
        fp_total += len(system_set - gold_set)
        fn_total += len(gold_set - system_set)

    n_predicted = tp_total + fp_total
    n_expected = tp_total + fn_total

    precision = tp_total / n_predicted if n_predicted > 0 else 0
    recall = tp_total / n_expected if n_expected > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return precision, recall, f1, tp_total, fp_total, fn_total


def type_f1_score(gold_standard, system_output):
    """
    Type-level Precision, Recall and F1.

    All gold terms are pooled into one set and all system terms into another,
    so a term counts once no matter how many items it occurs in. The two
    pooled sets are then compared. A ratio whose denominator is zero is
    reported as 0.

    Returns (type_precision, type_recall, type_f1).
    """
    gold_types = set()
    for item_terms in gold_standard:
        gold_types |= set(item_terms)

    system_types = set()
    for item_terms in system_output:
        system_types |= set(item_terms)

    hits = len(gold_types & system_types)
    spurious = len(system_types - gold_types)
    missed = len(gold_types - system_types)

    type_precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0
    type_recall = hits / (hits + missed) if (hits + missed) > 0 else 0

    pr_sum = type_precision + type_recall
    type_f1 = 2 * (type_precision * type_recall) / pr_sum if pr_sum > 0 else 0

    return type_precision, type_recall, type_f1


print("✓ Evaluation functions defined")

✓ Evaluation functions defined


## Initialize BERT Model and Tokenizer

In [9]:
# Initialize tokenizer and model
# Both are resolved from `model_name`; the model gets a token-classification
# head with one output per entry of `label_list`, and the id<->label maps are
# passed along to from_pretrained.
print("Initializing BERT tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, 
    num_labels=len(label_list), 
    id2label=id2label, 
    label2id=label2id
)

print(f"✓ Tokenizer loaded: {tokenizer.__class__.__name__}")
print(f"✓ Model loaded with {model.num_labels} labels")
print(f"  Vocabulary size: {tokenizer.vocab_size}")

# Test tokenization
# Quick sanity check of how the tokenizer splits an Italian sentence
sample_text = "Il servizio era eccellente e il cibo delizioso."
tokens = tokenizer.tokenize(sample_text)
print(f"\nSample tokenization: {tokens}...")

Initializing BERT tokenizer and model...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Tokenizer loaded: BertTokenizerFast
✓ Model loaded with 3 labels
  Vocabulary size: 31102

Sample tokenization: ['il', 'servizio', 'era', 'eccellente', 'e', 'il', 'cibo', 'delizioso', '.']...


## BIO Tag Generation for Training Data

In [10]:
def create_ner_tags(text, terms, tokenizer, label2id):
    """
    Create BIO tag ids for the tokens of `text` from a list of gold terms.

    Every case-insensitive occurrence of every term is located in `text` and
    projected onto the tokens whose character offsets overlap it. Longer terms
    are placed first. An occurrence is skipped entirely when any of its tokens
    already carries a tag, because BIO cannot encode overlapping spans and a
    partial write would leave an I-TERM without its B-TERM or split a longer
    term in two.

    Args:
        text: Raw sentence text.
        terms: Term strings to look for; empty, None and whitespace-only
            entries are ignored.
        tokenizer: Object providing `tokenize(text)` and being callable as
            `tokenizer(text, return_offsets_mapping=True,
            add_special_tokens=False)`, returning a mapping whose
            'offset_mapping' holds one (start, end) character pair per token.
        label2id: Mapping with the keys 'O', 'B-TERM' and 'I-TERM'.

    Returns:
        (tokens, ner_tag_ids): the token strings and a parallel list of
        label ids.
    """
    tokens = tokenizer.tokenize(text)
    ner_tags = ['O'] * len(tokens)

    # Offsets depend only on `text`, so compute them once rather than once
    # per term occurrence.
    encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    offsets = encoding['offset_mapping']

    normalized_text = text.lower()

    # Sort terms by length (descending) so the longest term claims its tokens first
    usable_terms = [term for term in terms if term and term.strip()]
    sorted_terms = sorted(usable_terms, key=len, reverse=True)

    for term in sorted_terms:
        normalized_term = term.lower()

        search_from = 0
        while True:
            start_char = normalized_text.find(normalized_term, search_from)
            if start_char == -1:
                break
            end_char = start_char + len(normalized_term)
            # Advance by one character so overlapping occurrences are found too
            search_from = start_char + 1

            # Tokens whose character span intersects the occurrence
            overlapping = [
                i for i, (token_start, token_end) in enumerate(offsets)
                if token_start < end_char and token_end > start_char and i < len(ner_tags)
            ]
            if not overlapping:
                continue

            span = range(overlapping[0], overlapping[-1] + 1)

            # Longest-first priority: leave spans that are already (partly) tagged alone
            if any(ner_tags[i] != 'O' for i in span):
                continue

            ner_tags[span[0]] = 'B-TERM'
            for i in span[1:]:
                ner_tags[i] = 'I-TERM'

    ner_tag_ids = [label2id[tag] for tag in ner_tags]
    return tokens, ner_tag_ids


print("✓ BIO tag generation function defined")

✓ BIO tag generation function defined


## Process Training and Dev Data with BIO Tags

In [22]:
def _add_bio_tags(sentences, header, split_label, log_every):
    """Attach 'tokens' and 'ner_tags' to every sentence dict in place.

    Prints `header` first, a progress line every `log_every` sentences, and a
    summary line naming `split_label` at the end.
    """
    print(header)
    total = len(sentences)
    for i, entry in enumerate(sentences):
        entry['tokens'], entry['ner_tags'] = create_ner_tags(
            entry['sentence_text'], entry['terms'], tokenizer, label2id
        )
        if i % log_every == 0:
            print(f"  Processed {i}/{total}")
    print(f"✓ {split_label} data processed: {total} sentences")


# Same tagging routine for both splits; only the messages and logging interval differ
_add_bio_tags(train_sentences, "Processing training data...", "Training", log_every=1000)
_add_bio_tags(dev_sentences, "\nProcessing dev data...", "Dev", log_every=200)

# Show the token/tag alignment for one training sentence
sample_sentence = train_sentences[6]
print(f"\nSample train sentence:")
print(f"  Text: {sample_sentence['sentence_text']}")
print(f"  Terms: {sample_sentence['terms']}")
token_tags = [
    (token, id2label[tag])
    for token, tag in zip(sample_sentence['tokens'], sample_sentence['ner_tags'])
]
print(f"\n{pd.DataFrame(token_tags, columns=['Token', 'Tag']).to_markdown()}")

Processing training data...
  Processed 0/2308
  Processed 1000/2308
  Processed 2000/2308
✓ Training data processed: 2308 sentences

Processing dev data...
  Processed 0/577
  Processed 200/577
  Processed 400/577
✓ Dev data processed: 577 sentences

Sample train sentence:
  Text: AFFIDAMENTO DEL “SERVIZIO DI SPAZZAMENTO, RACCOLTA, TRASPORTO E SMALTIMENTO/RECUPERO DEI RIFIUTI URBANI ED ASSIMILATI E SERVIZI COMPLEMENTARI DELLA CITTA' DI AGROPOLI” VALEVOLE PER UN QUINQUENNIO
  Terms: ['raccolta', 'recupero', 'servizio di raccolta', 'servizio di spazzamento', 'smaltimento', 'trasporto']

|    | Token         | Tag    |
|---:|:--------------|:-------|
|  0 | affidamento   | O      |
|  1 | del           | O      |
|  2 | “             | O      |
|  3 | servizio      | B-TERM |
|  4 | di            | I-TERM |
|  5 | spa           | I-TERM |
|  6 | ##zzamento    | I-TERM |
|  7 | ,             | O      |
|  8 | raccolta      | B-TERM |
|  9 | ,             | O      |
| 10 | trasporto     | 

## Prepare Dataset for BERT Training

In [12]:
class TokenClassificationDataset(Dataset):
    """Torch dataset pairing raw sentences with per-token label ids.

    Each item is tokenized on access, padded/truncated to `max_length`, and
    returned with a label tensor of the same length in which special and
    padding positions hold -100.
    """

    def __init__(self, texts, tokens, labels, tokenizer, max_length=512):
        self.texts, self.tokens, self.labels = texts, tokens, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        token_labels = self.labels[idx]

        encoded = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()

        # Every position starts as -100 (ignored by the loss); only real
        # tokens that have a source label are overwritten below.
        labels = torch.full((self.max_length,), -100, dtype=torch.long)

        ignored_ids = (self.tokenizer.sep_token_id, self.tokenizer.pad_token_id)
        for position, token_id in enumerate(input_ids.tolist()):
            # Position 0 is the CLS slot; SEP and PAD ids are skipped wherever they occur
            if position == 0 or token_id in ignored_ids:
                continue
            # Labels were produced without special tokens, hence the shift by one
            source_idx = position - 1
            if source_idx < len(token_labels):
                labels[position] = token_labels[source_idx]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


print("✓ Custom dataset class defined")

✓ Custom dataset class defined


In [23]:
# Wrap the tagged sentences of each split in a TokenClassificationDataset
print("Creating training datasets...")


def _as_dataset(sentences):
    """Build a TokenClassificationDataset from processed sentence dicts."""
    return TokenClassificationDataset(
        texts=[sentence['sentence_text'] for sentence in sentences],
        tokens=[sentence['tokens'] for sentence in sentences],
        labels=[sentence['ner_tags'] for sentence in sentences],
        tokenizer=tokenizer
    )


train_dataset = _as_dataset(train_sentences)
dev_dataset = _as_dataset(dev_sentences)

print(f"✓ Training dataset: {len(train_dataset)} examples")
print(f"✓ Dev dataset: {len(dev_dataset)} examples")

Creating training datasets...
✓ Training dataset: 2308 examples
✓ Dev dataset: 577 examples


## Configure Training Arguments

In [26]:
# Setup data collator for token classification
# Data collator is used to dynamically pad inputs and labels
# NOTE(review): TokenClassificationDataset already pads every example to
# max_length (padding='max_length'), so there is presumably nothing left for
# the collator to pad — confirm whether both are intended.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)
print("✓ Data collator initialized")

✓ Data collator initialized


In [31]:
# Define training arguments
# Checkpoints go to `output_model_dir`; evaluation and saving both happen once
# per epoch.
training_args = TrainingArguments(
    output_dir=output_model_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so "best" presumably
    # falls back to the evaluation loss — confirm.
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_steps=100,
    save_total_limit=2,
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    report_to="none"
)

# Echo the main hyperparameters so they are visible in the recorded output
print("✓ Training configuration ready")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Learning rate: {training_args.learning_rate}")

✓ Training configuration ready
  Batch size: 16
  Epochs: 3
  Learning rate: 2e-05


## Train BERT Model

Note: This cell might take several minutes to run.


**Additional configurations to test**
- Aggregate training samples per paragraph/document
- Change hyperparameters (*learning_rate*, *batch_size*, *num_train_epochs*, *weight_decay*)

In [32]:
# Initialize Trainer
# Combines the model, the training arguments, both datasets, the tokenizer and
# the data collator into a single Trainer object.
# NOTE(review): newer transformers releases are reported to deprecate the
# `tokenizer=` argument in favour of `processing_class=` — confirm against the
# installed version before changing it.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("✓ Trainer initialized")
print(f"  Training samples: {len(train_dataset)}")
print(f"  Evaluation samples: {len(dev_dataset)}")

Initializing Trainer...
✓ Trainer initialized
  Training samples: 2308
  Evaluation samples: 577


In [33]:
# Start training
# NOTE(review): the output recorded for this cell is an NCCL RuntimeError, so
# the run stored in this notebook did not finish. Presumably a multi-GPU
# communication problem — confirm (e.g. by exposing a single GPU) before
# relying on the cells below.
print("="*60)
print("Starting model training...")
print("="*60)

# `time` is imported with the other modules in the first cell
training_start_time = time.time()

train_result = trainer.train()

training_duration = time.time() - training_start_time

print("\n" + "="*60)
print("✓ TRAINING COMPLETED!")
print("="*60)
print(f"Training time: {training_duration/60:.2f} minutes")

Starting model training...


RuntimeError: NCCL Error 2: unhandled system error (run with NCCL_DEBUG=INFO for details)

## Save Trained Model

In [None]:
# Save the trained model
# Model and tokenizer are written to the same directory so the next cell can
# load both back from `output_model_dir`.
print("Saving trained model...")

os.makedirs(output_model_dir, exist_ok=True)
trainer.save_model(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print(f"✓ Model saved to: {output_model_dir}")

## Load Trained Model for Inference

In [None]:
# Load the trained model for inference
# Reads model and tokenizer back from `output_model_dir` instead of reusing
# the in-memory training objects, then switches the model to eval mode.
print("Loading trained model for inference...")

inference_model = AutoModelForTokenClassification.from_pretrained(output_model_dir)
inference_tokenizer = AutoTokenizer.from_pretrained(output_model_dir)
inference_model.eval()

print(f"✓ Model loaded from: {output_model_dir}")

## Inference Function and Dev Set Predictions

In [None]:
def perform_inference(model, tokenizer, text, id2label, lowercase=True):
    """Predict the terms contained in a single text.

    The text is tokenized with character offsets, each token is assigned its
    highest-scoring label, and consecutive B-TERM/I-TERM tokens are merged into
    character spans. Terms are then cut out of the original `text` rather than
    rebuilt from token strings, so accents, apostrophes and punctuation inside
    a term keep their original form.

    Args:
        model: Callable token-classification model whose output exposes
            `.logits`. If it has a `device` attribute, the inputs are moved
            there first.
        tokenizer: Tokenizer that supports `return_offsets_mapping=True`.
        text: Text to analyse.
        id2label: Mapping from predicted class id to 'O', 'B-TERM' or 'I-TERM'.
        lowercase: Lowercase the extracted terms (default True).

    Returns:
        List of distinct predicted term strings in order of first appearance.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True
    )

    # Offsets are needed to recover the surface text but are not a model input
    offsets = inputs['offset_mapping'][0].tolist()
    model_inputs = {k: v for k, v in inputs.items() if k != 'offset_mapping'}

    # Keep inputs and weights on the same device when the model reports one
    device = getattr(model, 'device', None)
    if device is not None:
        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
    # argmax over logits equals argmax over their softmax
    predicted_ids = logits.argmax(dim=-1)[0].tolist()

    # Merge BIO labels into [start_char, end_char] spans
    spans = []
    current_span = None
    for (token_start, token_end), label_id in zip(offsets, predicted_ids):
        # NOTE(review): assumes special/padding tokens are reported with an
        # empty offset such as (0, 0) — confirm for the tokenizer in use.
        if token_start == token_end:
            continue

        label = id2label[label_id]
        if label == 'B-TERM':
            if current_span is not None:
                spans.append(current_span)
            current_span = [token_start, token_end]
        elif label == 'I-TERM' and current_span is not None:
            current_span[1] = token_end
        else:
            # 'O', or an I-TERM with no open term, closes whatever is open
            if current_span is not None:
                spans.append(current_span)
                current_span = None

    if current_span is not None:
        spans.append(current_span)

    # Slice the terms out of the text, normalise whitespace, drop duplicates
    predicted_terms = []
    seen = set()
    for start_char, end_char in spans:
        term = " ".join(text[start_char:end_char].split())
        if lowercase:
            term = term.lower()
        if term and term not in seen:
            seen.add(term)
            predicted_terms.append(term)

    return predicted_terms


print("✓ Inference function defined")

In [None]:
# Predict terms for every dev sentence, reporting progress every 200 sentences
print("Running inference on dev set...")


def _predict_all(sentences):
    """Return one list of predicted terms per sentence, in input order."""
    total = len(sentences)
    collected = []
    for position, sentence in enumerate(sentences):
        if position % 200 == 0:
            print(f"  Processing {position}/{total}")
        collected.append(
            perform_inference(
                inference_model,
                inference_tokenizer,
                sentence['sentence_text'],
                id2label
            )
        )
    return collected


bert_preds = _predict_all(dev_sentences)

print(f"✓ Inference completed: {len(bert_preds)} predictions")

In [None]:
# Gold terms per dev sentence, in the same order as bert_preds
dev_gold = [sentence['terms'] for sentence in dev_sentences]

# Score the predictions with both metrics defined earlier in the notebook
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, bert_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, bert_preds)

banner = "=" * 60
report_lines = [
    "\n" + banner,
    "BERT TOKEN CLASSIFICATION RESULTS",
    banner,
    "\nMicro-averaged Metrics:",
    f"  Precision: {precision:.4f}",
    f"  Recall:    {recall:.4f}",
    f"  F1 Score:  {f1:.4f}",
    f"  TP={tp}, FP={fp}, FN={fn}",
    "\nType-level Metrics:",
    f"  Type Precision: {type_precision:.4f}",
    f"  Type Recall:    {type_recall:.4f}",
    f"  Type F1 Score:  {type_f1:.4f}",
    banner,
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Save predictions in competition format
def save_predictions(predictions, sentences, output_path):
    """Write predictions to `output_path` as a JSON document.

    `predictions` and `sentences` are parallel sequences. Each output record
    copies the sentence's document_id, paragraph_id and sentence_id and stores
    the matching prediction under 'term_list'; all records are wrapped in a
    top-level 'data' list. The parent directory is created if needed.
    """
    id_fields = ('document_id', 'paragraph_id', 'sentence_id')

    records = []
    for pred, sent in zip(predictions, sentences):
        record = {field: sent[field] for field in id_fields}
        record['term_list'] = pred
        records.append(record)
    output = {'data': records}

    # A bare file name has no directory part; fall back to the current directory
    target_dir = os.path.dirname(output_path) or '.'
    os.makedirs(target_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved {len(predictions)} predictions to {output_path}")


# Persist the dev-set predictions under predictions/ (relative to the working directory)
save_predictions(bert_preds, dev_sentences, 'predictions/subtask_a_dev_bert_token_classification_preds.json')

## Example Predictions

In [None]:
# Print the first five dev sentences that have at least one gold term,
# with gold vs. predicted terms and a per-sentence error breakdown
print("Example Predictions:\n")

count = 0
for sentence, gold_terms, predicted_terms in zip(dev_sentences, dev_gold, bert_preds):
    if count >= 5:
        break
    if len(gold_terms) == 0:
        continue

    print(f"Sentence: {sentence['sentence_text'][:100]}...")
    print(f"Gold terms: {gold_terms[:5]}")
    print(f"BERT predictions: {predicted_terms[:5]}")

    gold_set, predicted_set = set(gold_terms), set(predicted_terms)
    correct = gold_set & predicted_set
    missed = gold_set - predicted_set
    wrong = predicted_set - gold_set

    print(f"✓ Correct: {len(correct)}")
    print(f"✗ Missed: {len(missed)}")
    print(f"✗ Wrong: {len(wrong)}")
    print("-"*80)
    print()

    count += 1