# BERT-Based Term Clustering

This notebook demonstrates BERT-based approaches to term clustering:
1. **Baseline**: Similarity-based clustering using BERT embeddings
2. **Advanced**: K-means clustering with BERT contextual embeddings

Uses an Italian BERT model (`dbmdz/bert-base-italian-uncased`) to generate contextualized term representations.

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed) - Subtask B

## Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from collections import defaultdict
from typing import List, Dict, Tuple

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, v_measure_score
from sklearn.metrics import homogeneity_score, completeness_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Seed both RNG sources with the same value so repeated runs are comparable
SEED = 42
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(SEED)

# Report the environment the notebook is running in
print("Setup complete")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


Setup complete
PyTorch version: 2.9.0+cpu
CUDA available: False


## Load BERT Model

Using Italian BERT for contextual embeddings.

In [3]:
# Checkpoint used for every embedding computed in this notebook
model_name = "dbmdz/bert-base-italian-uncased"

print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pick the GPU when one is visible, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Inference only: put the encoder in evaluation mode, then place it on the device
bert_model = AutoModel.from_pretrained(model_name)
bert_model.eval()
bert_model = bert_model.to(device)

print(f"Model loaded: {bert_model.__class__.__name__}")
print(f"Device: {device}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

Loading model: dbmdz/bert-base-italian-uncased
Model loaded: BertModel
Device: cpu
Tokenizer vocab size: 31102
Model loaded: BertModel
Device: cpu
Tokenizer vocab size: 31102


## Load Training and Dev Data

In [5]:
# Locations of the Subtask B splits
data_path = "../data"
train_path = os.path.join(data_path, "subtask_b_train.csv")
dev_path = os.path.join(data_path, "subtask_b_dev.csv")

# Training split
train_df = pd.read_csv(train_path)
print(f"Training data shape: {train_df.shape}")
print(f"Number of unique clusters: {train_df['cluster'].nunique()}")
print(f"Number of terms: {len(train_df)}")

# Dev split
dev_df = pd.read_csv(dev_path)
print(f"\nDev data shape: {dev_df.shape}")
print(f"Number of unique clusters in dev: {dev_df['cluster'].nunique()}")
print(f"Number of terms in dev: {len(dev_df)}")

# Lower-cased term -> cluster id (a later duplicate term overwrites an earlier one)
term_to_cluster = dict(zip(train_df['term'].str.lower(), train_df['cluster']))

# Inverse view, cluster id -> member terms, used only for inspection
cluster_to_terms = defaultdict(list)
for term, cluster in term_to_cluster.items():
    cluster_to_terms[cluster].append(term)

# Show the five lowest-numbered clusters as a sanity check
print("\nExample clusters from training:")
for cluster_id in sorted(cluster_to_terms)[:5]:
    print(f"  Cluster {cluster_id}: {cluster_to_terms[cluster_id]}")

Training data shape: (713, 2)
Number of unique clusters: 299
Number of terms: 713

Dev data shape: (242, 2)
Number of unique clusters in dev: 147
Number of terms in dev: 242

Example clusters from training:
  Cluster 0: ['biodegradabili']
  Cluster 1: ['cassonetti', 'contenitori stradali', 'cassonetti stradali', 'postazioni stradali']
  Cluster 2: ['separare i rifiuti']
  Cluster 3: ['deposito', 'conferimento', 'conferimento delle frazioni', 'conferimento dei rifiuti', 'conferimenti', 'operazioni di conferimento']
  Cluster 4: ['conferiti', 'conferito', 'conferita']


## BERT Embedding Functions

In [6]:
def get_bert_embedding(text: str, model, tokenizer, device) -> np.ndarray:
    """
    Embed a text as the last-layer hidden state at sequence position 0.

    Args:
        text: Text to encode.
        model: Callable returning an object with a `last_hidden_state` tensor.
        tokenizer: Callable producing a mapping of input tensors for `model`.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array: the position-0 ([CLS]) vector of the first sequence.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # No gradients are needed for feature extraction
    with torch.no_grad():
        last_hidden = model(**on_device).last_hidden_state
        first_token = last_hidden[:, 0, :].cpu().numpy()

    # Drop the batch axis: only one text was encoded
    return first_token[0]


def get_bert_mean_pooling(text: str, model, tokenizer, device) -> np.ndarray:
    """
    Embed a text as the attention-masked mean of its last-layer token vectors.

    Args:
        text: Text to encode.
        model: Callable returning an object with a `last_hidden_state` tensor.
        tokenizer: Callable producing input tensors including `attention_mask`.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array: the mean over unmasked positions of the first sequence.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden = model(**on_device).last_hidden_state

        # Broadcast the 0/1 mask across the hidden dimension so padded positions contribute nothing
        weights = on_device['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        masked_total = (hidden * weights).sum(dim=1)
        # Clamp guards against a division by zero for an all-masked sequence
        token_count = weights.sum(dim=1).clamp(min=1e-9)
        pooled = (masked_total / token_count).cpu().numpy()

    # Drop the batch axis: only one text was encoded
    return pooled[0]


# Smoke test: embed one short phrase and inspect the resulting vector
test_text = "raccolta differenziata"
test_embedding = get_bert_embedding(test_text, bert_model, tokenizer, device)
print(" BERT embedding functions work!!")
print(f"  Test text: '{test_text}'")
print(f"  Embedding shape: {test_embedding.shape}")
print(f"  Embedding sample: {test_embedding[:5]}")

 BERT embedding functions work!!
  Test text: 'raccolta differenziata'
  Embedding shape: (768,)
  Embedding sample: [ 0.12397581  0.08731506  0.41113496  0.20128354 -0.12553388]


## Baseline: BERT Similarity-Based Clustering

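For each dev term, find the most similar training term using BERT embeddings (cosine similarity) and assign the dev term to that training term's cluster. Dev terms that already appear in the training set reuse their training cluster directly; terms whose best similarity falls below the threshold are labelled `-1` (unknown).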

In [7]:
class BERTBaselineClustering:
    """Nearest-neighbour cluster assignment over BERT term embeddings.

    `fit` stores one embedding per lower-cased training term. A query term is
    assigned the cluster of the stored term with the highest cosine similarity,
    or -1 when that similarity is below `similarity_threshold`.
    """

    def __init__(self, bert_model, tokenizer, device, similarity_threshold=0.7, use_mean_pooling=False):
        """
        Args:
            bert_model: Encoder passed through to the embedding helpers.
            tokenizer: Tokenizer passed through to the embedding helpers.
            device: Device passed through to the embedding helpers.
            similarity_threshold: Minimum cosine similarity for a match to count.
            use_mean_pooling: Use mean pooling instead of the [CLS] vector.
        """
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.device = device
        self.similarity_threshold = similarity_threshold
        self.use_mean_pooling = use_mean_pooling
        self.term_embeddings = {}   # lower-cased term -> embedding vector
        self.term_to_cluster = {}   # lower-cased term -> cluster id

    def _get_embedding(self, text: str) -> np.ndarray:
        """Get BERT embedding for text using the configured pooling strategy."""
        if self.use_mean_pooling:
            return get_bert_mean_pooling(text, self.bert_model, self.tokenizer, self.device)
        else:
            return get_bert_embedding(text, self.bert_model, self.tokenizer, self.device)

    def fit(self, terms: List[str], clusters: List[int]):
        """Precompute and store embeddings for the training terms.

        Terms are lower-cased first; a term seen more than once keeps the
        cluster and embedding of its last occurrence.
        """
        print("Computing BERT embeddings for training terms...")
        for term, cluster in tqdm(zip(terms, clusters), total=len(terms)):
            term_lower = term.lower()
            self.term_to_cluster[term_lower] = cluster
            self.term_embeddings[term_lower] = self._get_embedding(term_lower)

        print(f"Built index with {len(self.term_embeddings)} terms")

    def _build_index(self) -> Tuple[List[int], np.ndarray]:
        """Return (cluster ids, row-normalised embedding matrix) in stored-term order."""
        if not self.term_embeddings:
            return [], np.zeros((0, 0), dtype=np.float64)

        clusters = [self.term_to_cluster[term] for term in self.term_embeddings]
        matrix = np.stack([
            np.asarray(embedding, dtype=np.float64).ravel()
            for embedding in self.term_embeddings.values()
        ])
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows untouched instead of dividing by zero
        norms[norms == 0.0] = 1.0
        return clusters, matrix / norms

    def _predict_with_index(self, term: str, clusters: List[int], matrix: np.ndarray) -> Tuple[int, float]:
        """Predict one term against an index produced by `_build_index`."""
        term_lower = term.lower()

        # A term already seen in training keeps its training cluster
        if term_lower in self.term_to_cluster:
            return self.term_to_cluster[term_lower], 1.0

        if not clusters:
            return -1, 0.0

        query = np.asarray(self._get_embedding(term_lower), dtype=np.float64).ravel()
        query_norm = np.linalg.norm(query)
        if query_norm > 0.0:
            query = query / query_norm

        # One matrix-vector product gives the cosine similarity to every stored term
        similarities = matrix @ query
        best_index = int(np.argmax(similarities))  # first maximum wins on ties
        best_similarity = float(similarities[best_index])

        # Only strictly positive similarities count as a candidate match
        if not best_similarity > 0.0:
            return -1, 0.0

        if best_similarity < self.similarity_threshold:
            return -1, best_similarity

        return clusters[best_index], best_similarity

    def predict_one(self, term: str) -> Tuple[int, float]:
        """Predict cluster for a single term.

        Returns:
            (cluster id, similarity). The cluster id is -1 when no stored term
            reaches `similarity_threshold`; the similarity is 1.0 for a term
            that is present in the training index.
        """
        clusters, matrix = self._build_index()
        return self._predict_with_index(term, clusters, matrix)

    def predict(self, terms: List[str]) -> Tuple[List[int], List[float]]:
        """Predict clusters for multiple terms.

        Returns:
            Two parallel lists: predicted cluster ids and best similarities.
        """
        predictions = []
        similarities = []

        # The index does not change between queries, so build it once
        clusters, matrix = self._build_index()

        for term in tqdm(terms, desc="Predicting clusters"):
            cluster, sim = self._predict_with_index(term, clusters, matrix)
            predictions.append(cluster)
            similarities.append(sim)

        return predictions, similarities

    def save(self, path: str):
        """Pickle the threshold, pooling flag, embeddings and cluster map to `path`."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        model_data = {
            'similarity_threshold': self.similarity_threshold,
            'use_mean_pooling': self.use_mean_pooling,
            'term_embeddings': self.term_embeddings,
            'term_to_cluster': self.term_to_cluster
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Restore the state written by `save` (the BERT model itself is not stored)."""
        # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        self.similarity_threshold = model_data['similarity_threshold']
        self.use_mean_pooling = model_data['use_mean_pooling']
        self.term_embeddings = model_data['term_embeddings']
        self.term_to_cluster = model_data['term_to_cluster']
        print(f"Model loaded from {path}")


# Smoke test on a three-term toy index
test_terms_list, test_clusters_list = ['rifiuti', 'carta', 'plastica'], [37, 74, 43]
test_model = BERTBaselineClustering(bert_model, tokenizer, device, similarity_threshold=0.7)
test_model.fit(test_terms_list, test_clusters_list)

# 'spazzatura' is not in the toy index, so this exercises the similarity search
pred_cluster, pred_sim = test_model.predict_one('spazzatura')
print("\n Baseline model works!!")
print(f"  'spazzatura' -> Cluster {pred_cluster} (similarity: {pred_sim:.3f})")

Computing BERT embeddings for training terms...


100%|██████████| 3/3 [00:00<00:00, 18.72it/s]

Built index with 3 terms

 Baseline model works!!
  'spazzatura' -> Cluster 37 (similarity: 0.956)





### Train and Evaluate Baseline Model

In [8]:
# Initialize baseline model
baseline_model = BERTBaselineClustering(
    bert_model,
    tokenizer,
    device,
    similarity_threshold=0.7,
    use_mean_pooling=False  # Use CLS token
)

# Fit on training data
baseline_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict on dev set
dev_terms = dev_df['term'].tolist()
baseline_preds, baseline_sims = baseline_model.predict(dev_terms)

# Store results
dev_df['baseline_cluster'] = baseline_preds
dev_df['baseline_similarity'] = baseline_sims

# A dev term is an exact match only when it is present in the training index.
# Counting `similarity >= 0.99` instead would also include distinct terms whose
# embeddings happen to be almost identical.
is_exact = [term.lower() in baseline_model.term_to_cluster for term in dev_terms]
fuzzy_sims = [sim for sim, exact in zip(baseline_sims, is_exact) if not exact]

print(f"\nPrediction statistics:")
print(f"  Total terms: {len(baseline_preds)}")
print(f"  Terms with exact match: {sum(is_exact)}")
print(f"  Terms with high similarity (≥0.85): {sum(1 for s in fuzzy_sims if s >= 0.85)}")
print(f"  Terms with medium similarity (0.7-0.85): {sum(1 for s in fuzzy_sims if 0.7 <= s < 0.85)}")
print(f"  Terms below threshold (unknown): {sum(1 for p in baseline_preds if p == -1)}")

Computing BERT embeddings for training terms...


100%|██████████| 713/713 [00:15<00:00, 47.07it/s]
100%|██████████| 713/713 [00:15<00:00, 47.07it/s]


Built index with 713 terms


Predicting clusters: 100%|██████████| 242/242 [00:12<00:00, 18.81it/s]


Prediction statistics:
  Total terms: 242
  Terms with exact match: 140
  Terms with high similarity (≥0.85): 100
  Terms with medium similarity (0.7-0.85): 2
  Terms below threshold (unknown): 0





## Save Baseline Model and Predictions

In [None]:
# Save baseline model
baseline_model.save('models/bert_clustering_baseline.pkl')

# Save predictions. Only the term and its predicted cluster id are written;
# the similarity score is kept in dev_df but left out of the predictions file.
output_path_baseline = "predictions/subtask_b_dev_bert_baseline_preds.csv"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(output_path_baseline) or '.', exist_ok=True)
dev_df[['term', 'baseline_cluster']].to_csv(output_path_baseline, index=False)
print(f"\nBaseline predictions saved to {output_path_baseline}")

Model saved to models/bert_clustering_baseline.pkl
Model saved to models/bert_clustering_kmeans.pkl

Baseline predictions saved to predictions/subtask_b_dev_bert_baseline_preds.csv
K-means predictions saved to predictions/subtask_b_dev_bert_kmeans_preds.csv


### Evaluate Baseline Model

In [10]:
import numpy as np
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads data from a CSV or JSON file and returns a dictionary
  where keys are terms and values are cluster_ids.

  For CSV input the first column is read as the term and the second as the
  cluster id (cast to int); any further columns are ignored, so a predictions
  file may carry extra columns such as a similarity score. For JSON input the
  file must contain a top-level "data" list of objects with "term" and
  "cluster" keys; cluster values are returned as stored.

  Args:
    file_path: The path to the input file (CSV or JSON).

  Returns:
    A dictionary containing the loaded data.

  Raises:
    ValueError: If the file format is not supported, or a CSV file has fewer
      than two columns.
  """
  if file_path.endswith('.csv'):
    # Load data from CSV file
    df = pd.read_csv(file_path)
    if df.shape[1] < 2:
      raise ValueError("CSV file must contain at least two columns: term and cluster.")
    # Select by position: column names differ between gold and prediction files
    terms = df.iloc[:, 0]
    clusters = df.iloc[:, 1]
    data = {term: int(cluster) for term, cluster in zip(terms, clusters)}
  elif file_path.endswith('.json'):
    # Load data from JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    # Extract terms from JSON data
    data = {item["term"]: item["cluster"] for item in json_data["data"]}
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

class BCubed_calculator:
  """Per-item B-cubed precision and recall for a gold and a predicted clustering.

  Both `gold` and `pred` map an item to its cluster id.
  """

  def __init__(self, gold, pred):
    self.gold = gold
    self.pred = pred
    # cluster id -> set of items assigned to it
    self.gold_cluster = defaultdict(set)
    self.pred_cluster = defaultdict(set)
    for item, clus_id in gold.items():
      self.gold_cluster[clus_id].add(item)
    for item, clus_id in pred.items():
      self.pred_cluster[clus_id].add(item)

  def _shared(self, pred_id, gold_id):
    """Return (predicted cluster, gold cluster) item sets without inserting new keys."""
    # .get avoids the defaultdict side effect of creating empty clusters on lookup
    return self.pred_cluster.get(pred_id, set()), self.gold_cluster.get(gold_id, set())

  def bc_precision_item(self, item):
    """Fraction of the item's predicted cluster that shares its gold cluster."""
    pred_items, gold_items = self._shared(self.pred[item], self.gold.get(item, None))
    TP = len(pred_items.intersection(gold_items))
    FP = len(pred_items) - TP
    return TP/(FP + TP)

  def bc_recall_item(self, item):
    """Fraction of the item's gold cluster that shares its predicted cluster."""
    pred_items, gold_items = self._shared(self.pred.get(item, None), self.gold.get(item))
    TP = len(pred_items.intersection(gold_items))
    FN = len(gold_items) - TP
    return TP/(TP + FN)

def bcubed_precision(gold, pred):
  """Average B-cubed precision over all predicted items (0.0 if there are none)."""
  calc = BCubed_calculator(gold, pred)
  if not calc.pred:
    return 0.0
  return np.average([calc.bc_precision_item(item) for item in calc.pred])

def bcubed_recall(gold, pred):
  """Average B-cubed recall over all gold items (0.0 if there are none)."""
  calc = BCubed_calculator(gold, pred)
  if not calc.gold:
    return 0.0
  return np.average([calc.bc_recall_item(item) for item in calc.gold])

def bcubed_f1(gold, pred):
  """Harmonic mean of B-cubed precision and recall (0.0 when both are zero)."""
  # Compute each metric once instead of rebuilding the calculator for every use
  precision = bcubed_precision(gold, pred)
  recall = bcubed_recall(gold, pred)
  denominator = precision + recall
  # `not > 0` also covers NaN, so a degenerate input never yields NaN
  if not denominator > 0:
    return 0.0
  return 2 * precision * recall / denominator

In [11]:
# Score the saved baseline predictions against the dev gold standard
preds = load_data("predictions/subtask_b_dev_bert_baseline_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
for label, metric in (
    ("Precision", bcubed_precision),
    ("Recall", bcubed_recall),
    ("F1", bcubed_f1),
):
    print(f"BCubed {label}: {metric(gold, preds):.4f}")

BCubed Precision: 0.7428
BCubed Recall: 0.8621
BCubed F1: 0.7980


## K-Means with BERT Embeddings

Train K-means clustering on BERT embeddings of training terms.

In [12]:
class BERTKMeansClustering:
    """K-means over BERT term embeddings, with learned clusters relabelled to gold ids.

    After fitting, every K-means cluster is mapped to the gold cluster id that
    occurs most often among its training members; prediction returns that id.
    """

    # Attributes written by save() and restored by load(), in this order
    _PERSISTED = ('n_clusters', 'use_mean_pooling', 'kmeans', 'cluster_mapping')

    def __init__(self, bert_model, tokenizer, device, n_clusters=None, use_mean_pooling=False):
        self.bert_model, self.tokenizer, self.device = bert_model, tokenizer, device
        self.n_clusters = n_clusters
        self.use_mean_pooling = use_mean_pooling
        self.kmeans = None
        self.cluster_mapping = {}  # learned cluster id -> gold cluster id

    def _get_embedding(self, text: str) -> np.ndarray:
        """Embed `text` with the pooling strategy selected at construction."""
        if self.use_mean_pooling:
            return get_bert_mean_pooling(text, self.bert_model, self.tokenizer, self.device)
        return get_bert_embedding(text, self.bert_model, self.tokenizer, self.device)

    def fit(self, terms: List[str], true_clusters: List[int]):
        """Fit K-means on the embeddings of `terms` and build the learned-to-gold map.

        When `n_clusters` was not given, the number of distinct gold clusters is used.
        """
        print("Computing BERT embeddings for training terms...")
        vectors = []
        for term in tqdm(terms):
            vectors.append(self._get_embedding(term.lower()))
        embeddings = np.array(vectors, dtype=np.float64)
        print(f"Embeddings shape: {embeddings.shape}")

        if self.n_clusters is None:
            self.n_clusters = len(set(true_clusters))

        print(f"Training K-means with {self.n_clusters} clusters...")
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        learned_clusters = self.kmeans.fit_predict(embeddings)

        # Relabel each learned cluster with the majority gold id of its members
        self.cluster_mapping = {}
        for learned_id in range(self.n_clusters):
            member_positions = np.flatnonzero(learned_clusters == learned_id)
            if member_positions.size == 0:
                continue
            member_gold = [true_clusters[pos] for pos in member_positions]
            self.cluster_mapping[learned_id] = max(set(member_gold), key=member_gold.count)

        print(f"Training complete! Built mapping for {len(self.cluster_mapping)} clusters")

    def predict(self, terms: List[str]) -> List[int]:
        """Return the mapped gold cluster id for each term (-1 if the learned cluster is unmapped).

        Raises:
            RuntimeError: If called before `fit` (or `load`) has set a K-means model.
        """
        if self.kmeans is None:
            raise RuntimeError("Model not trained. Call fit() first.")

        predictions = []
        for term in tqdm(terms, desc="Predicting clusters"):
            features = np.array([self._get_embedding(term.lower())], dtype=np.float64)
            learned_cluster = self.kmeans.predict(features)[0]
            predictions.append(self.cluster_mapping.get(learned_cluster, -1))
        return predictions

    def save(self, path: str):
        """Pickle the cluster count, pooling flag, K-means model and mapping to `path`."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        model_data = {field: getattr(self, field) for field in self._PERSISTED}
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Restore the state written by `save`."""
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        for field in self._PERSISTED:
            setattr(self, field, model_data[field])
        print(f"Model loaded from {path}")


# Smoke test: three toy terms, one cluster each, then classify an unseen term
test_kmeans = BERTKMeansClustering(bert_model, tokenizer, device)
test_kmeans.fit(['rifiuti', 'carta', 'plastica'], [37, 74, 43])
test_pred = test_kmeans.predict(['spazzatura'])
print("\n K-means model works!!")
print(f"  'spazzatura' -> Cluster {test_pred[0]}")

Computing BERT embeddings for training terms...


100%|██████████| 3/3 [00:00<00:00, 32.78it/s]

Embeddings shape: (3, 768)
Training K-means with 3 clusters...





Training complete! Built mapping for 3 clusters


Predicting clusters: 100%|██████████| 1/1 [00:00<00:00, 41.84it/s]


 K-means model works!!
  'spazzatura' -> Cluster 37





### Train and Evaluate K-Means Model

In [13]:
# Initialize K-means model
kmeans_model = BERTKMeansClustering(
    bert_model, 
    tokenizer, 
    device,
    use_mean_pooling=False
)

# Fit on training data
kmeans_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict on dev set
kmeans_preds = kmeans_model.predict(dev_df['term'].tolist())

# Store results
dev_df['kmeans_cluster'] = kmeans_preds

print(f"\nPrediction statistics:")
print(f"  Total terms: {len(kmeans_preds)}")
print(f"  Terms with predictions: {sum(1 for p in kmeans_preds if p != -1)}")
print(f"  Terms without predictions (unknown): {sum(1 for p in kmeans_preds if p == -1)}")

Computing BERT embeddings for training terms...


100%|██████████| 713/713 [00:14<00:00, 48.95it/s]



Embeddings shape: (713, 768)
Training K-means with 299 clusters...
Training complete! Built mapping for 299 clusters
Training complete! Built mapping for 299 clusters


Predicting clusters: 100%|██████████| 242/242 [00:04<00:00, 50.06it/s]


Prediction statistics:
  Total terms: 242
  Terms with predictions: 242
  Terms without predictions (unknown): 0





## Save K-Means Model and Predictions

In [None]:
# Save K-means model
kmeans_model.save('models/bert_clustering_kmeans.pkl')

# Save predictions
output_path_kmeans = "predictions/subtask_b_dev_bert_kmeans_preds.csv"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(output_path_kmeans) or '.', exist_ok=True)
dev_df[['term', 'kmeans_cluster']].to_csv(output_path_kmeans, index=False)
print(f"K-means predictions saved to {output_path_kmeans}")

Model saved to models/bert_clustering_kmeans.pkl
K-means predictions saved to predictions/subtask_b_dev_bert_kmeans_preds.csv


### Evaluate K-Means Model

In [15]:
# Score the saved K-means predictions against the dev gold standard
preds = load_data("predictions/subtask_b_dev_bert_kmeans_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
for label, metric in (
    ("Precision", bcubed_precision),
    ("Recall", bcubed_recall),
    ("F1", bcubed_f1),
):
    print(f"BCubed {label}: {metric(gold, preds):.4f}")

BCubed Precision: 0.5389
BCubed Recall: 0.7742
BCubed F1: 0.6355


## Model Comparison

In [16]:
# Reload the gold standard and both prediction files from disk
gold = load_data("../data/subtask_b_dev.csv")
baseline_preds = load_data("predictions/subtask_b_dev_bert_baseline_preds.csv")
kmeans_preds = load_data("predictions/subtask_b_dev_bert_kmeans_preds.csv")

# BCubed precision / recall / F1 for the similarity baseline
baseline_precision = bcubed_precision(gold, baseline_preds)
baseline_recall = bcubed_recall(gold, baseline_preds)
baseline_f1 = bcubed_f1(gold, baseline_preds)

# BCubed precision / recall / F1 for K-means
kmeans_precision = bcubed_precision(gold, kmeans_preds)
kmeans_recall = bcubed_recall(gold, kmeans_preds)
kmeans_f1 = bcubed_f1(gold, kmeans_preds)

# One row per model, one column per metric
comparison_df = pd.DataFrame({
    'Model': ['Baseline (Similarity)', 'K-Means'],
    'BCubed Precision': [baseline_precision, kmeans_precision],
    'BCubed Recall': [baseline_recall, kmeans_recall],
    'BCubed F1': [baseline_f1, kmeans_f1],
})

separator = "=" * 80
print("\n" + separator)
print("MODEL COMPARISON - BCubed Metrics")
print(separator)
print(comparison_df.to_string(index=False))
print(separator)


def _relative_change(new_value, reference):
    """Percentage change of `new_value` over `reference`; 0 when the reference is not positive."""
    if reference > 0:
        return (new_value - reference) / reference * 100
    return 0


# Relative change of K-means with respect to the baseline, in percent
precision_improvement = _relative_change(kmeans_precision, baseline_precision)
recall_improvement = _relative_change(kmeans_recall, baseline_recall)
f1_improvement = _relative_change(kmeans_f1, baseline_f1)

print(f"\nK-Means vs Baseline:")
print(f"  BCubed Precision improvement: {precision_improvement:+.1f}%")
print(f"  BCubed Recall improvement: {recall_improvement:+.1f}%")
print(f"  BCubed F1 improvement: {f1_improvement:+.1f}%")


MODEL COMPARISON - BCubed Metrics
                Model  BCubed Precision  BCubed Recall  BCubed F1
Baseline (Similarity)          0.742788       0.862082   0.798001
              K-Means          0.538905       0.774183   0.635466

K-Means vs Baseline:
  BCubed Precision improvement: -27.4%
  BCubed Recall improvement: -10.2%
  BCubed F1 improvement: -20.4%
