# SpaCy Term Clustering

This notebook demonstrates two approaches to term clustering using SpaCy:
1. **Baseline**: Similarity-based clustering using SpaCy embeddings
2. **Advanced**: K-means clustering with SpaCy word vectors

Dataset: EVALITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed), Subtask B

## Setup and Imports
- For complete information about the Italian SpaCy models available: https://spacy.io/models/it
- We'll use `it_core_news_md` or `it_core_news_lg` for word vectors

In [8]:
#!python -m spacy download it_core_news_sm
#!python -m spacy download it_core_news_md
#!python -m spacy download it_core_news_lg

In [9]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
from typing import List, Dict, Tuple
import spacy
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, v_measure_score
from sklearn.metrics import homogeneity_score, completeness_score
from scipy.spatial.distance import cosine
from tqdm import tqdm

# Load an Italian model that ships with word vectors.
# Candidates are tried in order: the medium model first, then the large one.
MODEL_CANDIDATES = ('it_core_news_md', 'it_core_news_lg')

nlp = None
for model_name in MODEL_CANDIDATES:
    try:
        nlp = spacy.load(model_name)
    except OSError:
        # spacy.load raises OSError when the model package is not installed;
        # any other exception is a real problem and must not be hidden.
        continue
    print("✓ Italian model loaded successfully")
    print(f"  Model: {model_name}")
    print(f"  Vector dimensions: {nlp.vocab.vectors.shape}")
    break

if nlp is None:
    # Fail here with the install hint instead of leaving `nlp` undefined,
    # which would surface later as an unrelated NameError.
    raise OSError(
        "Model not found. Install with: python -m spacy download it_core_news_md\n"
        "Note: Small model (it_core_news_sm) doesn't have word vectors!"
    )

✓ Italian model loaded successfully
  Model: it_core_news_md
  Vector dimensions: (20000, 300)


## Load Training and Dev Data

In [10]:
# Locations of the Subtask B CSV files, relative to the notebook's working directory.
data_path = "../data"
train_path = os.path.join(data_path, "subtask_b_train.csv")
dev_path = os.path.join(data_path, "subtask_b_dev.csv")

# Load training data
train_df = pd.read_csv(train_path)
print(f"Training data shape: {train_df.shape}")
print(f"Number of unique clusters: {train_df['cluster'].nunique()}")
print(f"Number of terms: {len(train_df)}")

# Load dev data
dev_df = pd.read_csv(dev_path)
print(f"\nDev data shape: {dev_df.shape}")
print(f"Number of unique clusters in dev: {dev_df['cluster'].nunique()}")
print(f"Number of terms in dev: {len(dev_df)}")

# Create term-to-cluster mapping
# Terms are lowercased before becoming dict keys, so rows whose terms differ
# only by case collapse into one entry (the last row wins).
term_to_cluster = dict(zip(train_df['term'].str.lower(), train_df['cluster']))

# Create cluster-to-terms mapping for analysis
# Built from the deduplicated mapping above, not directly from train_df.
cluster_to_terms = defaultdict(list)
for term, cluster in term_to_cluster.items():
    cluster_to_terms[cluster].append(term)

# Show the five clusters with the smallest ids as a sanity check.
print(f"\nExample clusters from training:")
for cluster_id in sorted(cluster_to_terms.keys())[:5]:
    print(f"  Cluster {cluster_id}: {cluster_to_terms[cluster_id]}")

Training data shape: (713, 2)
Number of unique clusters: 299
Number of terms: 713

Dev data shape: (242, 2)
Number of unique clusters in dev: 147
Number of terms in dev: 242

Example clusters from training:
  Cluster 0: ['biodegradabili']
  Cluster 1: ['cassonetti', 'contenitori stradali', 'cassonetti stradali', 'postazioni stradali']
  Cluster 2: ['separare i rifiuti']
  Cluster 3: ['deposito', 'conferimento', 'conferimento delle frazioni', 'conferimento dei rifiuti', 'conferimenti', 'operazioni di conferimento']
  Cluster 4: ['conferiti', 'conferito', 'conferita']


## SpaCy Vector Utilities

Functions to compute term embeddings using SpaCy.

In [11]:
def get_term_vector(term: str, nlp) -> np.ndarray:
    """
    Embed a term as the mean of its tokens' word vectors.

    The term is processed with `nlp`; tokens that report no vector are
    ignored. If no token has a vector, a zero vector of length
    `nlp.vocab.vectors_length` is returned instead.
    """
    token_vectors = []
    for tok in nlp(term):
        if tok.has_vector:
            token_vectors.append(tok.vector)

    # Nothing to average: fall back to the all-zero vector
    if len(token_vectors) == 0:
        return np.zeros(nlp.vocab.vectors_length)

    return np.mean(token_vectors, axis=0)


def compute_similarity(term1: str, term2: str, nlp) -> float:
    """
    Cosine similarity between the averaged word vectors of two terms.

    Returns 0.0 when either term maps to an all-zero vector (no token had a
    vector); otherwise returns 1 minus the cosine distance.
    """
    first = get_term_vector(term1, nlp)
    second = get_term_vector(term2, nlp)

    # Cosine similarity is undefined for a zero vector, so report 0.0
    either_is_zero = np.all(first == 0) or np.all(second == 0)
    if either_is_zero:
        return 0.0

    distance = cosine(first, second)
    return 1.0 - distance


# Test
# Smoke test: print the similarity of every unordered pair of sample terms.
test_terms = ['rifiuti', 'spazzatura', 'carta']
print("Testing similarity computation:")
for i, t1 in enumerate(test_terms):
    # Pair t1 only with the terms after it so each pair is printed once.
    for t2 in test_terms[i+1:]:
        sim = compute_similarity(t1, t2, nlp)
        print(f"  '{t1}' <-> '{t2}': {sim:.3f}")

Testing similarity computation:
  'rifiuti' <-> 'spazzatura': 0.572
  'rifiuti' <-> 'carta': 0.198
  'spazzatura' <-> 'carta': 0.423


## Baseline: Similarity-Based Clustering

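For each dev term, find the most similar training term (cosine similarity between averaged spaCy word vectors) and assign that term's cluster. Dev terms that also occur in the training set reuse their training cluster directly; terms whose best similarity falls below `similarity_threshold` are labelled `-1` (unknown).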

In [None]:
class SpacyBaselineClustering:
    """Similarity-based clustering using SpaCy word vectors.

    Each query term is assigned the cluster of its most similar training
    term. Terms seen during training are resolved by exact (lowercased)
    lookup; terms whose best similarity is below `similarity_threshold`
    are labelled -1.
    """

    def __init__(self, nlp, similarity_threshold=0.5):
        self.nlp = nlp
        self.similarity_threshold = similarity_threshold
        self.term_vectors = {}      # lowercased term -> averaged word vector
        self.term_to_cluster = {}   # lowercased term -> cluster id

    def fit(self, terms: List[str], clusters: List[int]):
        """Build the term-cluster mapping and precompute vectors.

        Any index built by a previous call is discarded, so the model only
        ever reflects the most recent training set.
        """
        # Start from a clean index: without this a second fit() would keep
        # stale terms from the first one.
        self.term_vectors = {}
        self.term_to_cluster = {}

        print("Computing term vectors...")
        for term, cluster in tqdm(zip(terms, clusters), total=len(terms)):
            term_lower = term.lower()
            self.term_to_cluster[term_lower] = cluster
            self.term_vectors[term_lower] = get_term_vector(term_lower, self.nlp)

        print(f"Built index with {len(self.term_vectors)} terms")

    def predict_one(self, term: str) -> Tuple[int, float]:
        """Predict cluster for a single term.

        Returns:
            (cluster, similarity). Cluster is -1 when the term has no usable
            vector or when the best similarity is below the threshold; an
            exact match returns similarity 1.0.
        """
        term_lower = term.lower()

        # Check for exact match
        if term_lower in self.term_to_cluster:
            return self.term_to_cluster[term_lower], 1.0

        # Compute vector for query term
        query_vec = get_term_vector(term_lower, self.nlp)

        # A zero query vector can never match anything; decide this once
        # instead of re-checking it for every training term.
        if not np.any(query_vec):
            return -1, 0.0

        # Find most similar term
        best_similarity = 0.0
        best_cluster = -1

        for train_term, train_vec in self.term_vectors.items():
            # Training terms without a usable vector cannot be compared
            if not np.any(train_vec):
                continue

            sim = 1.0 - cosine(query_vec, train_vec)
            if sim > best_similarity:
                best_similarity = sim
                best_cluster = self.term_to_cluster[train_term]

        # Check threshold
        if best_similarity < self.similarity_threshold:
            return -1, best_similarity

        return best_cluster, best_similarity

    def predict(self, terms: List[str]) -> Tuple[List[int], List[float]]:
        """Predict clusters for multiple terms.

        Returns two parallel lists: predicted clusters and their similarities.
        """
        predictions = []
        similarities = []

        for term in tqdm(terms, desc="Predicting clusters"):
            cluster, sim = self.predict_one(term)
            predictions.append(cluster)
            similarities.append(sim)

        return predictions, similarities


# Test
# Smoke test on a three-term toy index: 'spazzatura' is not in the index,
# so the prediction comes from the vector-similarity path, not exact lookup.
test_model = SpacyBaselineClustering(nlp, similarity_threshold=0.5)
test_terms_list = ['rifiuti', 'carta', 'plastica']
test_clusters_list = [37, 74, 43]
test_model.fit(test_terms_list, test_clusters_list)
pred_cluster, pred_sim = test_model.predict_one('spazzatura')
print(f"\n Baseline model works!!")
print(f"  'spazzatura' -> Cluster {pred_cluster} (similarity: {pred_sim:.3f})")

Computing term vectors...


100%|██████████| 3/3 [00:00<00:00, 331.08it/s]

Built index with 3 terms

✓ Baseline model works
  'spazzatura' -> Cluster 43 (similarity: 0.676)





### Train and Evaluate Baseline Model

In [13]:
# Initialize baseline model
baseline_model = SpacyBaselineClustering(nlp, similarity_threshold=0.5)

# Fit on training data
baseline_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict on dev set
dev_terms = dev_df['term'].tolist()
baseline_preds, baseline_sims = baseline_model.predict(dev_terms)

# Store results (adds two columns to dev_df in place)
dev_df['baseline_cluster'] = baseline_preds
dev_df['baseline_similarity'] = baseline_sims

# Exact matches are identified by lookup in the training index rather than by
# "similarity >= 0.99": a different term with near-identical vectors would
# otherwise be miscounted as an exact match.
is_exact = [term.lower() in baseline_model.term_to_cluster for term in dev_terms]
fuzzy_sims = [sim for sim, exact in zip(baseline_sims, is_exact) if not exact]

print(f"\nPrediction statistics:")
print(f"  Total terms: {len(baseline_preds)}")
print(f"  Terms with exact match: {sum(is_exact)}")
print(f"  Terms with high similarity (≥0.7): {sum(1 for s in fuzzy_sims if s >= 0.7)}")
print(f"  Terms with medium similarity (0.5-0.7): {sum(1 for s in fuzzy_sims if 0.5 <= s < 0.7)}")
print(f"  Terms below threshold (unknown): {sum(1 for p in baseline_preds if p == -1)}")

Computing term vectors...


100%|██████████| 713/713 [00:01<00:00, 378.92it/s]
100%|██████████| 713/713 [00:01<00:00, 378.92it/s]


Built index with 713 terms


Predicting clusters: 100%|██████████| 242/242 [00:01<00:00, 239.95it/s]


Prediction statistics:
  Total terms: 242
  Terms with exact match: 150
  Terms with high similarity (≥0.7): 79
  Terms with medium similarity (0.5-0.7): 6
  Terms below threshold (unknown): 7





## Save Predictions

In [15]:
# Save baseline predictions
output_path_baseline = "predictions/subtask_b_dev_spacy_baseline_preds.csv"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path_baseline), exist_ok=True)
# Only (term, cluster) is written: the evaluation loader reads the file
# positionally as term/cluster pairs, so the similarity column is left out.
dev_df[['term', 'baseline_cluster']].to_csv(output_path_baseline, index=False)
print(f"Baseline predictions saved to {output_path_baseline}")

Baseline predictions saved to predictions/subtask_b_dev_spacy_baseline_preds.csv


### Evaluate Baseline Model

In [16]:
import numpy as np
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads data from a CSV or JSON file and returns a dictionary
  where keys are terms and values are cluster_ids.

  CSV files are read positionally: the first column is the term and the
  second column is the cluster id (converted to int). Column names are not
  inspected and any further columns are ignored. JSON files must contain a
  top-level "data" list of objects with "term" and "cluster" keys.

  Args:
    file_path: The path to the input file (CSV or JSON). The extension is
      matched case-insensitively.

  Returns:
    A dictionary containing the loaded data.

  Raises:
    ValueError: If the file format is not supported, or if a CSV file has
      fewer than two columns.
  """
  extension_probe = file_path.lower()
  if extension_probe.endswith('.csv'):
    # Load data from CSV file
    df = pd.read_csv(file_path)
    if df.shape[1] < 2:
      raise ValueError("CSV file must contain at least two columns (term, cluster).")
    # Positional access keeps working when the file carries extra columns
    # (e.g. a similarity score next to the predicted cluster).
    data = {term: int(cluster) for term, cluster in zip(df.iloc[:, 0], df.iloc[:, 1])}
  elif extension_probe.endswith('.json'):
    # Load data from JSON file; built-in open() replaces the legacy codecs.open()
    with open(file_path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    # Extract terms from JSON data
    data = {item["term"]: item["cluster"] for item in json_data["data"]}
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

class BCubed_calculator:
  """Per-item BCubed precision/recall for a gold and a predicted clustering.

  `gold` and `pred` are dicts mapping item -> cluster id. The constructor
  inverts both into cluster id -> set of items.
  """

  def __init__(self, gold, pred):
    self.gold = gold
    self.pred = pred
    self.gold_cluster = defaultdict(set)
    self.pred_cluster = defaultdict(set)
    # Invert both assignments into cluster -> members
    for assignment, members_by_cluster in ((gold, self.gold_cluster), (pred, self.pred_cluster)):
      for item, clus_id in assignment.items():
        members_by_cluster[clus_id].add(item)

  def bc_precision_item(self, item):
    """Fraction of the item's predicted cluster that shares its gold cluster."""
    pred_members = self.pred_cluster[self.pred[item]]
    gold_members = self.gold_cluster[self.gold.get(item, None)]
    shared = len(pred_members & gold_members)
    # TP + FP is the size of the predicted cluster
    return shared / len(pred_members)

  def bc_recall_item(self, item):
    """Fraction of the item's gold cluster that shares its predicted cluster."""
    pred_members = self.pred_cluster[self.pred.get(item, None)]
    gold_members = self.gold_cluster[self.gold.get(item)]
    shared = len(pred_members & gold_members)
    # TP + FN is the size of the gold cluster
    return shared / len(gold_members)

def bcubed_precision(gold, pred):
  """Average per-item BCubed precision over every item in `pred`."""
  scorer = BCubed_calculator(gold, pred)
  per_item = []
  for item in scorer.pred:
    per_item.append(scorer.bc_precision_item(item))
  return np.average(per_item)

def bcubed_recall(gold, pred):
  """Average per-item BCubed recall over every item in `gold`."""
  scorer = BCubed_calculator(gold, pred)
  per_item = []
  for item in scorer.gold:
    per_item.append(scorer.bc_recall_item(item))
  return np.average(per_item)

def bcubed_f1(gold, pred):
  """Harmonic mean of BCubed precision and recall.

  Precision and recall are each computed once. Returns 0.0 when both are
  zero instead of dividing zero by zero.
  """
  precision = bcubed_precision(gold, pred)
  recall = bcubed_recall(gold, pred)
  if precision + recall == 0:
    return 0.0
  return 2 * precision * recall / (precision + recall)

In [17]:
# Score the baseline: predictions are read back from the CSV written above
# and compared against the dev gold labels with the BCubed metrics.
preds = load_data("predictions/subtask_b_dev_spacy_baseline_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
print(f"BCubed Precision: {bcubed_precision(gold, preds):.4f}")
print(f"BCubed Recall: {bcubed_recall(gold, preds):.4f}")
print(f"BCubed F1: {bcubed_f1(gold, preds):.4f}")

BCubed Precision: 0.7549
BCubed Recall: 0.8733
BCubed F1: 0.8098


## K-Means Clustering with SpaCy Vectors

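Train a K-means model on the training term vectors, map each learned cluster to the gold cluster that is most frequent among its training members, then give each dev term the gold label of its nearest learned cluster. Terms without a usable vector are labelled `-1` (unknown).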

In [20]:
class SpacyKMeansClustering:
    """K-means clustering using SpaCy word vectors.

    K-means is trained on the averaged word vectors of the training terms.
    Each learned cluster is then mapped to the gold cluster most frequent
    among its members, so predictions are expressed as gold cluster ids.
    """

    def __init__(self, nlp, n_clusters=None):
        self.nlp = nlp
        self.n_clusters = n_clusters  # None: derived from the gold labels in fit()
        self.kmeans = None
        self.cluster_mapping = {}  # Maps learned cluster IDs to gold cluster IDs

    def fit(self, terms: List[str], true_clusters: List[int]):
        """Train K-means on term vectors.

        Terms whose vector is all zeros are excluded from training.

        Raises:
            ValueError: If no term has a usable word vector.
        """
        # Compute vectors for all terms
        print("Computing term vectors...")
        vectors = []
        valid_indices = []

        for i, term in enumerate(tqdm(terms)):
            vec = get_term_vector(term.lower(), self.nlp)
            if np.any(vec):  # Keep only terms with valid vectors
                vectors.append(vec)
                valid_indices.append(i)

        vectors = np.array(vectors, dtype=np.float64)  # Convert to float64 for sklearn
        valid_clusters = [true_clusters[i] for i in valid_indices]

        print(f"Valid vectors: {len(vectors)} / {len(terms)}")

        if len(vectors) == 0:
            raise ValueError("No term has a usable word vector; cannot train K-means.")

        # Determine number of clusters
        if self.n_clusters is None:
            self.n_clusters = len(set(true_clusters))
        # KMeans needs at least as many samples as clusters; cap rather than crash
        if self.n_clusters > len(vectors):
            print(f"Reducing n_clusters from {self.n_clusters} to {len(vectors)} (number of valid vectors)")
            self.n_clusters = len(vectors)

        print(f"Training K-means with {self.n_clusters} clusters...")
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        learned_clusters = self.kmeans.fit_predict(vectors)

        # Map learned cluster IDs to gold cluster IDs
        # For each learned cluster, find the most common gold cluster
        self.cluster_mapping = {}
        for learned_id in range(self.n_clusters):
            member_positions = np.flatnonzero(learned_clusters == learned_id)
            if member_positions.size == 0:
                continue
            gold_clusters_in_learned = [valid_clusters[i] for i in member_positions]
            # Most common gold cluster in this learned cluster
            most_common = max(set(gold_clusters_in_learned), key=gold_clusters_in_learned.count)
            self.cluster_mapping[learned_id] = most_common

        print(f"Training complete! Built mapping for {len(self.cluster_mapping)} clusters")

    def predict(self, terms: List[str]) -> List[int]:
        """Predict clusters for new terms.

        Returns one gold cluster id per input term, in input order; -1 marks
        terms without a usable vector or whose learned cluster has no mapping.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.kmeans is None:
            raise RuntimeError("Model not trained. Call fit() first.")

        predictions = []
        vectors = []
        positions = []  # index in `predictions` of each entry in `vectors`

        for term in tqdm(terms, desc="Predicting clusters"):
            vec = get_term_vector(term.lower(), self.nlp)
            predictions.append(-1)  # Unknown until proven otherwise
            if np.any(vec):
                vectors.append(vec)
                positions.append(len(predictions) - 1)

        if vectors:
            # One batched call instead of one kmeans.predict() per term
            learned = self.kmeans.predict(np.array(vectors, dtype=np.float64))
            for position, learned_cluster in zip(positions, learned):
                # Map to gold cluster ID
                predictions[position] = self.cluster_mapping.get(int(learned_cluster), -1)

        return predictions


# Test
# Smoke test: three training terms with three distinct gold labels, so with
# n_clusters left unset the model trains with three clusters.
test_kmeans = SpacyKMeansClustering(nlp)
test_kmeans.fit(['rifiuti', 'carta', 'plastica'], [37, 74, 43])
test_pred = test_kmeans.predict(['spazzatura'])
print(f"\n K-means model works!!")
print(f"  'spazzatura' -> Cluster {test_pred[0]}")

Computing term vectors...


100%|██████████| 3/3 [00:00<00:00, 247.06it/s]



Valid vectors: 3 / 3
Training K-means with 3 clusters...
Training complete! Built mapping for 3 clusters


Predicting clusters: 100%|██████████| 1/1 [00:00<00:00, 166.48it/s]


 K-means model works!!
  'spazzatura' -> Cluster 43





### Train and Evaluate K-Means Model

In [19]:
# Initialize K-means model
# n_clusters is left unset, so fit() derives it from the distinct gold clusters.
kmeans_model = SpacyKMeansClustering(nlp)

# Fit on training data
kmeans_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict on dev set
kmeans_preds = kmeans_model.predict(dev_df['term'].tolist())

# Store results (adds a column to dev_df in place)
dev_df['kmeans_cluster'] = kmeans_preds

# -1 is the model's marker for a term it could not assign to any gold cluster.
print(f"\nPrediction statistics:")
print(f"  Total terms: {len(kmeans_preds)}")
print(f"  Terms with predictions: {sum(1 for p in kmeans_preds if p != -1)}")
print(f"  Terms without predictions (unknown): {sum(1 for p in kmeans_preds if p == -1)}")

Computing term vectors...


100%|██████████| 713/713 [00:01<00:00, 367.00it/s]



Valid vectors: 702 / 713
Training K-means with 299 clusters...
Training complete! Built mapping for 299 clusters
Training complete! Built mapping for 299 clusters


Predicting clusters: 100%|██████████| 242/242 [00:00<00:00, 309.82it/s]


Prediction statistics:
  Total terms: 242
  Terms with predictions: 239
  Terms without predictions (unknown): 3





## Save Predictions

In [21]:
# Save K-means predictions
output_path_kmeans = "predictions/subtask_b_dev_spacy_kmeans_preds.csv"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path_kmeans), exist_ok=True)
dev_df[['term', 'kmeans_cluster']].to_csv(output_path_kmeans, index=False)
print(f"K-means predictions saved to {output_path_kmeans}")

K-means predictions saved to predictions/subtask_b_dev_spacy_kmeans_preds.csv


### Evaluate K-means Model

In [23]:
# Score K-means: predictions are read back from the CSV written above and
# compared against the dev gold labels with the BCubed metrics.
# Note: this rebinds `preds` and `gold` from the baseline evaluation cell.
preds = load_data("predictions/subtask_b_dev_spacy_kmeans_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
print(f"BCubed Precision: {bcubed_precision(gold, preds):.4f}")
print(f"BCubed Recall: {bcubed_recall(gold, preds):.4f}")
print(f"BCubed F1: {bcubed_f1(gold, preds):.4f}")

BCubed Precision: 0.5629
BCubed Recall: 0.7730
BCubed F1: 0.6514


## Model Comparison

In [25]:
# Load predictions and gold standard for both models.
# The loaded term -> cluster dicts get their own names: `baseline_preds` and
# `kmeans_preds` hold the prediction *lists* produced by the models, and
# rebinding them to dicts here would silently break a re-run of the earlier
# "Store results" cells.
gold = load_data("../data/subtask_b_dev.csv")
baseline_pred_map = load_data("predictions/subtask_b_dev_spacy_baseline_preds.csv")
kmeans_pred_map = load_data("predictions/subtask_b_dev_spacy_kmeans_preds.csv")

# Compute BCubed metrics for baseline model
baseline_precision = bcubed_precision(gold, baseline_pred_map)
baseline_recall = bcubed_recall(gold, baseline_pred_map)
baseline_f1 = bcubed_f1(gold, baseline_pred_map)

# Compute BCubed metrics for K-means model
kmeans_precision = bcubed_precision(gold, kmeans_pred_map)
kmeans_recall = bcubed_recall(gold, kmeans_pred_map)
kmeans_f1 = bcubed_f1(gold, kmeans_pred_map)

# Create comparison dataframe
comparison_df = pd.DataFrame([
    {
        'Model': 'Baseline (Similarity)',
        'BCubed Precision': baseline_precision,
        'BCubed Recall': baseline_recall,
        'BCubed F1': baseline_f1
    },
    {
        'Model': 'K-Means',
        'BCubed Precision': kmeans_precision,
        'BCubed Recall': kmeans_recall,
        'BCubed F1': kmeans_f1
    }
])

print("\n" + "="*80)
print("MODEL COMPARISON - BCubed Metrics")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

# Calculate improvements: relative change of K-means over the baseline, in
# percent; reported as 0 when the baseline value is 0 to avoid dividing by zero.
precision_improvement = (kmeans_precision - baseline_precision) / baseline_precision * 100 if baseline_precision > 0 else 0
recall_improvement = (kmeans_recall - baseline_recall) / baseline_recall * 100 if baseline_recall > 0 else 0
f1_improvement = (kmeans_f1 - baseline_f1) / baseline_f1 * 100 if baseline_f1 > 0 else 0

print(f"\nK-Means vs Baseline:")
print(f"  BCubed Precision improvement: {precision_improvement:+.1f}%")
print(f"  BCubed Recall improvement: {recall_improvement:+.1f}%")
print(f"  BCubed F1 improvement: {f1_improvement:+.1f}%")


MODEL COMPARISON - BCubed Metrics
                Model  BCubed Precision  BCubed Recall  BCubed F1
Baseline (Similarity)          0.754900       0.873318   0.809802
              K-Means          0.562915       0.772963   0.651426

K-Means vs Baseline:
  BCubed Precision improvement: -25.4%
  BCubed Recall improvement: -11.5%
  BCubed F1 improvement: -19.6%


## Possible improvements
- Tune *similarity_threshold* in baseline