# Vanilla Term Clustering Baseline

Simple similarity-based clustering baseline:
- Load training data with term-cluster mappings
- For each dev term, find the most similar training term
- Assign the cluster of the most similar training term
- Use simple string similarity metrics (exact match, substring, word overlap, character-level sequence similarity)

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed) - Subtask B

## Import Libraries

In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict
from difflib import SequenceMatcher
import os

## Load Training Data

Load the training set and build a mapping of terms to clusters.

In [6]:
# Locations of the Subtask B splits, relative to the notebook directory.
data_path = "../data"
train_path = os.path.join(data_path, "subtask_b_train.csv")
dev_path = os.path.join(data_path, "subtask_b_dev.csv")

# Read the training split and report its basic statistics.
train_df = pd.read_csv(train_path)
print(f"Training data shape: {train_df.shape}")
print(f"Number of unique clusters: {train_df['cluster'].nunique()}")
print(f"Number of terms: {len(train_df)}")

# Lower-cased term -> cluster id. When two rows share the same lower-cased
# term, the later row overwrites the earlier one.
term_to_cluster = {
    lowered_term: cluster_label
    for lowered_term, cluster_label in zip(train_df['term'].str.lower(), train_df['cluster'])
}

# Inverse view (cluster id -> list of its lower-cased terms), kept for inspection.
cluster_to_terms = defaultdict(list)
for term, cluster in term_to_cluster.items():
    cluster_to_terms[cluster].append(term)

# Show the five clusters with the smallest ids.
print("\nExample clusters:")
for cluster_id in sorted(cluster_to_terms)[:5]:
    print(f"  Cluster {cluster_id}: {cluster_to_terms[cluster_id]}")

Training data shape: (713, 2)
Number of unique clusters: 299
Number of terms: 713

Example clusters:
  Cluster 0: ['biodegradabili']
  Cluster 1: ['cassonetti', 'contenitori stradali', 'cassonetti stradali', 'postazioni stradali']
  Cluster 2: ['separare i rifiuti']
  Cluster 3: ['deposito', 'conferimento', 'conferimento delle frazioni', 'conferimento dei rifiuti', 'conferimenti', 'operazioni di conferimento']
  Cluster 4: ['conferiti', 'conferito', 'conferita']


## Load Dev Data

In [7]:
# Read the development split and summarise it.
dev_df = pd.read_csv(dev_path)
n_dev_clusters = dev_df['cluster'].nunique()
print(f"Dev data shape: {dev_df.shape}")
print(f"Number of unique clusters in dev: {n_dev_clusters}")
print(f"Number of terms in dev: {len(dev_df)}")

# Preview the first ten rows.
print("\nFirst few dev terms:")
print(dev_df.head(10))

Dev data shape: (242, 2)
Number of unique clusters in dev: 147
Number of terms in dev: 242

First few dev terms:
                             term  cluster
0                        deposito        3
1                    conferimento        3
2                       conferiti        4
3                   differenziati        5
4       servizio di igiene urbana        7
5         olio alimentare esausto       10
6                     oli esausti       10
7  operatori dell'isola ecologica       11
8                       pannolini       12
9                       pannoloni       12


## Similarity Functions

Define various similarity metrics for matching terms.

In [14]:
def exact_match_similarity(term1, term2):
    """Score 1.0 when the two terms compare equal, and 0.0 otherwise."""
    if term1 == term2:
        return 1.0
    return 0.0

def substring_similarity(term1, term2):
    """
    Score containment between two terms.

    Args:
        term1: First term (string).
        term2: Second term (string).

    Returns:
        0.9 if both terms are non-empty and one occurs inside the other
        (identical non-empty terms therefore also score 0.9), 0.0 otherwise.
    """
    # The empty string is a substring of every string, so without this guard
    # an empty term would score 0.9 against every other term.
    if not term1 or not term2:
        return 0.0
    if term1 in term2 or term2 in term1:
        return 0.9
    return 0.0

def word_overlap_similarity(term1, term2):
    """
    Jaccard similarity between the whitespace-separated word sets of two terms.

    Returns 0.0 when either term contains no words; otherwise the number of
    shared words divided by the number of distinct words across both terms.
    """
    # https://www.sciencedirect.com/topics/computer-science/jaccard-similarity
    tokens_a = set(term1.split())
    tokens_b = set(term2.split())

    if not (tokens_a and tokens_b):
        return 0.0

    # Both sets are non-empty here, so the union is never empty.
    shared = tokens_a.intersection(tokens_b)
    combined = tokens_a.union(tokens_b)
    return len(shared) / len(combined)

def sequence_similarity(term1, term2):
    """Return the difflib SequenceMatcher ratio of the two terms (no junk filter)."""
    matcher = SequenceMatcher(isjunk=None, a=term1, b=term2)
    return matcher.ratio()

def combined_similarity(term1, term2, word_overlap_weight=0.6, seq_weight=0.4):
    """
    Score two terms with a cascade of string similarity checks.

    An exact match returns 1.0 and a non-zero substring score is returned
    as-is. Otherwise the result is
    word_overlap_weight * word overlap + seq_weight * sequence similarity.
    """
    # An exact match wins outright.
    if exact_match_similarity(term1, term2) == 1.0:
        return 1.0

    # Containment comes next and short-circuits the weighted blend.
    containment = substring_similarity(term1, term2)
    if containment > 0:
        return containment

    # Fall back to the weighted sum of word-level and character-level scores.
    overlap_part = word_overlap_weight * word_overlap_similarity(term1, term2)
    sequence_part = seq_weight * sequence_similarity(term1, term2)
    return overlap_part + sequence_part

## Predict Clusters for Dev Set

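For each term in the dev set, find the most similar training term and assign its cluster. Terms whose best similarity falls below the threshold (0.3 by default) are assigned to the unknown cluster `-1`.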

In [9]:
def find_best_cluster(term, term_to_cluster, similarity_threshold=0.3):
    """
    Pick a cluster for `term` from the closest known term.

    The term is lower-cased before lookup. If it is a key of
    `term_to_cluster`, that cluster is returned with score 1.0. Otherwise
    every known term is scored with `combined_similarity` and the first one
    with the strictly highest score wins.

    Returns a (cluster_id, similarity) tuple; cluster_id is -1 when the best
    similarity is below `similarity_threshold`.
    """
    query = term.lower()

    # Fast path: the lower-cased term is already known.
    if query in term_to_cluster:
        return term_to_cluster[query], 1.0

    # Linear scan; the strict comparison keeps the earliest best candidate.
    top_score, top_cluster = 0.0, None
    for candidate, candidate_cluster in term_to_cluster.items():
        score = combined_similarity(query, candidate)
        if score > top_score:
            top_score, top_cluster = score, candidate_cluster

    # Too dissimilar to anything known: report the unknown cluster (-1).
    chosen = -1 if top_score < similarity_threshold else top_cluster
    return chosen, top_score

In [10]:
# Assign a cluster to every dev term, recording the winning similarity too.
predictions = []
similarity_scores = []

print("Predicting clusters for dev set...")
for idx, term in dev_df['term'].items():
    cluster_pred, sim_score = find_best_cluster(term, term_to_cluster)
    predictions.append(cluster_pred)
    similarity_scores.append(sim_score)

    # Echo the rows whose index label is below 10 as a quick sanity check.
    if idx < 10:
        print(f"Term: '{term}' -> Cluster: {cluster_pred} (similarity: {sim_score:.3f})")

dev_df['predicted_cluster'] = predictions
dev_df['similarity_score'] = similarity_scores

# Bucket the scores into the similarity bands reported below.
n_high = sum(s >= 0.9 for s in similarity_scores)
n_medium = sum(0.5 <= s < 0.9 for s in similarity_scores)
n_low = sum(0.3 <= s < 0.5 for s in similarity_scores)
n_unknown = predictions.count(-1)

print("\nPredictions complete!")
print(f"Terms with exact/high similarity match: {n_high}")
print(f"Terms with medium similarity match: {n_medium}")
print(f"Terms with low similarity match: {n_low}")
print(f"Terms with no match (unknown cluster): {n_unknown}")

Predicting clusters for dev set...
Term: 'deposito' -> Cluster: 3 (similarity: 1.000)
Term: 'conferimento' -> Cluster: 3 (similarity: 1.000)
Term: 'conferiti' -> Cluster: 4 (similarity: 1.000)
Term: 'differenziati' -> Cluster: 44 (similarity: 0.900)
Term: 'servizio di igiene urbana' -> Cluster: 7 (similarity: 1.000)
Term: 'olio alimentare esausto' -> Cluster: 10 (similarity: 1.000)
Term: 'oli esausti' -> Cluster: 10 (similarity: 1.000)
Term: 'operatori dell'isola ecologica' -> Cluster: 11 (similarity: 1.000)
Term: 'pannolini' -> Cluster: 12 (similarity: 1.000)
Term: 'pannoloni' -> Cluster: 12 (similarity: 0.900)

Predictions complete!
Terms with exact/high similarity match: 218
Terms with medium similarity match: 3
Terms with low similarity match: 6
Terms with no match (unknown cluster): 15

Predictions complete!
Terms with exact/high similarity match: 218
Terms with medium similarity match: 3
Terms with low similarity match: 6
Terms with no match (unknown cluster): 15


## Save Predictions

In [11]:
# Save predictions
output_path = "predictions/subtask_b_dev_vanilla_preds.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write only the term and its predicted cluster; the similarity_score column
# stays in dev_df and is not part of the saved file.
dev_df[['term', 'predicted_cluster']].to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")

Predictions saved to predictions/subtask_b_dev_vanilla_preds.csv


## Evaluation Metrics

Calculate clustering performance metrics.

In [None]:
import numpy as np
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
    """
    Load a term -> cluster id mapping from a CSV or JSON file.

    CSV files are read positionally: the first column holds the term and the
    second the cluster id (cast to int). Header names are ignored and any
    further columns are skipped. JSON files must contain a top-level "data"
    list whose items have "term" and "cluster" keys; cluster values are kept
    exactly as stored in the file.

    Args:
        file_path: The path to the input file (CSV or JSON).

    Returns:
        A dictionary whose keys are terms and whose values are cluster ids.
        If a term occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If the file format is not supported.
    """
    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
        # Select the two relevant columns by position instead of unpacking
        # whole rows, so files with extra columns still load.
        terms = df.iloc[:, 0]
        clusters = df.iloc[:, 1]
        data = {term: int(cluster) for term, cluster in zip(terms, clusters)}
    elif file_path.endswith('.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        data = {item["term"]: item["cluster"] for item in json_data["data"]}
    else:
        raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
    return data

class BCubed_calculator:
    """
    Per-item BCubed precision and recall for a predicted clustering.

    `gold` and `pred` are mappings from item to cluster id. On construction
    they are inverted into cluster id -> set of member items.
    """

    def __init__(self, gold, pred):
        self.gold = gold
        self.pred = pred
        # Inverted indexes: cluster id -> members of that cluster.
        self.gold_cluster = defaultdict(set)
        self.pred_cluster = defaultdict(set)
        for assignment, index in ((gold, self.gold_cluster), (pred, self.pred_cluster)):
            for member, cluster_id in assignment.items():
                index[cluster_id].add(member)

    def bc_precision_item(self, item):
        """
        Share of the item's predicted cluster that is also in its gold cluster.

        `item` must be a key of `pred`. An item missing from `gold` is
        compared against the (empty) cluster stored under None.
        """
        predicted_members = self.pred_cluster[self.pred[item]]
        gold_members = self.gold_cluster[self.gold.get(item)]
        shared = len(predicted_members & gold_members)
        # True positives plus false positives is the predicted cluster size.
        return shared / len(predicted_members)

    def bc_recall_item(self, item):
        """
        Share of the item's gold cluster that is also in its predicted cluster.

        An item missing from `pred` is compared against the (empty) cluster
        stored under None.
        """
        predicted_members = self.pred_cluster[self.pred.get(item)]
        gold_members = self.gold_cluster[self.gold.get(item)]
        shared = len(predicted_members & gold_members)
        # True positives plus false negatives is the gold cluster size.
        return shared / len(gold_members)

def bcubed_precision(gold, pred):
    """Average the per-item BCubed precision over every item in `pred`."""
    scorer = BCubed_calculator(gold, pred)
    per_item = list(map(scorer.bc_precision_item, scorer.pred))
    return np.average(per_item)

def bcubed_recall(gold, pred):
    """Average the per-item BCubed recall over every item in `gold`."""
    scorer = BCubed_calculator(gold, pred)
    per_item = list(map(scorer.bc_recall_item, scorer.gold))
    return np.average(per_item)

def bcubed_f1(gold, pred):
    """
    Harmonic mean of BCubed precision and recall.

    Args:
        gold: Mapping from item to gold cluster id.
        pred: Mapping from item to predicted cluster id.

    Returns:
        2 * P * R / (P + R), or 0.0 when precision and recall are both zero.
    """
    # Compute each metric once; every call rebuilds the cluster indexes.
    precision = bcubed_precision(gold, pred)
    recall = bcubed_recall(gold, pred)
    total = precision + recall
    # Avoid a 0/0 (NaN) result when nothing was clustered correctly.
    if total == 0:
        return 0.0
    return 2 * precision * recall / total

In [13]:
# Score the saved dev predictions against the gold dev clustering.
preds = load_data("predictions/subtask_b_dev_vanilla_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")

# Report the three BCubed scores in a fixed order.
for metric_name, metric_fn in (
    ("Precision", bcubed_precision),
    ("Recall", bcubed_recall),
    ("F1", bcubed_f1),
):
    print(f"BCubed {metric_name}: {metric_fn(gold, preds):.4f}")

BCubed Precision: 0.6555
BCubed Recall: 0.8703
BCubed F1: 0.7478


## Possible improvements
- Tune the *unknown cluster* threshold
- Tune the *word_overlap_weight* and *seq_weight* weights in combined similarity