# NLTK Term Clustering

This notebook demonstrates two approaches to term clustering using NLTK:
1. **Baseline**: String similarity and fuzzy matching
2. **Trained**: TF-IDF based similarity clustering

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed) - Subtask B

## Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import math
import difflib
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

import nltk
from nltk import word_tokenize
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, v_measure_score
from sklearn.metrics import homogeneity_score, completeness_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Download the tokenizer data used by word_tokenize.
# Recent NLTK releases (3.9+) load the Punkt models from the 'punkt_tab' resource,
# while older releases read 'punkt'. Requesting both keeps the notebook working on
# either version instead of silently hitting the split() fallback in tokenize_text.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("Setup complete")

Setup complete


## Load Training and Dev Data

In [4]:
# Locations of the Subtask B CSV files, relative to the notebook.
data_path = "../data"
train_path = os.path.join(data_path, "subtask_b_train.csv")
dev_path = os.path.join(data_path, "subtask_b_dev.csv")

# Training split: one row per term with its cluster id.
train_df = pd.read_csv(train_path)
print(f"Training data shape: {train_df.shape}")
print(f"Number of unique clusters: {train_df['cluster'].nunique()}")
print(f"Number of terms: {len(train_df)}")

# Dev split, same layout.
dev_df = pd.read_csv(dev_path)
print(f"\nDev data shape: {dev_df.shape}")
print(f"Number of unique clusters in dev: {dev_df['cluster'].nunique()}")
print(f"Number of terms in dev: {len(dev_df)}")

# Lowercased training term -> cluster id (a repeated term keeps its last cluster).
term_to_cluster = {
    lowered_term: cluster_label
    for lowered_term, cluster_label in zip(train_df['term'].str.lower(), train_df['cluster'])
}

# Inverse view used only for inspection: cluster id -> list of its terms.
cluster_to_terms = defaultdict(list)
for term, cluster in term_to_cluster.items():
    cluster_to_terms[cluster].append(term)

print("\nExample clusters from training:")
for cluster_id in sorted(cluster_to_terms)[:5]:
    print(f"  Cluster {cluster_id}: {cluster_to_terms[cluster_id]}")

Training data shape: (713, 2)
Number of unique clusters: 299
Number of terms: 713

Dev data shape: (242, 2)
Number of unique clusters in dev: 147
Number of terms in dev: 242

Example clusters from training:
  Cluster 0: ['biodegradabili']
  Cluster 1: ['cassonetti', 'contenitori stradali', 'cassonetti stradali', 'postazioni stradali']
  Cluster 2: ['separare i rifiuti']
  Cluster 3: ['deposito', 'conferimento', 'conferimento delle frazioni', 'conferimento dei rifiuti', 'conferimenti', 'operazioni di conferimento']
  Cluster 4: ['conferiti', 'conferito', 'conferita']


## Text Normalization Utilities

In [3]:
def normalize_text(text: str) -> str:
    """Return ``text`` lowercased, with punctuation runs turned into spaces and whitespace collapsed."""
    # Every run of characters that is neither a word character, whitespace, nor an
    # accented Latin letter is replaced by a single space.
    without_punctuation = re.sub(r"[^\w\sÀ-ÖØ-öø-ÿ]+", " ", text.lower())
    # Collapse whitespace runs to one space and trim both ends.
    return re.sub(r"\s+", " ", without_punctuation).strip()


def tokenize_text(text: str) -> List[str]:
    """Tokenize ``text`` with NLTK's Italian tokenizer.

    Falls back to whitespace splitting when the tokenizer data is not installed.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens.
    """
    try:
        return word_tokenize(text, language='italian')
    except LookupError:
        # NLTK raises LookupError when a required data resource is missing.
        # Only that case triggers the fallback; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return text.split()


# Quick check of the two helpers on one Italian sentence.
test_text = "La raccolta differenziata è importante!"
normalized_test_text = normalize_text(test_text)
print(f"Original: {test_text}")
print(f"Normalized: {normalized_test_text}")
print(f"Tokens: {tokenize_text(normalized_test_text)}")

Original: La raccolta differenziata è importante!
Normalized: la raccolta differenziata è importante
Tokens: ['la', 'raccolta', 'differenziata', 'è', 'importante']


## Baseline: String Similarity Clustering

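For each dev term, finds the most similar training term with fuzzy string matching (exact match, substring containment, token overlap and `difflib` sequence similarity) and assigns that term's cluster. Dev terms whose best score falls below the similarity threshold are labelled `-1` (unknown).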

In [14]:
class NLTKBaselineClustering:
    """Baseline clustering using string similarity.

    A query term receives the cluster of its most similar training term, or
    ``-1`` when the best similarity is below ``similarity_threshold``.
    """

    def __init__(self, similarity_threshold=0.6):
        # Minimum best-match score required to accept a cluster assignment.
        self.similarity_threshold = similarity_threshold
        # Lowercased training term -> cluster id.
        self.term_to_cluster = {}
        # Lowercased training term -> its normalize_text() form, cached for scoring.
        self.normalized_terms = {}

    def fit(self, terms: List[str], clusters: List[int]):
        """Build term-cluster mapping.

        Terms are lowercased before being stored, so a term that occurs more than
        once keeps the cluster of its last occurrence. Entries from an earlier
        ``fit`` call are kept; the new terms are added on top of them.

        Args:
            terms: Training terms.
            clusters: Cluster id of each term, aligned with ``terms``.
        """
        print("Building term index...")
        for term, cluster in tqdm(zip(terms, clusters), total=len(terms)):
            term_lower = term.lower()
            self.term_to_cluster[term_lower] = cluster
            self.normalized_terms[term_lower] = normalize_text(term_lower)

        print(f"Built index with {len(self.term_to_cluster)} terms")

    @staticmethod
    def _score_normalized(norm1: str, norm2: str) -> float:
        """Score two already-normalized strings in the range [0, 1]."""
        # An empty string (e.g. a term made only of punctuation) carries no
        # information. Without this guard the substring test below would succeed
        # for every pair, because "" is a substring of any string.
        if not norm1 or not norm2:
            return 0.0

        # Exact match
        if norm1 == norm2:
            return 1.0

        # Substring match
        if norm1 in norm2 or norm2 in norm1:
            return 0.9

        # Token overlap (Jaccard)
        tokens1 = set(norm1.split())
        tokens2 = set(norm2.split())
        if tokens1 and tokens2:
            jaccard = len(tokens1 & tokens2) / len(tokens1 | tokens2)
        else:
            jaccard = 0.0

        # Sequence similarity (difflib)
        seq_sim = difflib.SequenceMatcher(None, norm1, norm2).ratio()

        # Combined score
        return 0.6 * jaccard + 0.4 * seq_sim

    def _compute_similarity(self, term1: str, term2: str) -> float:
        """Compute string similarity using multiple methods.

        Both terms are lowercased and normalized first. The score is 1.0 for
        identical strings, 0.9 when one contains the other, otherwise
        ``0.6 * Jaccard(token sets) + 0.4 * difflib ratio``. If either term
        normalizes to an empty string the score is 0.0.
        """
        return self._score_normalized(
            normalize_text(term1.lower()),
            normalize_text(term2.lower()),
        )

    def predict_one(self, term: str) -> Tuple[int, float]:
        """Predict cluster for a single term.

        Returns:
            ``(cluster, similarity)``. ``cluster`` is ``-1`` when the best
            similarity is below ``similarity_threshold``. Ties keep the training
            term that was indexed first.
        """
        term_lower = term.lower()

        # Check for exact match
        if term_lower in self.term_to_cluster:
            return self.term_to_cluster[term_lower], 1.0

        # Normalize the query once instead of once per training term.
        norm_query = normalize_text(term_lower)

        # Find most similar term
        best_similarity = 0.0
        best_cluster = -1

        for train_term, cluster in self.term_to_cluster.items():
            # Reuse the normalization cached by fit(); recompute only if absent.
            norm_train = self.normalized_terms.get(train_term)
            if norm_train is None:
                norm_train = normalize_text(train_term.lower())
            sim = self._score_normalized(norm_query, norm_train)
            if sim > best_similarity:
                best_similarity = sim
                best_cluster = cluster

        # Check threshold
        if best_similarity < self.similarity_threshold:
            return -1, best_similarity

        return best_cluster, best_similarity

    def predict(self, terms: List[str]) -> Tuple[List[int], List[float]]:
        """Predict clusters for multiple terms.

        Returns:
            Two lists aligned with ``terms``: predicted clusters and similarities.
        """
        predictions = []
        similarities = []

        for term in tqdm(terms, desc="Predicting clusters"):
            cluster, sim = self.predict_one(term)
            predictions.append(cluster)
            similarities.append(sim)

        return predictions, similarities

    def save(self, path: str):
        """Save model to disk as a pickle, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        model_data = {
            'similarity_threshold': self.similarity_threshold,
            'term_to_cluster': self.term_to_cluster,
            'normalized_terms': self.normalized_terms
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Load model from disk.

        Only load files you trust: unpickling can execute arbitrary code.
        """
        with open(path, 'rb') as f:
            # SECURITY: pickle.load must not be used on untrusted files.
            model_data = pickle.load(f)
        self.similarity_threshold = model_data['similarity_threshold']
        self.term_to_cluster = model_data['term_to_cluster']
        self.normalized_terms = model_data['normalized_terms']
        print(f"Model loaded from {path}")


# Smoke test: index three toy terms, then query a word that is not among them.
test_terms_list = ['rifiuti', 'carta', 'plastica']
test_clusters_list = [37, 74, 43]
test_model = NLTKBaselineClustering(similarity_threshold=0.5)
test_model.fit(test_terms_list, test_clusters_list)

pred_cluster, pred_sim = test_model.predict_one('spazzatura')
print("\n Baseline model works!!")
print(f"  'spazzatura' -> Cluster {pred_cluster} (similarity: {pred_sim:.3f})")

#pred_cluster, pred_sim = test_model.predict_one('immondizia')
#print(f"\n Baseline model works!!")
#print(f"  'immondizia' -> Cluster {pred_cluster} (similarity: {pred_sim:.3f})")

Building term index...


100%|██████████| 3/3 [00:00<?, ?it/s]

Built index with 3 terms

 Baseline model works!!
  'spazzatura' -> Cluster -1 (similarity: 0.107)





### Train and Evaluate Baseline Model

In [15]:
# Index the training split with the string-similarity baseline.
baseline_model = NLTKBaselineClustering(similarity_threshold=0.6)
baseline_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict a cluster (or -1) and a similarity score for every dev term,
# then store both next to the terms.
baseline_preds, baseline_sims = baseline_model.predict(dev_df['term'].tolist())
dev_df['baseline_cluster'], dev_df['baseline_similarity'] = baseline_preds, baseline_sims

# Break the predictions down by similarity band.
print("\nPrediction statistics:")
print(f"  Total terms: {len(baseline_preds)}")
print(f"  Terms with exact match: {sum(s >= 0.99 for s in baseline_sims)}")
print(f"  Terms with high similarity (≥0.8): {sum(0.8 <= s < 0.99 for s in baseline_sims)}")
print(f"  Terms with medium similarity (0.6-0.8): {sum(0.6 <= s < 0.8 for s in baseline_sims)}")
print(f"  Terms below threshold (unknown): {baseline_preds.count(-1)}")

Building term index...


100%|██████████| 713/713 [00:00<00:00, 356738.49it/s]
100%|██████████| 713/713 [00:00<00:00, 356738.49it/s]


Built index with 713 terms


Predicting clusters: 100%|██████████| 242/242 [00:02<00:00, 97.55it/s] 


Prediction statistics:
  Total terms: 242
  Terms with exact match: 140
  Terms with high similarity (≥0.8): 78
  Terms with medium similarity (0.6-0.8): 1
  Terms below threshold (unknown): 23





## Save Baseline Model and Predictions

In [19]:
# Save baseline model (save() creates the parent directory of the pickle itself).
baseline_model.save('models/nltk_clustering_baseline.pkl')

# Save predictions.
# Only the term and its predicted cluster are written: load_data() in the
# evaluation cell unpacks each CSV row into exactly (term, cluster).
output_path_baseline = "predictions/subtask_b_dev_nltk_baseline_preds.csv"
# to_csv does not create missing directories, so a fresh checkout without a
# 'predictions/' folder would fail here; create it first.
os.makedirs(os.path.dirname(output_path_baseline), exist_ok=True)
dev_df[['term', 'baseline_cluster']].to_csv(output_path_baseline, index=False)
print(f"\nBaseline predictions saved to {output_path_baseline}")

Model saved to models/nltk_clustering_baseline.pkl

Baseline predictions saved to predictions/subtask_b_dev_nltk_baseline_preds.csv


### Evaluate Baseline Model

In [17]:
import numpy as np
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads data from a CSV or JSON file and returns a dictionary
  where keys are terms and values are cluster_ids.

  For CSV files the first column is read as the term and the second column as
  the cluster id (converted to int); the header names are not inspected and any
  further columns (e.g. a similarity score) are ignored. For JSON files the
  records are read from the top-level "data" list, using each record's "term"
  and "cluster" fields as-is.

  Args:
    file_path: The path to the input file (CSV or JSON).

  Returns:
    A dictionary containing the loaded data.

  Raises:
    ValueError: If the file format is not supported, or a CSV file has fewer
      than two columns.
  """
  if file_path.endswith('.csv'):
    # Load data from CSV file
    df = pd.read_csv(file_path)
    if df.shape[1] < 2:
      raise ValueError("CSV file must contain at least two columns: term and cluster.")
    # Select the columns by position so that prediction files with extra
    # columns still load instead of failing on tuple unpacking.
    terms = df.iloc[:, 0]
    clusters = df.iloc[:, 1]
    data = {term: int(cluster) for term, cluster in zip(terms, clusters)}
  elif file_path.endswith('.json'):
    # Load data from JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    # Extract the term -> cluster pairs from the JSON records
    data = {item["term"]: item["cluster"] for item in json_data["data"]}
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

class BCubed_calculator:
  """Per-item BCubed precision and recall for a gold and a predicted clustering.

  ``gold`` and ``pred`` map each item to a cluster id. The constructor also
  builds the reverse indexes ``gold_cluster`` and ``pred_cluster`` from cluster
  id to the set of items carrying that id.
  """

  def __init__(self, gold, pred):
    self.gold = gold
    self.pred = pred
    self.gold_cluster = defaultdict(set)
    self.pred_cluster = defaultdict(set)
    # Fill both reverse indexes: gold first, then predictions.
    for assignment, members_by_cluster in ((gold, self.gold_cluster), (pred, self.pred_cluster)):
      for item, clus_id in assignment.items():
        members_by_cluster[clus_id].add(item)

  def bc_precision_item(self, item):
    """Fraction of the item's predicted cluster that shares the item's gold cluster."""
    predicted_members = self.pred_cluster[self.pred[item]]
    # An item without a gold label is looked up under None, i.e. an empty gold cluster.
    gold_members = self.gold_cluster[self.gold.get(item, None)]
    shared = len(predicted_members & gold_members)
    # TP + FP is the size of the predicted cluster.
    return shared / len(predicted_members)

  def bc_recall_item(self, item):
    """Fraction of the item's gold cluster that shares the item's predicted cluster."""
    # An item without a prediction is looked up under None, i.e. an empty predicted cluster.
    predicted_members = self.pred_cluster[self.pred.get(item, None)]
    gold_members = self.gold_cluster[self.gold.get(item)]
    shared = len(predicted_members & gold_members)
    # TP + FN is the size of the gold cluster.
    return shared / len(gold_members)

def bcubed_precision(gold, pred):
  """Mean per-item BCubed precision, averaged over the items in ``pred``."""
  calc = BCubed_calculator(gold, pred)
  item_scores = []
  for item in calc.pred:
    item_scores.append(calc.bc_precision_item(item))
  return np.average(item_scores)

def bcubed_recall(gold, pred):
  """Mean per-item BCubed recall, averaged over the items in ``gold``."""
  calc = BCubed_calculator(gold, pred)
  item_scores = []
  for item in calc.gold:
    item_scores.append(calc.bc_recall_item(item))
  return np.average(item_scores)

def bcubed_f1(gold, pred):
  """Harmonic mean of BCubed precision and recall.

  Args:
    gold: Mapping from item to gold cluster id.
    pred: Mapping from item to predicted cluster id.

  Returns:
    ``2 * P * R / (P + R)``, or 0.0 when precision and recall are both zero.
  """
  # Compute each metric once; the one-line form evaluated them twice each.
  precision = bcubed_precision(gold, pred)
  recall = bcubed_recall(gold, pred)
  denominator = precision + recall
  if denominator == 0:
    # No overlap at all: define F1 as 0 instead of dividing by zero.
    return 0.0
  return 2 * precision * recall / denominator

In [20]:
# Score the saved baseline predictions against the dev gold labels.
preds = load_data("predictions/subtask_b_dev_nltk_baseline_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
eval_precision = bcubed_precision(gold, preds)
eval_recall = bcubed_recall(gold, preds)
eval_f1 = bcubed_f1(gold, preds)
print(f"BCubed Precision: {eval_precision:.4f}")
print(f"BCubed Recall: {eval_recall:.4f}")
print(f"BCubed F1: {eval_f1:.4f}")

BCubed Precision: 0.6341
BCubed Recall: 0.8607
BCubed F1: 0.7302


## TF-IDF Based Clustering

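Uses TF-IDF vectors over character n-grams (2–4 characters) to measure surface-form similarity between terms. This captures shared substrings and spelling variants; it does not model meaning.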

In [22]:
class NLTKTfidfClustering:
    """TF-IDF based clustering.

    Training terms are vectorized with character n-gram TF-IDF. A query term
    receives the cluster of the training term with the highest cosine
    similarity, or ``-1`` when that similarity is below ``similarity_threshold``.
    """

    def __init__(self, similarity_threshold=0.3):
        # Minimum cosine similarity required to accept a cluster assignment.
        self.similarity_threshold = similarity_threshold
        # Fitted TfidfVectorizer; None until fit() or load() has run.
        self.vectorizer = None
        # TF-IDF matrix with one row per entry of train_terms.
        self.term_vectors = None
        # Lowercased training term -> cluster id.
        self.term_to_cluster = {}
        # Lowercased training terms, in the row order of term_vectors.
        self.train_terms = []

    def fit(self, terms: List[str], clusters: List[int]):
        """Build TF-IDF vectors for training terms.

        Any state from a previous ``fit`` is replaced.

        Args:
            terms: Training terms.
            clusters: Cluster id of each term, aligned with ``terms``.
        """
        print("Building TF-IDF vectors...")

        # Normalize and store terms
        self.train_terms = [t.lower() for t in terms]
        normalized_terms = [normalize_text(t) for t in self.train_terms]

        # Build the term-cluster mapping from scratch. Updating the old dict would
        # leave entries from an earlier fit behind, and the exact-match shortcut
        # in predict_one would keep answering from them.
        self.term_to_cluster = dict(zip(self.train_terms, clusters))

        # Create TF-IDF vectors
        # Use character n-grams to capture subword information
        self.vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 4),
            lowercase=True,
            max_features=5000
        )

        self.term_vectors = self.vectorizer.fit_transform(normalized_terms)

        print(f"Built TF-IDF index with {len(self.train_terms)} terms")
        print(f"Vector shape: {self.term_vectors.shape}")

    def predict_one(self, term: str) -> Tuple[int, float]:
        """Predict cluster for a single term.

        Returns:
            ``(cluster, similarity)``. ``cluster`` is ``-1`` when the best
            similarity is below ``similarity_threshold``.

        Raises:
            RuntimeError: If the term is not an exact match and the model has
                not been fitted or loaded yet.
        """
        term_lower = term.lower()

        # Check for exact match
        if term_lower in self.term_to_cluster:
            return self.term_to_cluster[term_lower], 1.0

        if self.vectorizer is None or self.term_vectors is None:
            raise RuntimeError("Model is not fitted: call fit() or load() before predicting.")

        # Compute TF-IDF vector for query term
        norm_term = normalize_text(term_lower)
        query_vec = self.vectorizer.transform([norm_term])

        # Compute cosine similarity with all training terms
        similarities = cosine_similarity(query_vec, self.term_vectors)[0]

        # Find most similar term. Convert the numpy scalars to plain Python
        # values so both return paths yield the same types.
        best_idx = int(np.argmax(similarities))
        best_similarity = float(similarities[best_idx])

        # Check threshold
        if best_similarity < self.similarity_threshold:
            return -1, best_similarity

        best_term = self.train_terms[best_idx]
        best_cluster = self.term_to_cluster[best_term]

        return best_cluster, best_similarity

    def predict(self, terms: List[str]) -> Tuple[List[int], List[float]]:
        """Predict clusters for multiple terms.

        Returns:
            Two lists aligned with ``terms``: predicted clusters and similarities.
        """
        predictions = []
        similarities = []

        for term in tqdm(terms, desc="Predicting clusters"):
            cluster, sim = self.predict_one(term)
            predictions.append(cluster)
            similarities.append(sim)

        return predictions, similarities

    def save(self, path: str):
        """Save model to disk as a pickle, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        model_data = {
            'similarity_threshold': self.similarity_threshold,
            'vectorizer': self.vectorizer,
            'term_vectors': self.term_vectors,
            'term_to_cluster': self.term_to_cluster,
            'train_terms': self.train_terms
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Load model from disk.

        Only load files you trust: unpickling can execute arbitrary code.
        """
        with open(path, 'rb') as f:
            # SECURITY: pickle.load must not be used on untrusted files.
            model_data = pickle.load(f)
        self.similarity_threshold = model_data['similarity_threshold']
        self.vectorizer = model_data['vectorizer']
        self.term_vectors = model_data['term_vectors']
        self.term_to_cluster = model_data['term_to_cluster']
        self.train_terms = model_data['train_terms']
        print(f"Model loaded from {path}")


# Smoke test: fit on three toy terms, then query a word that is not among them.
test_tfidf = NLTKTfidfClustering(similarity_threshold=0.1)
test_tfidf.fit(['rifiuti', 'carta', 'plastica'], [37, 74, 43])

pred_cluster, pred_sim = test_tfidf.predict_one('spazzatura')
print("\n TF-IDF model works!!")
print(f"  'spazzatura' -> Cluster {pred_cluster} (similarity: {pred_sim:.3f})")

Building TF-IDF vectors...
Built TF-IDF index with 3 terms
Vector shape: (3, 40)

 TF-IDF model works!!
  'spazzatura' -> Cluster -1 (similarity: 0.000)


### Train and Evaluate TF-IDF Model

In [23]:
# Vectorize the training split with the character n-gram TF-IDF model.
tfidf_model = NLTKTfidfClustering(similarity_threshold=0.3)
tfidf_model.fit(train_df['term'].tolist(), train_df['cluster'].tolist())

# Predict a cluster (or -1) and a similarity score for every dev term,
# then store both next to the terms.
tfidf_preds, tfidf_sims = tfidf_model.predict(dev_df['term'].tolist())
dev_df['tfidf_cluster'], dev_df['tfidf_similarity'] = tfidf_preds, tfidf_sims

# Break the predictions down by similarity band.
print("\nPrediction statistics:")
print(f"  Total terms: {len(tfidf_preds)}")
print(f"  Terms with exact match: {sum(s >= 0.99 for s in tfidf_sims)}")
print(f"  Terms with high similarity (≥0.6): {sum(0.6 <= s < 0.99 for s in tfidf_sims)}")
print(f"  Terms with medium similarity (0.3-0.6): {sum(0.3 <= s < 0.6 for s in tfidf_sims)}")
print(f"  Terms below threshold (unknown): {tfidf_preds.count(-1)}")

Building TF-IDF vectors...
Built TF-IDF index with 713 terms
Vector shape: (713, 4146)


Predicting clusters: 100%|██████████| 242/242 [00:00<00:00, 2024.76it/s]


Prediction statistics:
  Total terms: 242
  Terms with exact match: 140
  Terms with high similarity (≥0.6): 58
  Terms with medium similarity (0.3-0.6): 30
  Terms below threshold (unknown): 14





## Save TF-IDF Model and Predictions

In [None]:
# Save TF-IDF model (save() creates the parent directory of the pickle itself).
tfidf_model.save('models/nltk_clustering_tfidf.pkl')

# Save predictions.
# Only the term and its predicted cluster are written: load_data() in the
# evaluation cell unpacks each CSV row into exactly (term, cluster).
output_path_tfidf = "predictions/subtask_b_dev_nltk_tfidf_preds.csv"
# to_csv does not create missing directories, so a fresh checkout without a
# 'predictions/' folder would fail here; create it first.
os.makedirs(os.path.dirname(output_path_tfidf), exist_ok=True)
dev_df[['term', 'tfidf_cluster']].to_csv(output_path_tfidf, index=False)
print(f"TF-IDF predictions saved to {output_path_tfidf}")

Model saved to models/nltk_clustering_baseline.pkl
Model saved to models/nltk_clustering_tfidf.pkl

Baseline predictions saved to predictions/subtask_b_dev_nltk_baseline_preds.csv
TF-IDF predictions saved to predictions/subtask_b_dev_nltk_tfidf_preds.csv


### Evaluate TF-IDF Model

In [25]:
# Score the saved TF-IDF predictions against the dev gold labels.
preds = load_data("predictions/subtask_b_dev_nltk_tfidf_preds.csv")
gold = load_data("../data/subtask_b_dev.csv")
eval_precision = bcubed_precision(gold, preds)
eval_recall = bcubed_recall(gold, preds)
eval_f1 = bcubed_f1(gold, preds)
print(f"BCubed Precision: {eval_precision:.4f}")
print(f"BCubed Recall: {eval_recall:.4f}")
print(f"BCubed F1: {eval_f1:.4f}")

BCubed Precision: 0.8002
BCubed Recall: 0.9406
BCubed F1: 0.8647


## Model Comparison

In [26]:
# Reload the gold labels and both prediction files as {term: cluster} dicts.
# Note: baseline_preds / tfidf_preds are rebound here from lists to dicts.
gold = load_data("../data/subtask_b_dev.csv")
baseline_preds = load_data("predictions/subtask_b_dev_nltk_baseline_preds.csv")
tfidf_preds = load_data("predictions/subtask_b_dev_nltk_tfidf_preds.csv")

# BCubed precision, recall and F1 for the baseline model.
baseline_precision, baseline_recall, baseline_f1 = (
    metric(gold, baseline_preds)
    for metric in (bcubed_precision, bcubed_recall, bcubed_f1)
)

# The same three metrics for the TF-IDF model.
tfidf_precision, tfidf_recall, tfidf_f1 = (
    metric(gold, tfidf_preds)
    for metric in (bcubed_precision, bcubed_recall, bcubed_f1)
)

# One row per model, one column per metric.
comparison_df = pd.DataFrame({
    'Model': ['Baseline (Similarity)', 'TF-IDF Similarity'],
    'BCubed Precision': [baseline_precision, tfidf_precision],
    'BCubed Recall': [baseline_recall, tfidf_recall],
    'BCubed F1': [baseline_f1, tfidf_f1],
})

print("\n" + "="*80)
print("MODEL COMPARISON - BCubed Metrics")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

# Relative change of TF-IDF over the baseline, in percent.
# A baseline score of 0 would make the ratio undefined, so 0 is reported instead.
if baseline_precision > 0:
    precision_improvement = (tfidf_precision - baseline_precision) / baseline_precision * 100
else:
    precision_improvement = 0

if baseline_recall > 0:
    recall_improvement = (tfidf_recall - baseline_recall) / baseline_recall * 100
else:
    recall_improvement = 0

if baseline_f1 > 0:
    f1_improvement = (tfidf_f1 - baseline_f1) / baseline_f1 * 100
else:
    f1_improvement = 0

print("\nTF-IDF vs Baseline:")
print(f"  BCubed Precision improvement: {precision_improvement:+.1f}%")
print(f"  BCubed Recall improvement: {recall_improvement:+.1f}%")
print(f"  BCubed F1 improvement: {f1_improvement:+.1f}%")


MODEL COMPARISON - BCubed Metrics
                Model  BCubed Precision  BCubed Recall  BCubed F1
Baseline (Similarity)          0.634102       0.860685   0.730220
    TF-IDF Similarity          0.800207       0.940575   0.864731

TF-IDF vs Baseline:
  BCubed Precision improvement: +26.2%
  BCubed Recall improvement: +9.3%
  BCubed F1 improvement: +18.4%
