In [1]:
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads data from a CSV or JSON file and returns a dictionary
  where keys are (doc_id, par_id, sent_id) tuples and values are
  lists of terms.

  CSV files must have exactly five columns; the first three form the key,
  the fourth is ignored and the fifth holds one term per row. Rows whose
  term is empty still register their key, so sentences without any term
  map to an empty list. JSON files must contain a top-level "data" list
  whose rows provide "document_id", "paragraph_id", "sentence_id" and
  "term_list".

  Args:
    file_path: The path to the input file (CSV or JSON). May be a string
      or a path-like object; the extension check is case-insensitive.

  Returns:
    A plain dictionary containing the loaded data.

  Raises:
    ValueError: If the file format is not supported.
  """
  path = str(file_path)  # accept pathlib.Path as well as plain strings
  lowered = path.lower()
  if lowered.endswith('.csv'):
    # Force the term column to be read as text. Without this, a column that
    # happens to contain only numbers is parsed as int/float and the
    # .strip() call below fails.
    term_column = pd.read_csv(path, nrows=0).columns[-1]
    df = pd.read_csv(path, dtype={term_column: str}).fillna('')
    data = {}
    # Iterate over rows and extract terms
    for doc_id, par_id, sent_id, _, term in df.itertuples(index=False):
      # setdefault registers the key even when the row carries no term
      terms = data.setdefault((doc_id, par_id, sent_id), [])
      term = term.strip()
      if term:
        terms.append(term)
  elif lowered.endswith('.json'):
    # Load data from JSON file
    with open(path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    # Extract terms from JSON data
    data = {(row["document_id"], row["paragraph_id"], row["sentence_id"]): row["term_list"]
            for row in json_data["data"]}
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

In [2]:
def micro_f1_score(gold_standard, system_output):
  """
  Scores a term extraction system with micro-averaged Precision, Recall
  and F1: matches are counted per item and pooled over all items before
  the ratios are taken.

  Items are paired positionally; pairing stops at the end of the shorter
  input. Repeated terms inside one item count once.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple (precision, recall, f1). A ratio whose denominator is zero is
    reported as 0.
  """
  hits = 0
  spurious = 0
  missed = 0

  for expected_terms, predicted_terms in zip(gold_standard, system_output):
    expected = set(expected_terms)
    predicted = set(predicted_terms)
    matched = len(expected & predicted)

    hits += matched
    # Whatever was predicted but not matched is a false positive ...
    spurious += len(predicted) - matched
    # ... and whatever was expected but not matched is a false negative.
    missed += len(expected) - matched

  predicted_total = hits + spurious
  expected_total = hits + missed

  if predicted_total > 0:
    precision = hits / predicted_total
  else:
    precision = 0

  if expected_total > 0:
    recall = hits / expected_total
  else:
    recall = 0

  if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
  else:
    f1 = 0

  return precision, recall, f1

In [3]:
def type_f1_score(gold_standard, system_output):
  """
  Scores a term extraction system at the type level: every distinct term
  counts once for the whole dataset, no matter how many items it occurs in
  or which item it was extracted from.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Type Precision, Type Recall, and Type F1 score.
    A ratio whose denominator is zero is reported as 0.
  """
  # Pool the distinct terms of each side over the entire dataset
  gold_vocab = {term for item in gold_standard for term in item}
  system_vocab = {term for item in system_output for term in item}

  shared = len(gold_vocab & system_vocab)         # found and correct
  extra = len(system_vocab.difference(gold_vocab))  # found but not in gold
  absent = len(gold_vocab.difference(system_vocab))  # in gold but not found

  if (shared + extra) > 0:
    type_precision = shared / (shared + extra)
  else:
    type_precision = 0

  if (shared + absent) > 0:
    type_recall = shared / (shared + absent)
  else:
    type_recall = 0

  if (type_precision + type_recall) > 0:
    type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall)
  else:
    type_f1 = 0

  return type_precision, type_recall, type_f1

In [4]:
# Hard-coded file paths of the system output and the gold standard to compare.
# They are not read from user input; edit these constants to score another run.
SYSTEM_OUTPUT_PATH = "../baseline/baseline_a_1.json"
GOLD_STANDARD_PATH = "../data/subtask_a_dev.json"

In [5]:
# Load data from the specified file paths
system_output_dict = load_data(SYSTEM_OUTPUT_PATH)
gold_standard_dict = load_data(GOLD_STANDARD_PATH)

# Pair gold and system term lists by their (doc_id, par_id, sent_id) key.
# The metric functions compare the two lists position by position, so taking
# dict.values() from each file independently would score the wrong sentences
# against each other whenever the files differ in order or coverage.
sentence_keys = list(gold_standard_dict)
gold_standard = [gold_standard_dict[key] for key in sentence_keys]
# A sentence the system produced nothing for counts as "no terms extracted"
system_output = [system_output_dict.get(key, []) for key in sentence_keys]

# Surface coverage mismatches instead of hiding them in the scores
missing_keys = [key for key in sentence_keys if key not in system_output_dict]
unexpected_keys = [key for key in system_output_dict if key not in gold_standard_dict]
if missing_keys or unexpected_keys:
  print(f"Warning: {len(missing_keys)} gold sentences have no system output; "
        f"{len(unexpected_keys)} system sentences are not in the gold standard and are ignored.")

# Calculate and print micro-averaged precision, recall, and F1 score
precision, recall, f1 = micro_f1_score(gold_standard, system_output)
print("micro-averaged Precision:", round(precision, 3))
print("micro-averaged Recall:", round(recall, 3))
print("micro-averaged F1 score:", round(f1, 3))

# Calculate and print Type Precision, Type Recall, and Type F1 score
type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)
print("\nType Precision:", round(type_precision, 3))
print("Type Recall:", round(type_recall, 3))
print("Type F1 score:", round(type_f1, 3))

micro-averaged Precision: 0.455
micro-averaged Recall: 0.623
micro-averaged F1 score: 0.526

Type Precision: 0.369
Type Recall: 0.607
Type F1 score: 0.459


In [7]:
import os
import glob

# Path to predictions directory and baseline
predictions_dir = "../src/predictions/"
baseline_file = "../baseline/baseline_a_1.json"
gold_standard_file = "../data/subtask_a_dev.json"

# Get all JSON files in the predictions directory; sorted because glob returns
# them in an OS-dependent order and the progress log should be reproducible
prediction_files = sorted(glob.glob(os.path.join(predictions_dir, "*.json")))

# Add baseline file to the list
prediction_files.append(baseline_file)

# Load gold standard once and fix the sentence order used for every system
gold_standard_dict = load_data(gold_standard_file)
sentence_keys = list(gold_standard_dict)
gold_standard = [gold_standard_dict[key] for key in sentence_keys]

# Columns of the results table, in display order
result_columns = ['file_name', 'micro_precision', 'micro_recall', 'micro_f1',
                  'type_precision', 'type_recall', 'type_f1']

# Store results
results = []

# Evaluate each prediction file
for pred_file in prediction_files:
    file_name = os.path.basename(pred_file)
    # Only dev-set prediction files (and the baseline) are scored
    if "subtask_a_dev_" not in file_name and pred_file != baseline_file:
        continue
    print(f"Evaluating {file_name}...")

    try:
        # Load system output and align it with the gold standard by sentence key.
        # The metric functions pair items positionally, so relying on each
        # file's own ordering would mis-score files that differ in order or
        # coverage; sentences the system skipped count as "no terms extracted".
        system_output_dict = load_data(pred_file)
        system_output = [system_output_dict.get(key, []) for key in sentence_keys]

        # Calculate micro-averaged metrics
        precision, recall, f1 = micro_f1_score(gold_standard, system_output)

        # Calculate type metrics
        type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)

        # Store results with cleaned file name
        clean_name = file_name.replace("subtask_a_dev_", "").replace("_preds", "").replace(".json", "")
        results.append({
            'file_name': clean_name,
            'micro_precision': round(precision, 3),
            'micro_recall': round(recall, 3),
            'micro_f1': round(f1, 3),
            'type_precision': round(type_precision, 3),
            'type_recall': round(type_recall, 3),
            'type_f1': round(type_f1, 3)
        })
    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the
        # whole comparison, but the failure is reported by name
        print(f"Error evaluating {file_name}: {e}")

# Create DataFrame and sort by micro F1 score (descending). Passing the column
# list keeps the sort valid even when no file could be evaluated.
results_df = pd.DataFrame(results, columns=result_columns)
results_df = results_df.sort_values('micro_f1', ascending=False).reset_index(drop=True)

# Display the table
print("\n" + "="*100)
print("EVALUATION RESULTS")
print("="*100)
print(results_df.to_string(index=False))

# Save to CSV
output_csv = "subtask_a_evaluation_results.csv"
results_df.to_csv(output_csv, index=False)
print(f"\nResults saved to {output_csv}")

Evaluating subtask_a_dev_bert_token_classification_preds.json...
Evaluating subtask_a_dev_dspy_baseline_preds.json...
Evaluating subtask_a_dev_dspy_optimized_preds.json...
Evaluating subtask_a_dev_gemma3_4B_finetuned_preds.json...
Evaluating subtask_a_dev_llm_zero_shot_preds.json...
Evaluating subtask_a_dev_nltk_baseline_preds.json...
Evaluating subtask_a_dev_nltk_trained_preds.json...
Evaluating subtask_a_dev_spacy_baseline_preds.json...
Evaluating subtask_a_dev_spacy_trained_preds.json...
Evaluating subtask_a_dev_vanilla_preds.json...
Evaluating baseline_a_1.json...

EVALUATION RESULTS
                file_name  micro_precision  micro_recall  micro_f1  type_precision  type_recall  type_f1
bert_token_classification            0.655         0.652     0.653           0.566        0.566    0.566
      gemma3_4B_finetuned            0.616         0.599     0.607           0.525        0.554    0.539
                  vanilla            0.428         0.738     0.542           0.657        