# MockConf statistics

In [1]:
import sys
sys.path.append("../scripts/")

import os
import json
import tools
import evaluation

from constants import DEVSET_FILE_NAMES, TESTSET_FILE_NAMES, DOUBLE_ANNOTATION_FILE_NAME

# Loading Data

In [2]:
# Function to load alignments and split them into dev and test sets
def load_and_split_alignments(
    description: str, path: str, test_files: list[str], dev_files: list[str]
) -> tuple[dict, dict]:
    """Read every alignment file under ``path`` and split it into dev and test parts.

    A progress line naming ``description`` and ``path`` is printed first. The
    files are read with ``tools.read_alignment_files`` and partitioned with
    ``tools.make_split``.

    Note the ordering: the file-name lists are taken test-first
    (``test_files``, ``dev_files``), while the result is returned dev-first.

    Args:
        description: Human-readable name of the data, used only in the log line.
        path: Directory handed to ``tools.read_alignment_files``.
        test_files: File names forwarded to ``tools.make_split`` as its first argument.
        dev_files: File names forwarded to ``tools.make_split`` as its second argument.

    Returns:
        A ``(devset, testset)`` tuple, in the order produced by ``tools.make_split``.
    """
    print(f"→ Loading {description} from {path}")
    all_alignments = tools.read_alignment_files(path)
    dev_part, test_part = tools.make_split(test_files, dev_files, all_alignments)
    return dev_part, test_part


# --- Reference alignments (one annotation per file) ---
reference_alignments_devset, reference_alignments_testset = load_and_split_alignments(
    description="reference data",
    path="../data/alignments/one-annotation/",
    test_files=TESTSET_FILE_NAMES,
    dev_files=DEVSET_FILE_NAMES,
)

# --- Random baselines, created from the reference alignments (test set first) ---
print("→ Creating random baseline alignments")
random_alignments_testset = tools.create_random_baseline(reference_alignments_testset)
random_alignments_devset = tools.create_random_baseline(reference_alignments_devset)

# --- BERTAlign on full sentences ---
bertalign_alignments_devset, bertalign_alignments_testset = load_and_split_alignments(
    description="BERTAlign sentence alignment",
    path="../outputs/bertalign-sentence-alignment/",
    test_files=TESTSET_FILE_NAMES,
    dev_files=DEVSET_FILE_NAMES,
)

# --- BERTAlign on subsegments ---
(
    bertalign_subsegments_alignments_devset,
    bertalign_subsegments_alignments_testset,
) = load_and_split_alignments(
    description="BERTAlign subsegments",
    path="../outputs/bertalign-sentence-alignment-subsegments/",
    test_files=TESTSET_FILE_NAMES,
    dev_files=DEVSET_FILE_NAMES,
)

# --- BERTAlign on subsegments, with labels ---
(
    bertalign_alignments_subsegments_labels_devset,
    bertalign_alignments_subsegments_labels_testset,
) = load_and_split_alignments(
    description="BERTAlign subsegments + labels",
    path="../outputs/bertalign-sentence-alignment-subsegments-labels-200/",
    test_files=TESTSET_FILE_NAMES,
    dev_files=DEVSET_FILE_NAMES,
)


→ Loading reference data from ../data/alignments/one-annotation/
→ Creating random baseline alignments
→ Loading BERTAlign sentence alignment from ../outputs/bertalign-sentence-alignment/
→ Loading BERTAlign subsegments from ../outputs/bertalign-sentence-alignment-subsegments/
→ Loading BERTAlign subsegments + labels from ../outputs/bertalign-sentence-alignment-subsegments-labels-200/


# Final Evaluation

In [3]:
def get_single_alignment(alignment_dict: dict, file_name: str) -> dict:
    """Return a one-entry dict holding only ``file_name``'s alignment.

    The value is the same object stored in ``alignment_dict`` (no copy is made).

    Raises:
        KeyError: If ``file_name`` is not a key of ``alignment_dict``.
    """
    selected = alignment_dict[file_name]
    return {file_name: selected}


# Read both annotators' alignments from the double-annotation directories
annotation_1_all = tools.read_alignment_files(
    "../data/alignments/double-annotation/annotator-1/"
)
annotation_2_all = tools.read_alignment_files(
    "../data/alignments/double-annotation/annotator-2/"
)

# Keep only the doubly annotated file from each annotator
annotation_1 = get_single_alignment(
    alignment_dict=annotation_1_all, file_name=DOUBLE_ANNOTATION_FILE_NAME
)
annotation_2 = get_single_alignment(
    alignment_dict=annotation_2_all, file_name=DOUBLE_ANNOTATION_FILE_NAME
)

# System outputs for that same file, looked up in the dev-set dictionaries
random_alignment = get_single_alignment(
    alignment_dict=random_alignments_devset, file_name=DOUBLE_ANNOTATION_FILE_NAME
)
bertalign_alignment = get_single_alignment(
    alignment_dict=bertalign_alignments_devset, file_name=DOUBLE_ANNOTATION_FILE_NAME
)
bertalign_subsegments_alignment = get_single_alignment(
    alignment_dict=bertalign_subsegments_alignments_devset,
    file_name=DOUBLE_ANNOTATION_FILE_NAME,
)
bertalign_alignments_subsegments_labels_alignment = get_single_alignment(
    alignment_dict=bertalign_alignments_subsegments_labels_devset,
    file_name=DOUBLE_ANNOTATION_FILE_NAME,
)


In [4]:
import numpy as np

def append_segmentation_rows(table, pairs):
    """Append segmentation-boundary scores to each row of ``table``.

    For the i-th ``(hypothesis, reference, name)`` triple in ``pairs``,
    ``evaluation.evaluate_segmentation_boundaries`` is called with
    ``aggregate=True``. Its result is unpacked into seven values; the first
    and last are discarded and the middle five (bound here as precision,
    recall, F1, ``wd`` and ``pk``) are appended to ``table[i]``.

    The rows are modified in place and ``table`` is returned for convenience.
    """
    for row_idx, (hypothesis, reference, _name) in enumerate(pairs):
        scores = evaluation.evaluate_segmentation_boundaries(
            hypothesis, reference, aggregate=True
        )
        # Exactly seven values are expected; only the middle five are reported.
        _first, precision, recall, f1, wd, pk, _last = scores
        table[row_idx].extend((precision, recall, f1, wd, pk))
    return table

def append_span_alignment_rows(table, pairs, input_path="../outputs/word-align-baseline"):
    """Append span-alignment scores to each row of ``table``.

    For the i-th ``(hypothesis, reference, system_name)`` triple in ``pairs``
    the following are appended to ``table[i]``, in this order:

    1. every value except the last one returned by
       ``evaluation.evaluate_segmentation_word_pairs``;
    2. ``evaluation.evaluate_exact_match`` with ``with_labels=True``;
    3. ``evaluation.evaluate_exact_match`` with ``with_labels=False``;
    4. all values returned by ``evaluation.get_word_alignment_eval``.

    Args:
        table: List of row lists; ``table[i]`` is extended in place.
        pairs: Iterable of ``(hypothesis, reference, system_name)`` triples.
        input_path: Directory forwarded to ``evaluation.get_word_alignment_eval``.
            Defaults to the location that was previously hard-coded, so existing
            two-argument calls behave exactly as before.

    Returns:
        The same ``table`` object.
    """
    # Rows whose display name contains this LaTeX macro are flagged as the random baseline.
    baseline_marker = r"\BaselineName{}"

    for row_idx, (hyp, ref, system_name) in enumerate(pairs):
        row = table[row_idx]
        row.extend(evaluation.evaluate_segmentation_word_pairs(hyp, ref)[:-1])
        row.append(evaluation.evaluate_exact_match(hyp, ref, with_labels=True))
        row.append(evaluation.evaluate_exact_match(hyp, ref, with_labels=False))
        is_random = baseline_marker in system_name
        row.extend(
            evaluation.get_word_alignment_eval(hyp, ref, input_path, is_random=is_random)
        )
    return table

def append_label_match_rows(table, pairs):
    """Append span-label accuracy and F1 to each row of ``table``.

    For the i-th ``(hypothesis, reference, name)`` triple in ``pairs``,
    ``evaluation.evaluate_span_labels`` is called with ``aggregate=True``.
    Its result is unpacked into five values, of which the first (accuracy)
    and the fourth (F1) are appended to ``table[i]``.

    The rows are modified in place and ``table`` is returned for convenience.
    """
    for row_idx, (hypothesis, reference, _name) in enumerate(pairs):
        label_scores = evaluation.evaluate_span_labels(
            hypothesis, reference, aggregate=True
        )
        # Exactly five values are expected; positions 2, 3 and 5 are not reported.
        accuracy, _second, _third, f1, _fifth = label_scores
        table[row_idx].extend((accuracy, f1))
    return table

def append_seg_count(table, pairs):
    """Append the mean source and target segment counts to each row of ``table``.

    Only the first element of each triple in ``pairs`` is used. For every
    document in that alignment dict, ``tools.count_segments`` is applied to
    the document's ``"alignedPairs"`` entry, once for ``"source"`` and once
    for ``"target"``. The per-document counts are averaged with ``np.mean``
    and the two means (source first, then target) are appended to ``table[i]``.

    The rows are modified in place and ``table`` is returned for convenience.
    """
    for row_idx, (alignment, _reference, _name) in enumerate(pairs):
        counts_by_side = {
            side: [
                tools.count_segments(document["alignedPairs"], side)
                for document in alignment.values()
            ]
            for side in ("source", "target")
        }
        table[row_idx].extend(
            [np.mean(counts_by_side["source"]), np.mean(counts_by_side["target"])]
        )
    return table

def append_all_rows(table, pairs):
    """Fill ``table`` with every metric group, in column order.

    Runs the four ``append_*`` helpers one after another on the same
    ``table`` and ``pairs``: segmentation scores, span-alignment scores,
    label-match scores and finally segment counts. Each helper extends the
    rows in place, so the column order of the result follows this call order.

    Returns:
        The same ``table`` object.
    """
    stages = (
        append_segmentation_rows,
        append_span_alignment_rows,
        append_label_match_rows,
        append_seg_count,
    )
    for stage in stages:
        stage(table, pairs)
    return table

def format_table_row(index, row, system_name):
    """Render one result row as a line of the LaTeX results table.

    The line has four ``&``-separated parts: a leading cell, the system name,
    all but the last two values of ``row`` formatted with two decimals, and
    the last two values of ``row`` formatted with no decimals.

    The leading cell and the line ending depend on the row's position:

    * rows 0, 10 and 14 open a group and carry a rotated ``\\multirow`` label
      ("1 recording", "devset", "testset"); every other row has an empty
      leading cell;
    * rows 1, 3, 5, 7 and 8 end with a dashed rule, rows 9 and 13 with
      ``\\hline``, all others with a plain LaTeX line break.

    Args:
        index: Zero-based position of the row in the table.
        row: Sequence of numeric values for this row.
        system_name: Text placed in the second column.

    Returns:
        The formatted LaTeX row as a string.
    """
    group_labels = {
        0: r"\parbox[t]{2mm}{\multirow{8}{*}{\rotatebox[origin=c]{90}{1 recording}}}",
        10: r"\parbox[t]{2mm}{\multirow{4}{*}{\rotatebox[origin=c]{90}{devset}}}",
        14: r"\parbox[t]{2mm}{\multirow{4}{*}{\rotatebox[origin=c]{90}{testset}}}",
    }
    leading_cell = group_labels.get(index, "")

    if index in (9, 13):
        line_end = r"\\ \hline"
    elif index in (1, 3, 5, 7, 8):
        line_end = r"\\ \cdashline{2-18}"
    else:
        line_end = r"\\"

    metric_cells = [f"{value:.2f}" for value in row[:-2]]
    count_cells = [f"{value:.0f}" for value in row[-2:]]
    metrics_text = " & ".join(metric_cells)
    counts_text = " & ".join(count_cells)
    return f"{leading_cell} & {system_name} & {metrics_text} & {counts_text}{line_end}"

# Prepare evaluation input
# Each entry is (hypothesis alignments, reference alignments, LaTeX row label).
# NOTE: the order of this list matters. format_table_row() picks group labels
# and rule lines by row index: 0-9 is the single doubly annotated recording,
# 10-13 the dev set, 14-17 the test set. Adding, removing or reordering rows
# here requires updating the hard-coded indices in format_table_row().
pairs = [
    # Rows 0-7: each system on the single recording, scored against each annotator.
    (random_alignment, annotation_1, r"\BaselineName{}$_2$"),
    (random_alignment, annotation_2, r"\BaselineName{}$_3$"),
    (bertalign_alignment, annotation_1, r"\SystemName{}$_2$"),
    (bertalign_alignment, annotation_2, r"\SystemName{}$_3$"),
    (bertalign_subsegments_alignment, annotation_1, r"\SystemSubName{}$_2$"),
    (bertalign_subsegments_alignment, annotation_2, r"\SystemSubName{}$_3$"),
    (bertalign_alignments_subsegments_labels_alignment, annotation_1, r"\SystemLabName{}$_2$"),
    (bertalign_alignments_subsegments_labels_alignment, annotation_2, r"\SystemLabName{}$_3$"),
    # Rows 8-9: the two annotators scored against each other, in both directions.
    (annotation_2, annotation_1, "An3$_2$"),
    (annotation_1, annotation_2, "An2$_3$"),
    # Rows 10-13: each system on the dev set against the reference alignments.
    (random_alignments_devset, reference_alignments_devset, r"\BaselineName{}"),
    (bertalign_alignments_devset, reference_alignments_devset, r"\SystemName{}"),
    (bertalign_subsegments_alignments_devset, reference_alignments_devset, r"\SystemSubName{}"),
    (bertalign_alignments_subsegments_labels_devset, reference_alignments_devset, r"\SystemLabName{}"),
    # Rows 14-17: each system on the test set against the reference alignments.
    (random_alignments_testset, reference_alignments_testset, r"\BaselineName{}"),
    (bertalign_alignments_testset, reference_alignments_testset, r"\SystemName{}"),
    (bertalign_subsegments_alignments_testset, reference_alignments_testset, r"\SystemSubName{}"),
    (bertalign_alignments_subsegments_labels_testset, reference_alignments_testset, r"\SystemLabName{}"),
]

# Run all evaluations
# One fresh empty row per pair; append_all_rows() extends the rows in place,
# so rebuilding the table here keeps the cell safe to re-run.
table = [[] for _ in range(len(pairs))]
table = append_all_rows(table, pairs)

# Print formatted LaTeX table rows
# pairs[i][2] is the LaTeX label shown in the second column of row i.
for i, row in enumerate(table):
    print(format_table_row(i, row, pairs[i][2]))


\parbox[t]{2mm}{\multirow{8}{*}{\rotatebox[origin=c]{90}{1 recording}}} & \BaselineName{}$_2$ & 16.18 & 16.18 & 16.18 & 0.50 & 0.47 & 0.13 & 0.09 & 0.10 & 0.00 & 0.00 & 0.64 & 0.36 & 55.37 & 55.18 & 146 & 124\\
 & \BaselineName{}$_3$ & 22.79 & 17.51 & 19.81 & 0.52 & 0.47 & 0.11 & 0.08 & 0.09 & 0.00 & 0.00 & 0.63 & 0.36 & 38.74 & 31.28 & 146 & 124\\ \cdashline{2-18}
 & \SystemName{}$_2$ & 98.23 & 40.81 & 57.66 & 0.23 & 0.22 & 0.42 & 0.99 & 0.59 & 10.32 & 14.19 & 0.30 & 0.71 & 75.26 & 66.17 & 58 & 53\\
 & \SystemName{}$_3$ & 99.12 & 31.64 & 47.97 & 0.33 & 0.30 & 0.37 & 1.00 & 0.54 & 2.90 & 10.14 & 0.37 & 0.65 & 49.16 & 34.27 & 58 & 53\\ \cdashline{2-18}
 & \SystemSubName{}$_2$ & 85.80 & 53.31 & 65.76 & 0.21 & 0.20 & 0.52 & 0.79 & 0.63 & 15.48 & 18.06 & 0.35 & 0.65 & 75.26 & 66.17 & 89 & 78\\
 & \SystemSubName{}$_3$ & 83.43 & 39.83 & 53.92 & 0.32 & 0.29 & 0.46 & 0.80 & 0.58 & 4.35 & 11.59 & 0.41 & 0.60 & 49.16 & 34.27 & 89 & 78\\ \cdashline{2-18}
 & \SystemLabName{}$_2$ & 85.80 & 53.31 & 