In [1]:
# Mount Google Drive at /content/drive; a later cell changes into a folder
# under this mount point before reading the prediction files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# IPython `cd` magic: make this folder the working directory, so the relative
# file names opened in later cells resolve against it.
cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [33]:
# Both prediction files hold one example per line as three
# whitespace-separated fields: (text, pred label, gold label).

gold = []
bert = []

# The context manager closes the file handle even if parsing raises.
with open("test_bert_prediction.txt", "r") as f:
    for line in f:
        fields = line.split()
        # Only lines with exactly three fields are kept; blank lines (and any
        # line whose text itself contains whitespace) are skipped.
        if len(fields) == 3:
            gold.append(fields[2])
            bert.append(fields[1])

In [34]:
# BiLSTM predictions: same three-field line format as the BERT file; only the
# predicted label (second field) is collected here.
lstm = []

# The context manager closes the file handle even if parsing raises.
with open("test_bilstm_prediction.txt", "r") as f:
    for line in f:
        fields = line.split()
        # Only lines with exactly three fields are kept, mirroring the filter
        # applied to the BERT file so the two label lists stay comparable.
        if len(fields) == 3:
            lstm.append(fields[1])

In [36]:
import numpy as np

# Seed the shared generator so a fresh "Restart & Run All" reproduces the same
# bootstrap statistics; change SEED to draw a different resampling.
SEED = 0
rng = np.random.default_rng(SEED)


def eval_with_paired_bootstrap(gold, sys1, sys2, num_samples=10000, sample_ratio=0.5, seed=None):
    """Evaluate with paired bootstrap

    This compares two systems, performing a significance test with
    paired bootstrap resampling to compare the accuracy of the two systems.
    On every iteration one set of indices is drawn with replacement and both
    systems are scored on that same set, so the comparison is paired.

    Parameters
    ----------
    gold
      The correct labels
    sys1
      The output of system 1
    sys2
      The output of system 2
    num_samples
      The number of bootstrap samples to take (must be at least 1)
    sample_ratio
      The ratio of samples to take every time; ``int(len(gold) * sample_ratio)``
      items are drawn per bootstrap sample and must be at least 1
    seed
      Optional seed for a reproducible run. Anything accepted by
      ``np.random.default_rng`` works, including an existing ``Generator``.
      When ``None`` (the default) the notebook-level ``rng`` is used.

    Returns
    -------
    None
      The win ratios, p value and per-system mean / median / 95% confidence
      interval are printed rather than returned.

    Raises
    ------
    AssertionError
      If ``sys1`` or ``sys2`` does not have the same length as ``gold``.
    ValueError
      If ``num_samples`` is below 1 or the per-sample subset would be empty.

    """

    assert len(gold) == len(sys1)
    assert len(gold) == len(sys2)

    n = len(gold)
    subset_size = int(n * sample_ratio)

    # Fail early with a clear message instead of a ZeroDivisionError (no
    # samples) or NaN statistics (empty subsets) further down.
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1, got %r" % (num_samples,))
    if subset_size < 1:
        raise ValueError(
            "int(len(gold) * sample_ratio) must be at least 1, got %d "
            "(len(gold)=%d, sample_ratio=%r)" % (subset_size, n, sample_ratio)
        )

    # The global generator is only looked up when no seed is supplied.
    generator = rng if seed is None else np.random.default_rng(seed)

    # Per-item correctness does not depend on the resample, so compute it once
    # and only index into it inside the loop.
    gold = np.array(gold)
    sys1_correct = np.array(sys1) == gold
    sys2_correct = np.array(sys2) == gold

    sys1_scores = []
    sys2_scores = []
    wins = [0, 0, 0]  # [sys1 better, sys2 better, tie]

    for _ in range(num_samples):
        # Subsample the gold and system outputs (same indices for both systems)
        subset_idxs = generator.choice(n, subset_size, replace=True)
        sys1_score = sys1_correct[subset_idxs].mean()
        sys2_score = sys2_correct[subset_idxs].mean()

        if sys1_score > sys2_score:
            wins[0] += 1
        elif sys1_score < sys2_score:
            wins[1] += 1
        else:
            wins[2] += 1

        sys1_scores.append(sys1_score)
        sys2_scores.append(sys2_score)

    # Print win stats
    wins = [x / float(num_samples) for x in wins]
    print("Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f" % (wins[0], wins[1], wins[2]))
    if wins[0] > wins[1]:
        print("(sys1 is superior with p value p=%.3f)\n" % (1 - wins[0]))
    elif wins[1] > wins[0]:
        print("(sys2 is superior with p value p=%.3f)\n" % (1 - wins[1]))

    # Print system stats; the confidence bounds are read off the sorted scores
    # at the 2.5% and 97.5% positions.
    lower_idx = int(num_samples * 0.025)
    upper_idx = int(num_samples * 0.975)
    for name, scores in (("sys1", sys1_scores), ("sys2", sys2_scores)):
        scores.sort()
        print(
            "%s mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]"
            % (
                name,
                np.mean(scores),
                np.median(scores),
                scores[lower_idx],
                scores[upper_idx],
            )
        )

In [38]:
# Paired bootstrap comparison: BERT predictions are sys1, BiLSTM predictions are sys2.
eval_with_paired_bootstrap(gold=gold, sys1=bert, sys2=lstm)

Win ratio: sys1=1.000, sys2=0.000, tie=0.000
(sys1 is superior with p value p=0.000)

sys1 mean=0.996, median=0.996, 95% confidence interval=[0.994, 0.997]
sys2 mean=0.954, median=0.954, 95% confidence interval=[0.950, 0.959]
