In [1]:
from test_eval import *
from tqdm import tqdm

In [2]:
def load_idxs(path='ids.txt'):
    """Load lists of integer indices from a tab-separated text file.

    Every non-blank line of the file is parsed into one list of ints.
    Blank (or whitespace-only) lines, such as a trailing empty line left
    by an editor, are skipped instead of raising ``ValueError``.

    Args:
        path: File to read. Each line holds integers separated by tabs.

    Returns:
        A list containing one ``list[int]`` per non-blank line, in file order.

    Raises:
        ValueError: If a field on a non-blank line is not an integer.
    """
    tests = []
    with open(path, "r") as f:
        for line in f:
            # int() tolerates the trailing newline on the last field, but it
            # cannot parse a completely empty line, so filter those out here.
            if not line.strip():
                continue
            tests.append([int(field) for field in line.split("\t")])
    return tests

def gen_random_idxs(doc_ids, times=1000, dump_to='ids.txt', seed=None):
    """Write resampled-with-replacement index lists to a text file.

    Each output line is one resample: ``len(doc_ids)`` values drawn with
    replacement from ``doc_ids``, joined by tabs and terminated by a newline.

    Args:
        doc_ids: Sequence of ids to sample from.
        times: Number of resamples (lines) to write.
        dump_to: Path of the output file; it is overwritten.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the generated file is reproducible and the global random
            state is left untouched. When ``None`` (the default), the
            module-level generator is used.
    """
    import random

    rng = random if seed is None else random.Random(seed)
    sample_size = len(doc_ids)
    with open(dump_to, "w") as f:
        for _ in range(times):
            test_docs = rng.choices(doc_ids, k=sample_size)
            f.write("\t".join(str(i) for i in test_docs) + "\n")

In [3]:
# --- Experiment configuration ---
# Directory holding one sub-directory per training dataset / method.
BERT_result_prefix = "../../exp/result_bert_base_uncased_5_15/15"
datasets = ["billsum", "scientific_papers", "cnn_dailymail", "big_patent"]
tac_json_file = "TAC2010_all.json"
# Summarizer filter; "machine" keeps summarizers whose id is numeric.
human_only = "machine"
level = "summary"
method = "pref_ordered"
tac_scores = load_tac_json(tac_json_file, human_only)
# Use context managers so the JSON file handles are closed right after
# parsing instead of being left for the garbage collector.
with open(tac_json_file) as tac_fh:
    tac_json = json.load(tac_fh)
baselines_score_path = "baselines_ref_free_old.json"
with open(baselines_score_path) as baseline_fh:
    baseline_json = json.load(baseline_fh)
# Index of the correlation picked from calc_cc's result; also used in the
# name of the report file written by sig_test.
aspect = 2

# Size of the index pool that is resampled for the bootstrap
# (presumably the number of document sets in the TAC data; confirm).
num_docs = 46

# Generate the resampled index lists once and reuse them on later runs.
if not os.path.exists("ids.txt"):
    gen_random_idxs(list(range(num_docs)))

tests = load_idxs()

In [4]:
def load_baseline_results(tac_json, baseline_json, baseline, summarizer_type):
    """Collect one baseline metric's scores for the selected summarizers.

    Walks ``tac_json`` docset by docset and summarizer by summarizer (in
    dict order) and looks up ``baseline_json["<docset>.<summarizer>"][baseline]``
    for every summarizer that matches ``summarizer_type``.

    Args:
        tac_json: Mapping of docset id to a dict with a ``"summaries"`` mapping
            keyed by summarizer id.
        baseline_json: Mapping of ``"<docset>.<summarizer>"`` to a dict of
            metric name to score.
        baseline: Name of the metric to extract.
        summarizer_type: ``"machine"`` (numeric summarizer ids), ``"human"``
            (alphabetic summarizer ids) or ``"both"``. Any other value
            selects nothing.

    Returns:
        A flat list of scores in iteration order; a stored ``None`` score is
        replaced by ``0.0``.
    """
    def _selected(name):
        # Decide whether this summarizer id belongs to the requested group.
        if summarizer_type == "machine":
            return name.isnumeric()
        if summarizer_type == "human":
            return name.isalpha()
        return summarizer_type == "both"

    raw_scores = (
        baseline_json[".".join([docset, name])][baseline]
        for docset in tac_json.keys()
        for name in tac_json[docset]["summaries"].keys()
        if _selected(name)
    )
    return [0.0 if value is None else value for value in raw_scores]

In [5]:
# Systems under test: one (training dataset, method) pair per dataset.
targetA = list(zip(datasets, [method] * len(datasets)))
# Baseline metric names, used as keys when reading scores from baseline_json.
targetB = [
    'blanc',
    'supert',
    'summaqa_avg_fscore',
    'summaqa_avg_prob',
]

In [6]:
def sig_test():
    """Compare every system in ``targetA`` with every baseline in ``targetB``.

    For each (system, baseline) pair the correlation at index ``aspect`` of
    ``calc_cc``'s result is computed for both score lists. If the system's
    correlation is lower than the baseline's, the pair is skipped and "-" is
    written. Otherwise the same comparison is repeated for every index list
    in ``tests``, and the reported p-value is the fraction of those repeats
    in which the system's correlation is lower than the baseline's.

    Results are written to ``sig_<aspect>.tsv``: one row per system (dataset
    name followed by one cell per baseline), and a final line listing the
    baseline names after an empty first cell.

    Reads the module-level settings (``aspect``, ``targetA``, ``targetB``,
    ``tests``, ``tac_scores``, ...) rather than taking parameters.
    """
    # ``with`` guarantees the report is flushed and closed even when one of
    # the correlation computations raises part-way through the run.
    with open("sig_{}.tsv".format(aspect), "w", encoding="utf-8") as f:
        for sysA in targetA:
            result_path = os.path.join(BERT_result_prefix, sysA[0], sysA[1], "test_results_tac.tsv")
            scoreA = read_tac_test_result(result_path, tac_json_file, human_only)
            # The full-sample correlation of system A does not depend on the
            # baseline, so compute it once per system instead of once per pair.
            # NOTE(review): assumes calc_cc is deterministic for fixed inputs.
            full_corrA = calc_cc(scoreA, tac_scores, spearmanr, level, False)[aspect]
            f.write(sysA[0] + "\t")
            for sysB in targetB:
                print(sysA, sysB)
                scoreB = load_baseline_results(tac_json, baseline_json, sysB, human_only)
                full_corrB = calc_cc(scoreB, tac_scores, spearmanr, level, False)[aspect]
                if full_corrA < full_corrB:
                    # The system does not beat the baseline on the full data,
                    # so there is nothing to test for this pair.
                    print("Skip")
                    f.write("-\t")
                    continue

                count = 0
                for test in tqdm(tests):
                    # ``test`` is one resampled index list passed to calc_cc
                    # (presumably restricting it to those documents; confirm).
                    corrA = calc_cc(scoreA, tac_scores, spearmanr, level, False, test)[aspect]
                    corrB = calc_cc(scoreB, tac_scores, spearmanr, level, False, test)[aspect]
                    if corrA < corrB:
                        count += 1

                pvalue = count / len(tests)
                print("P-value:", pvalue)
                # NOTE(review): two decimals hide differences close to 0.05
                # (e.g. 0.054 is written as 0.05); consider more precision.
                f.write("%.2f\t" % pvalue)
            f.write("\n")

        f.write("\t".join([""] + targetB))

In [7]:
# Run all (system, baseline) comparisons. This is slow: in the recorded output
# each comparison's resampling loop took roughly three and a half minutes.
sig_test()

('billsum', 'pref_ordered') blanc


100%|██████████| 1000/1000 [03:35<00:00,  4.63it/s]


P-value: 0.054
('billsum', 'pref_ordered') supert


100%|██████████| 1000/1000 [03:35<00:00,  4.64it/s]


P-value: 0.202
('billsum', 'pref_ordered') summaqa_avg_fscore


100%|██████████| 1000/1000 [03:35<00:00,  4.63it/s]


P-value: 0.0
('billsum', 'pref_ordered') summaqa_avg_prob


100%|██████████| 1000/1000 [03:35<00:00,  4.63it/s]


P-value: 0.0
('scientific_papers', 'pref_ordered') blanc


100%|██████████| 1000/1000 [03:35<00:00,  4.64it/s]


P-value: 0.087
('scientific_papers', 'pref_ordered') supert


100%|██████████| 1000/1000 [03:35<00:00,  4.64it/s]


P-value: 0.304
('scientific_papers', 'pref_ordered') summaqa_avg_fscore


100%|██████████| 1000/1000 [03:35<00:00,  4.65it/s]


P-value: 0.0
('scientific_papers', 'pref_ordered') summaqa_avg_prob


100%|██████████| 1000/1000 [03:35<00:00,  4.65it/s]


P-value: 0.0
('cnn_dailymail', 'pref_ordered') blanc


100%|██████████| 1000/1000 [03:32<00:00,  4.71it/s]


P-value: 0.0
('cnn_dailymail', 'pref_ordered') supert


100%|██████████| 1000/1000 [03:31<00:00,  4.73it/s]


P-value: 0.0
('cnn_dailymail', 'pref_ordered') summaqa_avg_fscore


100%|██████████| 1000/1000 [03:31<00:00,  4.73it/s]


P-value: 0.0
('cnn_dailymail', 'pref_ordered') summaqa_avg_prob


100%|██████████| 1000/1000 [03:31<00:00,  4.73it/s]


P-value: 0.0
('big_patent', 'pref_ordered') blanc


100%|██████████| 1000/1000 [03:31<00:00,  4.72it/s]


P-value: 0.436
('big_patent', 'pref_ordered') supert
Skip
('big_patent', 'pref_ordered') summaqa_avg_fscore


100%|██████████| 1000/1000 [03:31<00:00,  4.73it/s]


P-value: 0.0
('big_patent', 'pref_ordered') summaqa_avg_prob


100%|██████████| 1000/1000 [03:31<00:00,  4.73it/s]

P-value: 0.0



