In [1]:
import warnings

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics  # `import sklearn` alone does not guarantee the `metrics` submodule is loaded

from utils import choose_best_threshold
from utils import resample_balanced_acc
from utils import SOTA, XFORMER, OLD, MAPPING

warnings.filterwarnings('ignore')

In [2]:
# Load the aggregated annotation file. In this file:
# 1. duplicated examples have been removed;
# 2. examples that were annotated in different datasets with different factual
#    consistency labels have been manually corrected based on our judgment.
df = pd.read_csv("data/aggre_fact_final.csv")

# Validation / test partitions, each with a view restricted to the models in SOTA.
df_val = df.loc[df['cut'] == 'val']
df_val_sota = df_val.loc[df_val['model_name'].isin(SOTA)]
df_test = df.loc[df['cut'] == 'test']
df_test_sota = df_test.loc[df_test['model_name'].isin(SOTA)]

# Source datasets, evaluated factuality systems, and document origins.
dataset_list = [
    'XSumFaith', 'Polytope', 'FactCC', 'SummEval', 'FRANK',
    'Wang20', 'CLIFF', 'Goyal21', 'Cao22',
]
systems = [
    'DAE', 'QuestEval', 'SummaC-ZS', 'SummaC-Conv', 'QAFactEval',
]
origins = ['cnndm', 'xsum']

In [3]:
# Evaluate every system on each (origin, dataset, model category) slice.
#
# For a slice that has both validation and test examples, a decision threshold
# is picked on the validation examples with `choose_best_threshold` and applied
# to the test examples (score strictly above the threshold -> label 1).
# Outputs of this cell:
#   * main_df  - one row per evaluated slice with its balanced accuracy
#   * results  - per-slice labels / predictions / raw scores
#   * df       - gains a `<system>_label` column holding the test predictions
main_df_columns = ['system', 'origin', 'count', 'dataset', 'category', 'bl_acc']
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame one `.loc` row at a time re-allocates on every insert.
main_df_rows = []

results = []

# Prediction columns start out empty; only rows belonging to an evaluated test
# slice are filled in below.
for system in systems:
    df[f'{system}_label'] = None

for system in systems:
    score_col = f'{system}_score'
    label_col = f'{system}_label'

    for origin in origins:
        for dataset in dataset_list:
            # `i` indexes MAPPING to obtain the category name of each model group.
            for i, model_novelty in enumerate([SOTA, XFORMER, OLD]):
                df_val_temp = df_val[(df_val.dataset == dataset) & (df_val.origin == origin) & (df_val.model_name.isin(model_novelty))]
                df_test_temp = df_test[(df_test.dataset == dataset) & (df_test.origin == origin) & (df_test.model_name.isin(model_novelty))]

                # A slice needs validation data (to tune the threshold) and
                # test data (to score); otherwise it is skipped.
                if len(df_val_temp) == 0 or len(df_test_temp) == 0:
                    continue

                # The second returned value is not used in this cell.
                best_thresh, best_f1 = choose_best_threshold(df_val_temp.label.values, df_val_temp[score_col].values)

                scores_test = df_test_temp[score_col].values
                preds_test = [1 if score > best_thresh else 0 for score in scores_test]
                df.loc[df_test_temp.index, label_col] = preds_test

                balanced_acc = sklearn.metrics.balanced_accuracy_score(df_test_temp.label.values, preds_test)

                main_df_rows.append(
                    [system, origin, len(preds_test), dataset, MAPPING[i], balanced_acc]
                )

                results.append({"system": system, "dataset_name": dataset, 'origin': origin,
                                'count': len(scores_test), 'cat': MAPPING[i], "labels": df_test_temp.label.values,
                                "preds": preds_test, "scores": scores_test})

main_df = pd.DataFrame(main_df_rows, columns=main_df_columns)

# Keep a fixed column layout with each system's label right after its score.
df = df.reindex(
    columns=['dataset', 'origin', 'id', 'doc', 'summary', 'model_name', 'label',
       'cut', 'DAE_score', 'DAE_label', 'QuestEval_score', 'QuestEval_label',
       'SummaC-ZS_score', 'SummaC-ZS_label', 'SummaC-Conv_score', 'SummaC-Conv_label', 
       'QAFactEval_score' , 'QAFactEval_label'],
)

# Dataset-wise comparison between factuality systems

In [4]:
# Table 8
# One row per (origin, dataset, category, count) slice and one column per
# system, with columns ordered as in `systems`.
main_df_pivot_bacc = (
    main_df
    .pivot(
        index=['origin', 'dataset', 'category', 'count'],
        columns='system',
        values='bl_acc',
    )
    .reindex(columns=systems)
)
main_df_pivot_bacc.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,system,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval
origin,dataset,category,count,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cnndm,CLIFF,SOTA,150,0.73,0.74,0.646,0.649,0.716
cnndm,FRANK,OLD,523,0.704,0.67,0.692,0.727,0.773
cnndm,FRANK,SOTA,175,0.699,0.626,0.57,0.601,0.547
cnndm,FRANK,XFORMER,175,0.574,0.556,0.631,0.634,0.646
cnndm,FactCC,OLD,503,0.704,0.655,0.835,0.891,0.843
cnndm,Goyal21,OLD,25,0.188,0.146,0.375,0.354,0.271
cnndm,Polytope,OLD,450,0.779,0.687,0.802,0.791,0.824
cnndm,Polytope,SOTA,34,0.294,0.176,0.971,0.735,0.324
cnndm,Polytope,XFORMER,150,0.774,0.733,0.97,0.811,0.726
cnndm,SummEval,OLD,548,0.661,0.649,0.773,0.801,0.814


# AggreFact-CNN

In [5]:
# Table 4
# Balanced accuracy on the 'cnndm' slices of `main_df`, averaged per category
# and system with each slice weighted by its `count` column.
categories = list(MAPPING.values())

scores = []
for cat in categories:
    score = []
    for system in systems:
        system_df = main_df[(main_df.system == system) & (main_df.category == cat) & (main_df.origin == 'cnndm')]
        total_count = sum(system_df['count'])
        if total_count:
            value = sum(system_df['count'] * system_df['bl_acc']) / total_count
        else:
            # No slice matched this (category, system): report NaN rather
            # than dividing by zero.
            value = float('nan')
        score.append(round(value, 3))
    scores.append(score)

# Row labels come from the same sequence that drove the loop above, so they
# cannot fall out of step with the computed rows.
weighted_df = pd.DataFrame(
    scores,
    columns=systems,
    index=categories
)
weighted_df

Unnamed: 0,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval
SOTA,0.594,0.637,0.633,0.703,0.616
XFORMER,0.679,0.643,0.765,0.698,0.691
OLD,0.697,0.652,0.763,0.79,0.803


# AggreFact-XSum

In [6]:
# Table 4
# Balanced accuracy on the 'xsum' slices of `main_df`, averaged per category
# and system with each slice weighted by its `count` column.
categories = list(MAPPING.values())

scores = []
for cat in categories:
    score = []
    for system in systems:
        system_df = main_df[(main_df.system == system) & (main_df.category == cat) & (main_df.origin == 'xsum')]
        total_count = sum(system_df['count'])
        if total_count:
            value = sum(system_df['count'] * system_df['bl_acc']) / total_count
        else:
            # No slice matched this (category, system): report NaN rather
            # than dividing by zero.
            value = float('nan')
        score.append(round(value, 3))
    scores.append(score)

# Row labels come from the same sequence that drove the loop above, so they
# cannot fall out of step with the computed rows.
weighted_df = pd.DataFrame(
    scores,
    columns=systems,
    index=categories
)
weighted_df

Unnamed: 0,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval
SOTA,0.731,0.616,0.561,0.668,0.66
XFORMER,0.855,0.601,0.514,0.646,0.596
OLD,0.834,0.597,0.533,0.675,0.605


# AggreFact-CNN/XSum-SOTA

In [7]:
# Evaluate every system on the SOTA-only subsets.
#
# For each origin a single threshold is picked on the SOTA validation examples
# with `choose_best_threshold` and applied to the SOTA test examples (score
# strictly above the threshold -> label 1).
# Outputs of this cell:
#   * main_sota_df - one row per (system, origin) with its balanced accuracy
#   * results      - per-(system, origin) labels / predictions / raw scores;
#                    this rebinds the `results` name
main_sota_rows = []

results = []

for system in systems:
    score_col = f'{system}_score'

    for origin in origins:
        df_val_temp = df_val_sota[(df_val_sota.origin == origin)]
        df_test_temp = df_test_sota[(df_test_sota.origin == origin)]

        # The second returned value is not used in this cell.
        best_thresh, best_f1 = choose_best_threshold(df_val_temp.label.values, df_val_temp[score_col].values)

        scores_test = df_test_temp[score_col].values
        preds_test = [1 if score > best_thresh else 0 for score in scores_test]

        # This is a balanced accuracy, so it is named accordingly.
        balanced_acc = sklearn.metrics.balanced_accuracy_score(df_test_temp.label.values, preds_test)

        main_sota_rows.append([system, origin, balanced_acc])

        results.append({"system": system, 'origin': origin,  "labels": df_test_temp.label.values,
                        "preds": preds_test, "scores": scores_test})

# Build the frame once instead of appending one `.loc` row per iteration.
main_sota_df = pd.DataFrame(main_sota_rows, columns=['system', 'origin', 'bl_acc'])

In [8]:
# Table 5
# The second printed number may differ between runs due to randomness.

# Percentile levels taken from https://github.com/tingofurro/summac/
P5 = 5 / 2 # Correction due to the fact that we are running 2 tests with the same data
P1 = 1 / 2 # Correction due to the fact that we are running 2 tests with the same data

for origin in origins:
    # One (initially empty) list of resampled values per system.
    sampled_batch_preds = {entry["system"]: [] for entry in results}

    for res in results:
        # Only report entries belonging to the origin currently being printed.
        if res['origin'] != origin:
            continue

        samples = resample_balanced_acc(res["preds"], res["labels"])
        sampled_batch_preds[res["system"]].append(samples)

        # Lower / upper percentiles of the resampled values at both levels.
        low5, high5 = np.percentile(samples, [P5, 100 - P5])
        low1, high1 = np.percentile(samples, [P1, 100 - P1])
        bacc = sklearn.metrics.balanced_accuracy_score(res["labels"], res["preds"])

        # Printed: point estimate, then its distance to the lower P5 percentile.
        origin_cell = res['origin'].center(6)
        system_cell = res["system"].center(20)
        print(origin_cell, system_cell, " - %.3f, %.3f" % (bacc, bacc - low5))
    print()

cnndm          DAE           - 0.654, 0.036
cnndm       QuestEval        - 0.702, 0.031
cnndm       SummaC-ZS        - 0.640, 0.032
cnndm      SummaC-Conv       - 0.610, 0.032
cnndm       QAFactEval       - 0.678, 0.034

 xsum          DAE           - 0.702, 0.018
 xsum       QuestEval        - 0.595, 0.021
 xsum       SummaC-ZS        - 0.564, 0.014
 xsum      SummaC-Conv       - 0.650, 0.020
 xsum       QAFactEval       - 0.639, 0.020

