In [12]:
import numpy as np
from pprint import pprint
from tqdm import tqdm
from collections import defaultdict
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU, CHRF, TER

In [10]:
# Shared metric objects, created once and reused by compute_scores below.
# chrF and BLEU are built with their default settings.
chrf = CHRF()
bleu = BLEU()
# ROUGE-1 / ROUGE-2 / ROUGE-L / ROUGE-Lsum, with stemming enabled.
rscorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    use_stemmer=True,
)

def compute_scores(refs, sample):
    """Score one candidate caption against each reference caption separately.

    Args:
        refs: iterable of reference caption strings.
        sample: the candidate caption string to evaluate.

    Returns:
        A list with one dict per reference, in the same order as ``refs``.
        Each dict holds ``<rouge_type>_p`` / ``_r`` / ``_f1`` for every ROUGE
        type configured on the module-level ``rscorer``, plus ``"bleu"`` and
        ``"chrf"`` scores computed by the module-level ``bleu`` / ``chrf``
        objects on the single (sample, reference) pair.
    """
    # The hypothesis side is the same for every reference.
    hypotheses = [sample]
    per_reference = []
    for reference in refs:
        entry = {}
        # RougeScorer.score takes (target, prediction).
        for rouge_metric, score_result in rscorer.score(reference, sample).items():
            entry.update({
                f"{rouge_metric}_p": score_result.precision,
                f"{rouge_metric}_r": score_result.recall,
                f"{rouge_metric}_f1": score_result.fmeasure,
            })
        # One reference stream containing one sentence.
        single_reference = [[reference]]
        entry["bleu"] = bleu.corpus_score(hypotheses, single_reference).score
        entry["chrf"] = chrf.corpus_score(hypotheses, single_reference).score
        per_reference.append(entry)
    return per_reference

In [13]:
# Load the three caption stores. Each .npy file holds a single pickled object
# (unwrapped with .item()); below they are indexed by chart id.
# NOTE: allow_pickle=True can execute arbitrary code on load -- only use with
# trusted files.
vistext_captions = np.load(
    "./vistext_to_mpl/vistext_id_to_captions.npy", allow_pickle=True
).item()
matplotalt_captions = np.load(
    "./vistext_to_mpl/vistext_id_to_matplotalt_captions.npy", allow_pickle=True
).item()
gpt4v_captions = np.load(
    "./vistext_to_mpl/vistext_id_to_gpt4v_captions.npy", allow_pickle=True
).item()

# Per-chart lists of per-reference score dicts, and running per-metric averages.
heuristic_scores, gpt4v_scores = {}, {}
heuristic_avg_scores, gpt4v_avg_scores = defaultdict(float), defaultdict(float)
num_samples = len(gpt4v_captions)

for chart_id in tqdm(gpt4v_captions.keys()):
    # Combine L1 and L2+ captions.
    # NOTE(review): the two parts are concatenated without a separator --
    # confirm the L1 text ends in whitespace/punctuation.
    ref_captions = [cap["L1"] + cap["L2L3"] for cap in vistext_captions[chart_id]]
    num_refs = len(ref_captions)
    heuristic_scores[chart_id] = compute_scores(ref_captions, matplotalt_captions[chart_id])
    gpt4v_scores[chart_id] = compute_scores(ref_captions, gpt4v_captions[chart_id])

    # Add this chart's mean-over-references contribution to each system's totals.
    for per_ref_scores, running_totals in (
        (heuristic_scores[chart_id], heuristic_avg_scores),
        (gpt4v_scores[chart_id], gpt4v_avg_scores),
    ):
        for caption_score in per_ref_scores:
            for metric, metric_score in caption_score.items():
                running_totals[metric] += (metric_score / num_refs)

# Turn the per-chart sums into averages over all charts.
for running_totals in (heuristic_avg_scores, gpt4v_avg_scores):
    for metric in running_totals.keys():
        running_totals[metric] /= num_samples

pprint(heuristic_avg_scores)
pprint(gpt4v_avg_scores)

100%|██████████| 100/100 [00:05<00:00, 19.74it/s]

defaultdict(<class 'float'>,
            {'bleu': 16.540560622991155,
             'chrf': 44.772266293347364,
             'rouge1_f1': 0.5725829051880719,
             'rouge1_p': 0.601090520198768,
             'rouge1_r': 0.5616816763796633,
             'rouge2_f1': 0.35125909567922414,
             'rouge2_p': 0.36745948390107247,
             'rouge2_r': 0.3457935983653739,
             'rougeL_f1': 0.41039645715226974,
             'rougeL_p': 0.4292843430994817,
             'rougeL_r': 0.40405176601888415,
             'rougeLsum_f1': 0.4176004554190515,
             'rougeLsum_p': 0.43819347649255214,
             'rougeLsum_r': 0.409780888459035})
defaultdict(<class 'float'>,
            {'bleu': 10.350772219532587,
             'chrf': 43.45331353767975,
             'rouge1_f1': 0.37183044947375515,
             'rouge1_p': 0.25689516175819455,
             'rouge1_r': 0.7025395276006272,
             'rouge2_f1': 0.19992057400852206,
             'rouge2_p': 0.1372735564




In [14]:
# Best-of-references evaluation: for every chart, take the highest score each
# metric reaches over that chart's reference captions, then average those
# per-chart maxima over all charts.
heuristic_scores = {}
gpt4v_scores = {}
heuristic_avg_max_scores = defaultdict(float)
gpt4v_avg_max_scores = defaultdict(float)
num_samples = len(gpt4v_captions)
for chart_id in tqdm(gpt4v_captions.keys()):
    # Combine L1 and L2+ captions:
    # NOTE(review): the two parts are concatenated without a separator --
    # confirm the L1 text ends in whitespace/punctuation.
    ref_captions = [cap["L1"] + cap["L2L3"] for cap in vistext_captions[chart_id]]
    heuristic_scores[chart_id] = compute_scores(ref_captions, matplotalt_captions[chart_id])
    gpt4v_scores[chart_id] = compute_scores(ref_captions, gpt4v_captions[chart_id])

    for per_ref_scores, avg_max_scores in (
        (heuristic_scores[chart_id], heuristic_avg_max_scores),
        (gpt4v_scores[chart_id], gpt4v_avg_max_scores),
    ):
        # The per-metric maximum is computed without writing into any of the
        # stored per-reference dicts, so heuristic_scores / gpt4v_scores keep
        # the true score of every reference (the previous version aliased the
        # first reference's dict and overwrote it with the maxima).
        # Every per-reference dict has the same keys, so the first one supplies
        # the metric names. A chart with no references raises IndexError here.
        for metric in per_ref_scores[0]:
            avg_max_scores[metric] += max(
                caption_score[metric] for caption_score in per_ref_scores
            )

# Convert the sums of per-chart maxima into averages over all charts.
for metric in heuristic_avg_max_scores.keys():
    heuristic_avg_max_scores[metric] /= num_samples
for metric in gpt4v_avg_max_scores.keys():
    gpt4v_avg_max_scores[metric] /= num_samples

pprint(heuristic_avg_max_scores)
pprint(gpt4v_avg_max_scores)

100%|██████████| 100/100 [00:05<00:00, 19.91it/s]

defaultdict(<class 'float'>,
            {'bleu': 17.0387738558721,
             'chrf': 45.875211312538575,
             'rouge1_f1': 0.5800621156121178,
             'rouge1_p': 0.6069744788336024,
             'rouge1_r': 0.5776650606422175,
             'rouge2_f1': 0.3582866181251763,
             'rouge2_p': 0.36913674290073617,
             'rouge2_r': 0.3585578079986235,
             'rougeL_f1': 0.41689796531692375,
             'rougeL_p': 0.4326594501660035,
             'rougeL_r': 0.41657283308095644,
             'rougeLsum_f1': 0.4244132600159041,
             'rougeLsum_p': 0.44340128479715285,
             'rougeLsum_r': 0.4219865011345621})
defaultdict(<class 'float'>,
            {'bleu': 10.574568916494481,
             'chrf': 43.96768531636082,
             'rouge1_f1': 0.3796436157827411,
             'rouge1_p': 0.26492455452770897,
             'rouge1_r': 0.7125593471154261,
             'rouge2_f1': 0.20359026310199727,
             'rouge2_p': 0.140289844773




In [9]:
# Same reference-averaged evaluation, but scored against the L2L3 part of each
# reference caption only.
heuristic_scores, gpt4v_scores = {}, {}
heuristic_avg_scores, gpt4v_avg_scores = defaultdict(float), defaultdict(float)
num_samples = len(gpt4v_captions)

for chart_id in tqdm(gpt4v_captions.keys()):
    # Just L2+ captions:
    ref_captions = [cap["L2L3"] for cap in vistext_captions[chart_id]]
    num_refs = len(ref_captions)
    heuristic_scores[chart_id] = compute_scores(ref_captions, matplotalt_captions[chart_id])
    gpt4v_scores[chart_id] = compute_scores(ref_captions, gpt4v_captions[chart_id])

    # Add this chart's mean-over-references contribution to each system's totals.
    for per_ref_scores, running_totals in (
        (heuristic_scores[chart_id], heuristic_avg_scores),
        (gpt4v_scores[chart_id], gpt4v_avg_scores),
    ):
        for caption_score in per_ref_scores:
            for metric, metric_score in caption_score.items():
                running_totals[metric] += (metric_score / num_refs)

# Turn the per-chart sums into averages over all charts.
for running_totals in (heuristic_avg_scores, gpt4v_avg_scores):
    for metric in running_totals.keys():
        running_totals[metric] /= num_samples

print(heuristic_avg_scores)
print(gpt4v_avg_scores)

100%|██████████| 100/100 [00:01<00:00, 56.65it/s]

defaultdict(<class 'float'>, {'rouge1_p': 0.19284805854368084, 'rouge1_r': 0.4738301760868292, 'rouge1_f1': 0.2596389568612051, 'rougeL_p': 0.12798409638061822, 'rougeL_r': 0.3230583730233787, 'rougeL_f1': 0.17353513826269723, 'bleu': 2.695336052982247, 'chrf': 31.3504902398399})
defaultdict(<class 'float'>, {'rouge1_p': 0.08714004571220785, 'rouge1_r': 0.6055190621662573, 'rouge1_f1': 0.1476649683244705, 'rougeL_p': 0.06330650358703468, 'rougeL_r': 0.46128790448466056, 'rougeL_f1': 0.10805419457886326, 'bleu': 1.7124975383414258, 'chrf': 23.32196328069717})



