In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm
from collections import defaultdict
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU, CHRF
from evaluate import load

In [2]:
rscorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
bleurt = load("bleurt", module_type="metric", checkpoint="bleurt-large-512")
bertscore = load("bertscore")
bleu = BLEU()
chrf = CHRF()

def compute_scores(refs, sample):
    ref_scores = defaultdict(list)
    for ref in refs:
        rouge_scores = rscorer.score(ref, sample)
        for rouge_metric, score_result in rouge_scores.items():
            ref_scores[f"{rouge_metric}_p"].append(score_result.precision)
            ref_scores[f"{rouge_metric}_r"].append(score_result.recall)
            ref_scores[f"{rouge_metric}_f1"].append(score_result.fmeasure)
        ref_scores["bleu"].append(bleu.corpus_score([sample], [[ref]]).score)
        ref_scores["chrf"].append(chrf.corpus_score([sample], [[ref]]).score)
        ref_scores["bleurt"].append(bleurt.compute(predictions=[sample], references=[ref])["scores"][0])
        bertscores = bertscore.compute(predictions=[sample], references=[ref], lang="en")
        ref_scores["bertscore_p"] = bertscores["precision"]
        ref_scores["bertscore_r"] = bertscores["recall"]
        ref_scores["bertscore_f1"] = bertscores["f1"]
    return ref_scores









INFO:tensorflow:Reading checkpoint C:\Users\Kai\.cache\huggingface\metrics\bleurt\default\downloads\extracted\66d40c89ded88d187db3310c752ad6bc55a18f1686c772fd971b1af93164b5f5\bleurt-base-128.


INFO:tensorflow:Reading checkpoint C:\Users\Kai\.cache\huggingface\metrics\bleurt\default\downloads\extracted\66d40c89ded88d187db3310c752ad6bc55a18f1686c772fd971b1af93164b5f5\bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.








INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [3]:
vistext_id_to_captions = pd.read_json("./vistext_eval/vistext_id_to_combined_captions.jsonl", orient="records", lines=True)
vistext_id_to_scores = np.load("./vistext_id_to_scores.npy", allow_pickle=True).item() #defaultdict(dict)
pbar = tqdm(total=len(vistext_id_to_captions))
pbar.clear()
def get_caption_scores(row):
    refs = row["human"]
    for caption_type in ["gpt-4-turbo", "gpt-4-turbo-alt", "gpt-4-turbo-table",
                         "gpt-4-turbo-L3", "gpt-4-turbo-alt-L3", "gpt-4-turbo-table-L3"]:
        if caption_type in row and row[caption_type]:
            processed_caption = row[caption_type][0]
            processed_caption = processed_caption.replace("This description was generated by a language model. ", "")
            vistext_id_to_scores[row["image_id"]][caption_type] = compute_scores(refs, processed_caption)
    pbar.update(1)

vistext_id_to_captions.apply(get_caption_scores, axis=1)
np.save("./vistext_id_to_scores", vistext_id_to_scores)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 882/882 [52:49<00:00,  5.44s/it]  

In [4]:
vistext_id_to_scores = np.load("./vistext_id_to_scores.npy", allow_pickle=True).item()
combined_method_scores = {}
for caption_type in ["heuristic", "gpt-4-turbo", "gpt-4-turbo-alt", "gpt-4-turbo-table", "gpt-4-turbo-L3", "gpt-4-turbo-alt-L3", "gpt-4-turbo-table-L3"]:
    combined_method_scores[caption_type] = defaultdict(list)
print(vistext_id_to_scores.keys())
for image_id, caption_scores in vistext_id_to_scores.items():
    for caption_type, scores_dict in caption_scores.items():
        for score_type, scores in scores_dict.items():
            combined_method_scores[caption_type][score_type].extend(scores)
caption_type_to_avg_scores = defaultdict(dict)
for caption_type, caption_scores in combined_method_scores.items():
    for score_type, scores in caption_scores.items():
        caption_type_to_avg_scores[caption_type][score_type] = np.mean(scores)

pprint(caption_type_to_avg_scores)

dict_keys([9, 22, 36, 37, 38, 68, 76, 99, 105, 130, 169, 179, 180, 184, 193, 195, 228, 241, 244, 260, 262, 268, 293, 309, 313, 315, 316, 344, 359, 373, 389, 418, 421, 425, 434, 442, 446, 450, 458, 459, 468, 512, 560, 561, 564, 572, 575, 584, 585, 595, 614, 616, 617, 634, 668, 681, 690, 706, 741, 753, 762, 765, 780, 781, 782, 783, 787, 797, 824, 825, 831, 845, 856, 867, 869, 871, 873, 878, 880, 923, 941, 957, 960, 964, 971, 980, 984, 995, 1018, 1046, 1051, 1073, 1081, 1086, 1088, 1093, 1107, 1111, 1121, 1150, 1152, 1159, 1163, 1168, 1170, 1186, 1199, 1202, 1223, 1224, 1226, 1227, 1231, 1244, 1248, 1249, 1283, 1293, 1316, 1347, 1351, 1358, 1374, 1376, 1385, 1388, 1389, 1404, 1407, 1420, 1439, 1444, 1453, 1469, 1481, 1483, 1497, 1499, 1501, 1511, 1512, 1513, 1515, 1516, 1542, 1559, 1566, 1567, 1568, 1597, 1601, 1659, 1682, 1701, 1704, 1706, 1726, 1733, 1740, 1752, 1754, 1779, 1784, 1793, 1795, 1802, 1808, 1833, 1842, 1843, 1852, 1863, 1869, 1870, 1871, 1894, 1895, 1901, 1907, 1911, 1947, 