In [1]:
import ast
import json
import random
import re
from collections import defaultdict

import tqdm
from sacrebleu.metrics import BLEU

## 1. Utility Functions

In [None]:
def read_llava_prediction_file(file_path):
    """
    Read LLaVA's inference results (e.g., merge.jsonl) and group them by benchmark.

    Every non-blank line of the file is parsed as one JSON object. The benchmark of an item is
    derived from its 'category' field, which has the form {dataset_name}_for_{task_name}
    (e.g., TabFact_for_TFV).

    Args:
        file_path: path of the JSON-lines prediction file (read as UTF-8).

    Returns:
        A defaultdict(list) mapping benchmark name --> list of predicted items (parsed JSON objects).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an item has no 'category' field.
        IndexError: if a 'category' value does not contain '_for_'.
    """
    # for table structure understanding tasks, benchmark name is the task name
    structure_task_names = {'TSD','TCL','RCE','MCD','TCE','TR','OOD_TSD','OOD_TCL','OOD_RCE','OOD_TCE'}

    with open(file_path, encoding='utf-8') as f:
        lines = f.readlines()

    predict_results = []
    for line in tqdm.tqdm(lines):
        line = line.strip()
        # tolerate blank lines (e.g., an empty line at the end of the file), which json.loads rejects
        if not line:
            continue
        predict_results.append(json.loads(line))
    print("Predicted Sample Number:",len(predict_results))

    benchmark_name_to_predicted_item_list = defaultdict(list)
    for item in predict_results:
        # category: {dataset_name}_for_{task_name}, e.g., TabFact_for_TFV
        category_parts = item['category'].split('_for_')
        dataset_name = category_parts[0] # e.g., TabFact
        task_name = category_parts[1] # e.g., TFV
        if task_name in structure_task_names:
            benchmark_name = task_name
        else:
            benchmark_name = dataset_name
        benchmark_name_to_predicted_item_list[benchmark_name].append(item)

    for benchmark_name, predicted_item_list in benchmark_name_to_predicted_item_list.items():
        item_num = len(predicted_item_list)
        print(f'benchmark name: {benchmark_name}, test data num: {item_num}')
    return benchmark_name_to_predicted_item_list

## 2. Read Predicted Data and Ground Truth Data

In [None]:
# read the ground truth data
# a context manager closes the file handle, and the file is read as UTF-8 like the prediction file
with open("eval_data.json", encoding='utf-8') as f:
    MMTab_eval_test_data = json.load(f)
# item_id --> test data (if an item_id occurs more than once, the last item wins)
item_id_to_test_item = {item['item_id']: item for item in MMTab_eval_test_data}
print("MMTab-eval data num: ",len(MMTab_eval_test_data))

## 3. Evaluation Functions

### 3.1 TQA, TFV and T2T Tasks

In [None]:
def extract_tqa_answer_list(model_output):
    """
    Extract the answer list from the model output to compute accuracy.

    The output is searched for a JSON-like object of the form {"answer": ...} (single or double
    quotes). Newlines are replaced by spaces first, and every run of quote characters inside the
    matched object is normalized to one double quote before parsing.

    Args:
        model_output: raw text generated by the model.

    Returns:
        - the parsed value as-is if it is a list,
        - [answer] if it is a str,
        - [str(answer)] if it is an int or float,
        - [] if no answer object is found or it can not be parsed.
        Any other parsed value (e.g., a bool or a dict) is returned unchanged.
    """
    model_output = model_output.replace('\n',' ')
    # raw string: the previous non-raw pattern contained the invalid escape sequence '\:'
    ret = re.match(r'.*(\{["\']answer["\']:.*\}).*', model_output)
    if ret is None:
        return []
    answer_str = ret.group(1)
    try:
        answer_str = re.sub(r'["\']+', '"', answer_str)
        # The model output is untrusted text, so it is parsed as a Python literal instead of being
        # executed with eval(); anything that is not a plain literal counts as unparsable.
        answer_item = ast.literal_eval(answer_str)
        predicted_answer = answer_item['answer']
        if type(predicted_answer) is str:
            predicted_answer = [predicted_answer]
        elif type(predicted_answer) in (float, int):
            predicted_answer = [str(predicted_answer)]
    # The answer is considered to be wrong if we can not extract answer list from the json str
    except Exception:
        predicted_answer = []
    return predicted_answer

def evaluate_tqa_questions(benchmark_name,pred_item_list):
    """
    Evaluation for table question answering (TQA) and table fact verification (TFV) benchmark.
    Metric: accuracy.
    Note that some baseline models can not strictly follow instructions to output the final answer in the required JSON format.
    For instance, Qwen-VL may only output a short answer due to the potential overfitting of training data.
    In such cases, the evaluation script needs to be changed according to the characteristic of certain model output.

    Args:
        benchmark_name: name printed in the report.
        pred_item_list: predicted items; each item provides 'question_id' and 'text' (the model output).

    Returns:
        None. The accuracy, the total sample number and the number of samples that failed to be
        evaluated are printed. Failed samples are not counted as correct.
    """
    correct_item_list = []
    failed_item_list = []
    for item in pred_item_list:
        try:
            item_id = item['question_id']
            ori_item = item_id_to_test_item[item_id]
            model_output = item['text']
            # parse the predicted answer list
            predicted_answer_list = extract_tqa_answer_list(model_output)
            gold_answer_list = ori_item['answer_list']
            # Sometimes the order of multiple answer text is not necessarily same as the gold answer,
            # so we convert the answer list to a set for comparison
            if set(gold_answer_list) == set(predicted_answer_list):
                correct_item_list.append(item)
        except Exception:
            failed_item_list.append(item)

    print("Benchmark: ",benchmark_name)
    correct_num = len(correct_item_list)
    total_sample_num = len(pred_item_list)
    # An empty prediction list (e.g., a benchmark missing from the prediction file) has no
    # accuracy; report that instead of raising ZeroDivisionError.
    if total_sample_num:
        print("Accuracy: ", correct_num/total_sample_num)
    else:
        print("Accuracy:  N/A (no predicted samples)")
    problem_sample_num = len(failed_item_list)
    print("Total sample number:",total_sample_num)
    print(f"There are {problem_sample_num} samples that failed to be evaluated.")
    print("-"*20)

def evaluate_tqa_questions_hitab(benchmark_name, pred_item_list):
    """
    Evaluation for the HiTab table question answering benchmark.
    Metric: accuracy.
    Same procedure as evaluate_tqa_questions, except that every gold and predicted answer that
    evaluates to a number is converted to float before the comparison, so that e.g. "1" and "1.0"
    are treated as the same answer.
    Note that some baseline models can not strictly follow instructions to output the final answer in the required JSON format.
    For instance, Qwen-VL may only output a short answer due to the potential overfitting of training data.
    In such cases, the evaluation script needs to be changed according to the characteristic of certain model output.

    Args:
        benchmark_name: name printed in the report.
        pred_item_list: predicted items; each item provides 'question_id' and 'text' (the model output).

    Returns:
        None. The accuracy, the total sample number and the number of samples that failed to be
        evaluated are printed. Failed samples are not counted as correct.
    """
    correct_item_list = []
    failed_item_list = []

    def parse_answer(answer):
        """
        Return float(value) if the answer string evaluates to an int or float,
        otherwise return the answer unchanged.
        """
        # only strings can hold an expression; other values are compared as they are
        if not isinstance(answer, str):
            return answer
        try:
            # NOTE(review): eval() on model output is risky if the predictions are untrusted.
            # The empty namespaces keep an answer from resolving builtins or notebook globals
            # (e.g., an answer that happens to be a variable name), but this is NOT a sandbox.
            evaluated = eval(answer, {"__builtins__": {}}, {})
            # bool is a subclass of int; "True"/"False" must stay text instead of becoming 1.0/0.0
            if isinstance(evaluated, (int, float)) and not isinstance(evaluated, bool):
                return float(evaluated)
        except Exception:
            # not an expression (or one that fails, e.g., "1/0"): compare the answer as plain text
            pass
        return answer

    for item in pred_item_list:
        try:
            item_id = item['question_id']
            ori_item = item_id_to_test_item[item_id]
            model_output = item['text']
            # Parse the predicted answer list
            predicted_answer_list = extract_tqa_answer_list(model_output)
            gold_answer_list = ori_item['answer_list']

            # Normalize numeric answers on both sides
            predicted_answer_list = [parse_answer(ans) for ans in predicted_answer_list]
            gold_answer_list = [parse_answer(ans) for ans in gold_answer_list]

            # Compare the sets of answers (the order of multiple answers does not matter)
            if set(gold_answer_list) == set(predicted_answer_list):
                correct_item_list.append(item)
        except Exception as e:
            print(f"Error evaluating item {item.get('question_id', 'unknown')}: {e}")
            failed_item_list.append(item)

    print("Benchmark: ", benchmark_name)
    correct_num = len(correct_item_list)
    total_sample_num = len(pred_item_list)
    # An empty prediction list (e.g., a benchmark missing from the prediction file) has no
    # accuracy; report that instead of raising ZeroDivisionError.
    if total_sample_num:
        print("Accuracy: ", correct_num / total_sample_num)
    else:
        print("Accuracy:  N/A (no predicted samples)")
    problem_sample_num = len(failed_item_list)
    print("Total sample number:", total_sample_num)
    print(f"There are {problem_sample_num} samples that failed to be evaluated.")
    print("-" * 20)

def evaluate_text_generation_questions(benchmark_name,pred_item_list):
    """
    Score a table-to-text benchmark with corpus-level BLEU and print the result.

    For every predicted item, the generated text ('text') is paired with the reference text
    ('output') of the ground-truth item that has the same id ('question_id').
    BLEU alone is a limited signal; metrics such as ROUGE or LLM-as-a-judge ratings are needed
    for a more robust evaluation.
    """
    scorer = BLEU()
    hypotheses = []  # generated texts, one per item
    references = []  # gold texts, aligned with `hypotheses`
    for prediction in pred_item_list:
        hypothesis = prediction['text']
        question_id = prediction['question_id']
        reference = item_id_to_test_item[question_id]['output']
        # every sample must have a real reference text
        assert reference not in ['','None']
        hypotheses.append(hypothesis)
        references.append(reference)
    assert len(hypotheses) == len(references)
    # a single reference stream: one gold text per hypothesis
    score = scorer.corpus_score(hypotheses, [references])
    print("Benchmark: ",benchmark_name)
    print("BLEU score:",score)
    print("-"*20)

In [None]:
# load the model predictions and group them by benchmark name
# NOTE: the path is an empty placeholder; set it to the prediction file (e.g., merge.jsonl) before running this cell
benchmark_name_to_predicted_item_list = read_llava_prediction_file(file_path="")

In [None]:
benchmark_name_list = ['TABMWP','WTQ','HiTab','TAT-QA','TabFact','InfoTabs','FeTaQA']

# run the evaluation function that matches each benchmark
for benchmark_name in benchmark_name_list:
    # .get() does not insert an empty entry into the defaultdict when a benchmark has no predictions
    predicted_item_list = benchmark_name_to_predicted_item_list.get(benchmark_name, [])
    if not predicted_item_list:
        # nothing to score; skipping avoids evaluating an empty sample list
        print(f"No predictions found for benchmark: {benchmark_name}, skipped.")
        print("-"*20)
        continue
    if benchmark_name in ["TABMWP", "WTQ", "TAT-QA", "TabFact", "InfoTabs"]:
        evaluate_tqa_questions(benchmark_name,predicted_item_list)
    elif benchmark_name in ["HiTab"]:
        evaluate_tqa_questions_hitab(benchmark_name,predicted_item_list)
    elif benchmark_name in ["FeTaQA"]:
        evaluate_text_generation_questions(benchmark_name,predicted_item_list)