# Directly calculate the scores of LLMs on MiniLongBench

In this notebook, we show how to obtain MiniLongBench scores directly.

## Prepare data

In [1]:
import numpy as np
import pickle
from irt import *
from utils import *
import os
import time



In [2]:
# Scenario key used to index the position tables in the cells below.
to_handle_scenario = 'longbench'

# Read the pickled LongBench data.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('data/longbench.pickle', 'rb') as handle:
    data = pickle.load(handle)

# `scenarios`, `prepare_data` and `create_responses` are not defined in this
# notebook; presumably they come from the star-imports above (verify).
scenarios_position, subscenarios_position = prepare_data(scenarios, data)
Y = create_responses(scenarios, data)

In [3]:
# One weight per column of Y, initialised to 1.
balance_weights = np.ones(Y.shape[1])
# per_scen indicates which scenario this document belongs to
per_scen = [1, 1, 2, 1, 5, 3, 0, 0, 2, 1, 0, 4, 4, 4, 0, 2, 5, 3, 3, 3, 2]
N = len(scenarios_position[to_handle_scenario])
n_sub = len(scenarios[to_handle_scenario])

# Number of sub-scenarios per scenario id: scenario 4 has 3, scenario 5 has 2,
# and every other scenario has 4 (matches the counts in per_scen).
sub_count_overrides = {4: 3, 5: 2}
default_sub_count = 4
sub_positions = subscenarios_position[to_handle_scenario]

for i, sub in enumerate(scenarios[to_handle_scenario]):
    num = sub_count_overrides.get(per_scen[i], default_sub_count)
    n_i = len(sub_positions[sub])
    # Each of the 6 scenarios receives a total weight of N/6, shared equally
    # by its `num` sub-scenarios and then by the n_i items of the sub-scenario.
    balance_weights[sub_positions[sub]] = N / (num * 6 * n_i)

## Calculate the scores of LLMs on the MiniLongBench benchmark


In [4]:
def fun(arr, idx):
    """Scatter the elements of ``arr`` to the positions listed in ``idx``.

    Element ``arr[i]`` is written to position ``idx[i]`` of the result, so
    ``idx`` gives the destination of every source element.

    Args:
        arr: Sequence of values; converted to a NumPy array.
        idx: Sequence of destination indices, at least as long as ``arr``.

    Returns:
        A new NumPy array with the same shape and dtype as ``np.array(arr)``.
        Positions that never appear in ``idx`` are left uninitialised.
    """
    source = np.array(arr)
    scattered = np.empty_like(source)
    for position, value in enumerate(source):
        scattered[idx[position]] = value
    return scattered

to_handle_scenario = 'longbench'

# Load the test samples of MiniLongBench
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("data/anchor.pkl", "rb") as f:
    anchor_points = pickle.load(f)

# minilongbench test cases
seen_items = np.hstack([np.array(scenarios_position[scenario])[anchor_points[scenario]] for scenario in scenarios.keys()]).tolist()
# Membership is checked against a set so building the complement stays linear.
seen_lookup = set(seen_items)
unseen_items = [i for i in range(4750) if i not in seen_lookup]

# start indicates where each sub-scenario begins in the score vector:
# sub-scenario i owns the slice start[i]:start[i+1]
start = [0, 13, 19, 31, 44, 70, 78, 85, 100, 111, 118, 124, 128, 143, 158, 167, 173, 196, 211, 219, 231, 237]
# per_scen indicates which scenario this document belongs to
per_scen = [1, 1, 2, 1, 5, 3, 0, 0, 2, 1, 0, 4, 4, 4, 0, 2, 5, 3, 3, 3, 2]
# sort the sub-scenarios for easier result presentation
idx = [5, 7, 8, 4, 19, 15, 2, 3, 10, 6, 0, 16, 17, 18, 1, 9, 20, 14, 12, 13, 11]

# Sub-scenario names in presentation order. This is computed exactly once,
# from the unpermuted list: permuting inside the per-file loop would re-shuffle
# the labels on every iteration and mislabel every file after the first.
name = fun(scenarios[to_handle_scenario], idx)

# Display labels of the scenarios, indexed by the scenario ids used in per_scen.
scenario_titles = [
    "Single-Document QA",
    "Multi-Document QA",
    "Summarization",
    "Few-shot Learning",
    "Synthetic Task",
    "Code Completion",
]
num_subscenarios = len(per_scen)
# Number of sub-scenarios in each scenario, derived from per_scen
# (4, 4, 4, 4, 3, 2) instead of being hard-coded.
scenario_sizes = np.bincount(per_scen, minlength=len(scenario_titles))

# Positions (in presentation order) of the English and Chinese sub-scenarios.
en_idx = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 19, 20]
zh_idx = [3, 7, 11, 15, 18]

# load the representations of the test samples
# (not used below in this cell; kept in case later cells rely on them)
all_pred_scores = []
A, B, _ = load_irt_parameters('data/irt_model/')

# evaluate all models in the evaluation folder
# os.listdir returns entries in arbitrary order; sort for reproducible output.
for filename in sorted(os.listdir('eval_data')):
    file_path = os.path.join('eval_data', filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints) that cannot be unpickled.
    if not os.path.isfile(file_path):
        continue
    print(filename)

    with open(file_path, "rb") as f:
        minilongbench_scores = pickle.load(f)

    # scores for the 21 sub-scenarios
    score_per_file = []
    # scores for the 6 scenarios
    score_per_scen = [0 for _ in scenario_titles]

    # The direct test scores of MiniLongBench
    minilongbench_scores = np.array(minilongbench_scores)

    # Average each sub-scenario, then accumulate into its parent scenario
    for i in range(num_subscenarios):
        tmp_score = minilongbench_scores[start[i]:start[i+1]].mean()
        score_per_file.append(tmp_score)
        score_per_scen[per_scen[i]] += tmp_score
    # Turn the per-scenario sums into means over their sub-scenarios
    score_per_scen = [total / size for total, size in zip(score_per_scen, scenario_sizes)]
    score_per_file = fun(score_per_file, idx)
    print()

    # Display the scores
    for i in range(num_subscenarios):
        print(f"{name[i]:<40} {np.round(score_per_file[i], 2)}")
    print()

    for title, scen_score in zip(scenario_titles, score_per_scen):
        print("{:<40} {:.2f}".format(title, np.round(scen_score, 2)))

    print()
    en = np.mean(score_per_file[en_idx])
    zh = np.mean(score_per_file[zh_idx])
    print("{:<40} {:.2f}".format("EN", np.round(en, 2)))
    print("{:<40} {:.2f}".format("ZH", np.round(zh, 2)))
    print("{:<40} {:.2f}".format("ALL", np.round(np.mean(score_per_scen), 2)))

llama_minilongbench_scores.pkl

LongBench_narrativeqa                    0.03
LongBench_qasper                         0.11
LongBench_multifieldqa_en                0.23
LongBench_multifieldqa_zh                0.12
LongBench_hotpotqa                       0.07
LongBench_2wikimqa                       0.08
LongBench_musique                        0.15
LongBench_dureader                       0.04
LongBench_gov_report                     0.14
LongBench_qmsum                          0.05
LongBench_multi_news                     0.18
LongBench_vcsum                          0.12
LongBench_trec                           0.31
LongBench_triviaqa                       0.31
LongBench_samsum                         0.12
LongBench_lsht                           0.25
LongBench_passage_count                  0.0
LongBench_passage_retrieval_en           0.07
LongBench_passage_retrieval_zh           0.07
LongBench_lcc                            0.71
LongBench_repobench-p                    0.44

Si