# Directly calculate the scores of LLMs on MiniLongBench

In this notebook, we show how to obtain MiniLongBench scores directly.

## Prepare data

In [1]:
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from irt import *
from utils import *



In [2]:
# Key of the benchmark processed throughout this notebook.
to_handle_scenario = "longbench"
# Bare expression so the notebook renders the benchmark -> sub-dataset mapping.
# NOTE(review): `scenarios` is not defined in this notebook; presumably it is
# provided by one of the star-imports (irt / utils) — confirm.
scenarios

{'longbench': ['LongBench_2wikimqa',
  'LongBench_dureader',
  'LongBench_gov_report',
  'LongBench_hotpotqa',
  'LongBench_lcc',
  'LongBench_lsht',
  'LongBench_multifieldqa_en',
  'LongBench_multifieldqa_zh',
  'LongBench_multi_news',
  'LongBench_musique',
  'LongBench_narrativeqa',
  'LongBench_passage_count',
  'LongBench_passage_retrieval_en',
  'LongBench_passage_retrieval_zh',
  'LongBench_qasper',
  'LongBench_qmsum',
  'LongBench_repobench-p',
  'LongBench_samsum',
  'LongBench_trec',
  'LongBench_triviaqa',
  'LongBench_vcsum']}

Load the LongBench test data:

In [3]:
# Read the pickled LongBench test data into `data`.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
with open('data/longbench.pickle', 'rb') as longbench_file:
    data = pickle.load(longbench_file)

In [4]:
# Build the index maps and the response matrix from the loaded data.
# NOTE(review): prepare_data / create_responses are not defined in this
# notebook (presumably they come from the star-imports) — their exact contract
# should be confirmed there. What the cells below rely on:
#   * scenarios_position[<benchmark>]              -> column indices into Y
#   * subscenarios_position[<benchmark>][<subset>] -> column indices into Y
#   * Y is indexed as Y[rows, columns]; rows are selected by the train/test
#     index lists and columns by the position maps above.
scenarios_position, subscenarios_position = prepare_data(scenarios, data)
Y = create_responses(scenarios, data)

# Display the shape of the response matrix as the cell output.
Y.shape

(40, 4750)

In [5]:
# Per-item weights that balance the benchmark score.
#
# Item j of sub-dataset `sub` gets the weight  N / (num * n_cat * n_i)  where
#   N     = number of items in the whole benchmark,
#   n_i   = number of items in `sub`,
#   num   = number of sub-datasets in the task category `sub` belongs to,
#   n_cat = number of task categories.
# With these weights, the plain mean of (balance_weights * Y) over all N items
# equals: average the sub-dataset means inside each category, then average the
# categories -- so neither large sub-datasets nor large categories dominate.
balance_weights = np.ones(Y.shape[1])

# per_scen[i] is the task-category id of the i-th sub-dataset, in the order of
# scenarios[to_handle_scenario]:
#   0 = single-document QA, 1 = multi-document QA, 2 = summarization,
#   3 = few-shot learning,  4 = code completion,   5 = synthetic tasks.
# (Same coding as the `per_scen` list used by the scoring cell further down.)
per_scen = [1, 1, 2, 1, 4, 3, 0, 0, 2, 1, 0, 5, 5, 5, 0, 2, 4, 3, 3, 3, 2]

sub_names = scenarios[to_handle_scenario]
# Fail loudly if the hand-written category list drifts out of sync with the data.
assert len(per_scen) == len(sub_names), "per_scen needs exactly one entry per sub-dataset"

# Number of sub-datasets per category, derived from per_scen instead of being
# hard-coded: [4, 4, 4, 4, 2, 3].
subs_per_category = np.bincount(per_scen)
n_categories = len(subs_per_category)

N = len(scenarios_position[to_handle_scenario])
n_sub = len(sub_names)
for i, sub in enumerate(sub_names):
    # Plain Python int keeps the arithmetic identical to integer literals.
    num = int(subs_per_category[per_scen[i]])
    n_i = len(subscenarios_position[to_handle_scenario][sub])
    balance_weights[subscenarios_position[to_handle_scenario][sub]] = N / (num * n_categories * n_i)

## Split the data into train and test sets

In [6]:
# Fixed row indices of Y that form the train and test splits.
train_idx = [
    32, 30, 25, 8, 0, 26, 7, 29, 12, 9,
    23, 21, 1, 39, 6, 14, 11, 27, 20, 10,
]
test_idx = [
    13, 34, 3, 38, 22, 24, 35, 37, 5, 4,
    2, 19, 28, 33, 17, 18, 15, 36, 16, 31,
]

# Indexing with a list copies the selected rows, so each array below is
# independent of Y. Y_test_set is a second copy of the test rows; it is the
# one read by the scoring cell further down.
Y_train = Y[train_idx]
Y_test_set = Y[test_idx]
Y_test = Y[test_idx]

# Shapes of the full matrix and of both splits, space-separated on one line.
print(*(split.shape for split in (Y, Y_train, Y_test)))

(40, 4750) (20, 4750) (20, 4750)


## Calculate the scores of LLMs on the MiniLongBench benchmark


In [7]:
from scipy.stats import spearmanr, kendalltau

# Task category -> LongBench sub-datasets. The position of a category in this
# dict is the category id used by `per_scen` below
# (0 = Single-Document QA, ..., 4 = Code Completion, 5 = Synthetic Task).
# multi_news is a summarization task and samsum a few-shot task, which is also
# how `per_scen` classifies them.
scenario_dict = {
    "Single-Document QA": ["LongBench_narrativeqa", "LongBench_qasper", "LongBench_multifieldqa_en", "LongBench_multifieldqa_zh"],
    "Multi-Document QA": ["LongBench_hotpotqa", "LongBench_2wikimqa", "LongBench_musique", "LongBench_dureader"],
    "Summarization": ["LongBench_gov_report", "LongBench_qmsum", "LongBench_vcsum", "LongBench_multi_news"],
    "Few-shot Learning": ["LongBench_trec", "LongBench_lsht", "LongBench_triviaqa", "LongBench_samsum"],
    "Code Completion": ["LongBench_lcc", "LongBench_repobench-p"],
    "Synthetic Task": ["LongBench_passage_count", "LongBench_passage_retrieval_en", "LongBench_passage_retrieval_zh"],
}

# The objects in this section are not read by the scoring loop below; they are
# kept so that anything else relying on them keeps working.
# NOTE(review): load_irt_parameters is not defined in this notebook — presumably
# it returns the per-item IRT parameters; confirm in the irt module.
A, B, _ = load_irt_parameters('data/irt_model/')
X = np.vstack((A.squeeze(), B.squeeze().reshape((1,-1)))).T
X = X[scenarios_position['longbench']]
norm_balance_weights = balance_weights[scenarios_position['longbench']]
# Out-of-place division: `balance_weights` is never rescaled through a view,
# whatever kind of index scenarios_position holds.
norm_balance_weights = norm_balance_weights / norm_balance_weights.sum()
scenario = 'longbench'
with open('data/sub_scenarios_pospos.pkl', 'rb') as f:
    sub_scenarios_pospos = pickle.load(f)

whole_sp, test_sp, train_sp = [], [], []
whole_error, test_error, train_error = [], [], []
whole_sub_sp = [[] for _ in range(6)]
test_sub_sp = [[] for _ in range(6)]
train_sub_sp = [[] for _ in range(6)]
whole_sub_score = [[] for _ in range(6)]
test_sub_score = [[] for _ in range(6)]
train_sub_score = [[] for _ in range(6)]
whole_sub_pred_score = [[] for _ in range(6)]
test_sub_pred_score = [[] for _ in range(6)]
train_sub_pred_score = [[] for _ in range(6)]

ratio = 0.95
# Sub-dataset names in the iteration order of subscenarios_position.
# NOTE(review): `start` and `per_scen` below assume this is the same order as
# scenarios['longbench'] — confirm.
name = list(subscenarios_position['longbench'].keys())
number_item = int((1-ratio) * 4750)

corelation = []                    # overall Spearman rho: [all, test, train]
sub_sp = [[] for _ in range(6)]    # per-category Spearman rho, same order

# ---------------------------------------------------------------------------
# Loop-invariant setup: the anchor items do not depend on which LLMs are
# evaluated, so the file is read and the items are bucketed only once.
# ---------------------------------------------------------------------------

# Load the test cases of MiniLongBench
with open("data/anchor.pkl", "rb") as f:
    anchor_points = pickle.load(f)

# Column boundaries of the 21 sub-datasets inside Y: sub-dataset i is assumed
# to occupy columns start[i] .. start[i + 1] - 1.
start = [0, 200, 400, 600, 800, 1300, 1500, 1650, 1850, 2050, 2250, 2450, 2650, 2850, 3050, 3250, 3450, 3950, 4150, 4350, 4550, 4750]
# per_scen[i] is the task-category id (see scenario_dict) of sub-dataset i
per_scen = [1, 1, 2, 1, 4, 3, 0, 0, 2, 1, 0, 5, 5, 5, 0, 2, 4, 3, 3, 3, 2]
# Sub-datasets per category, derived from per_scen: [4, 4, 4, 4, 2, 3].
files_per_scen = np.bincount(per_scen, minlength=6)

# Bucket every anchor item into the sub-dataset whose column range contains it.
# Items at or beyond the last boundary fall into no bucket, exactly as before.
anchor_per_file = [[] for _ in range(21)]
for item in sorted(anchor_points['longbench']):
    for fi, s in enumerate(start[1:]):
        if item < s:
            anchor_per_file[fi].append(item)
            break

# A sub-dataset without anchors would silently turn every score into NaN.
missing_anchor_files = [name[i] for i in range(21) if len(anchor_per_file[i]) == 0]
if missing_anchor_files:
    raise ValueError(f"data/anchor.pkl contains no anchor items for: {missing_anchor_files}")

# e == 0: all LLMs, e == 1: test LLMs, e == 2: train LLMs.
# A dedicated loop variable is used so that Y_test (the test split defined
# earlier in the notebook) is not silently rebound to the train split.
for e, Y_eval in enumerate((Y, Y_test_set, Y_train)):
    n_models = Y_eval.shape[0]
    scores_per_file = []
    anchor_scores_per_file = []
    scores_per_scen = [np.zeros(n_models) for _ in range(6)]
    anchor_scores_per_scen = [np.zeros(n_models) for _ in range(6)]
    # MiniLongBench score of every LLM: anchor means averaged inside each
    # category and then across the 6 categories.
    summm = np.zeros(n_models)

    for i in range(21):
        # Each mean is computed once and reused below.
        full_mean = Y_eval[:, subscenarios_position['longbench'][name[i]]].mean(axis=1)
        anchor_mean = Y_eval[:, anchor_per_file[i]].mean(axis=1)

        scores_per_file.append(full_mean)
        anchor_scores_per_file.append(anchor_mean)
        scores_per_scen[per_scen[i]] += full_mean
        anchor_scores_per_scen[per_scen[i]] += anchor_mean
        # Same arithmetic as dividing by 2, 3 or 4 and then by 6, but the
        # divisor comes from per_scen instead of hard-coded file indices.
        summm += anchor_mean / int(files_per_scen[per_scen[i]]) / 6

    if e == 0:
        whole_sub_score = scores_per_scen
        whole_sub_pred_score = anchor_scores_per_scen
    elif e == 1:
        test_sub_score = scores_per_scen
        test_sub_pred_score = anchor_scores_per_scen
    else:
        train_sub_score = scores_per_scen
        train_sub_pred_score = anchor_scores_per_scen

    # Reference score on the full benchmark: balance-weighted mean over all items.
    Y_true = (balance_weights*Y_eval)[:,scenarios_position[scenario]].mean(axis=1)
    spearman_corr, spearman_p_value = spearmanr(summm, Y_true)
    corelation.append(spearman_corr)
    # Per-category agreement between full-benchmark and anchor-only scores.
    for i in range(6):
        spearman_corr, spearman_p_value = spearmanr(scores_per_scen[i], anchor_scores_per_scen[i])
        sub_sp[i].append(spearman_corr)

print("The rank correlation between all LLMs on MiniLongBench and LongBench", corelation[0])
print("The rank correlation between test LLMs on MiniLongBench and LongBench", corelation[1])
print("The rank correlation between train LLMs on MiniLongBench and LongBench", corelation[2])
print(sub_sp)

The rank correlation between all LLMs on MiniLongBench and LongBench 0.9485928705440902
The rank correlation between test LLMs on MiniLongBench and LongBench 0.9368421052631578
The rank correlation between train LLMs on MiniLongBench and LongBench 0.9458646616541352
[[0.9500938086303942, 0.9654135338345864, 0.9067669172932331], [0.9724202626641651, 0.9654135338345864, 0.9533834586466164], [0.9183864915572235, 0.9428571428571428, 0.8947368421052632], [0.9369606003752347, 0.9834586466165413, 0.8511278195488721], [0.95422138836773, 0.9548872180451128, 0.9278195488721804], [0.8418633099271289, 0.806015037593985, 0.8266266097929232]]
