# Perform representation learning for test samples

In this notebook, we show how to learn representations for the test samples, i.e., the individual LongBench test cases.

## Prepare data

In [1]:
import numpy as np
import pickle
from tqdm import tqdm
from irt import *
from utils import *



In [2]:
# Key of the scenario this notebook works on; used below to index `scenarios_position`.
to_handle_scenario = 'longbench'
# Display the scenario -> sub-scenario mapping as the cell output.
# NOTE(review): `scenarios` is not defined in this notebook; presumably it is
# provided by one of the star imports (`irt` / `utils`) — confirm.
scenarios

{'longbench': ['LongBench_2wikimqa',
  'LongBench_dureader',
  'LongBench_gov_report',
  'LongBench_hotpotqa',
  'LongBench_lcc',
  'LongBench_lsht',
  'LongBench_multifieldqa_en',
  'LongBench_multifieldqa_zh',
  'LongBench_multi_news',
  'LongBench_musique',
  'LongBench_narrativeqa',
  'LongBench_passage_count',
  'LongBench_passage_retrieval_en',
  'LongBench_passage_retrieval_zh',
  'LongBench_qasper',
  'LongBench_qmsum',
  'LongBench_repobench-p',
  'LongBench_samsum',
  'LongBench_trec',
  'LongBench_triviaqa',
  'LongBench_vcsum']}

Loading longbench test data:

In [3]:
# Load the pickled LongBench results from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/longbench.pickle', 'rb') as handle:
    data = pickle.load(handle)
# Model names as an array so they can be indexed with the train/test index lists later.
names = np.array(data['models'])

Below, we will process the data so all correctness scores (for all scenarios) are stored in $Y$. The dictionaries `scenarios_position` and `subscenarios_position` give the position of scenarios/subscenarios correctness scores in $Y$.

In [4]:
# Positions of each scenario / sub-scenario's correctness scores inside Y.
scenarios_position, subscenarios_position = prepare_data(scenarios, data)

# Response matrix holding the correctness scores of all scenarios.
Y = create_responses(scenarios, data)

# Last expression of the cell: displayed as its output.
Y.shape

(40, 4750)

Below you can see the mean score over all models and test cases of LongBench:

In [5]:
# Correctness scores of every model (rows) on the LongBench test cases (columns).
# The original cell started with a bare expression that built these slices and
# discarded them (only the last expression of a cell is displayed); it is removed.
tmp = Y[:,scenarios_position[to_handle_scenario]]

# Mean over the whole array, computed once. The result is a NumPy scalar,
# which is why its shape is the empty tuple `()`.
mean_score = np.mean(tmp)
mean_score, mean_score.shape

(0.28705801919587143, ())

## Split the data into train and test

In [6]:
# Hard-coded split of the row indices of Y (one row per model) into two halves.
train_idx = [32, 30, 25, 8, 0, 26, 7, 29, 12, 9, 23, 21, 1, 39, 6, 14, 11, 27, 20, 10]
test_idx = [13, 34, 3, 38, 22, 24, 35, 37, 5, 4, 2, 19, 28, 33, 17, 18, 15, 36, 16, 31]

# Guard against leakage if the lists are ever edited by hand:
# the same model must not appear in both splits.
assert set(train_idx).isdisjoint(test_idx), "train_idx and test_idx overlap"

Y_test = Y[test_idx]
Y_train = Y[train_idx]

print(Y.shape, Y_train.shape, Y_test.shape)

(40, 4750) (20, 4750) (20, 4750)


In [7]:
# Echo the split indices.
print(train_idx, test_idx)

# Model names of each split, sorted with np.sort.
names_train = np.sort(names[train_idx])
names_test = np.sort(names[test_idx])

# Print both lists with one blank line between them.
print(names_train, names_test, sep='\n\n')

[32, 30, 25, 8, 0, 26, 7, 29, 12, 9, 23, 21, 1, 39, 6, 14, 11, 27, 20, 10] [13, 34, 3, 38, 22, 24, 35, 37, 5, 4, 2, 19, 28, 33, 17, 18, 15, 36, 16, 31]
['ALMA-7B-Ja-V2' 'GOAT-7B-Community' 'Koss-7B-chat'
 'Kunoichi-7BLlama-2-7b-chat-hf-10-sparsity' 'Llama-2-7b-ft-instruct-es'
 'Mistral-7B-Instruct-v0.2-sparsity-20' 'OLMo-1B' 'airoboros-7b'
 'gemma2-9b-hf' 'giraffe-7b' 'llama-2-7b-hf' 'llama-7b-hf'
 'mistral-7b-v0.1-hf' 'perry-7b' 'qwen-7b-hf' 'qwen1.5-0.5b-hf'
 'qwen2-0.5b-hf' 'qwen2.5-0.5b-base' 'qwen2.5-1.5b-base' 'tulu-7B-fp16']

['Distilled-HermesChat-7B' 'Loyal-Macaroni-Maid-7B' 'OLMo-1B-SFT'
 'StopCarbon-10.7B-v6' 'Synatra-RP-Orca-2-7b-v0.1' 'TowerInstruct-7B-v0.1'
 'amd-llama-135m' 'amd-llama-135m-code' 'gemma2-2b-hf' 'llama-3-8b-hf'
 'llama-30b' 'llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged'
 'llama-shishya-7b-ep3-v1' 'manatee-7b' 'mistral-7b-v0.3-hf'
 'qwen1.5-1.8b-hf' 'qwen2.5-0.5b-instruct-hf' 'qwen2.5-3b-base'
 'qwen2.5-3b-instruct-hf' 'recycled-wizardlm-7b-v2.0']


## Binarize all scores

In [8]:
# Binarized (0/1) versions of the response matrices, same shapes as the originals.
Y_bin_train = np.zeros(Y_train.shape)
Y_bin_test = np.zeros(Y_test.shape)

cs = np.linspace(0.01,.99,100)  # Threshold values to consider
for scenario in scenarios.keys():
    ind = scenarios_position[scenario]
    # Slice the training scores and take their per-row means once per scenario:
    # neither depends on the candidate threshold, so they are hoisted out of the search.
    scores_train = Y_train[:,ind]
    mean_scores_train = scores_train.mean(axis=1)
    # For each candidate threshold, measure the mean absolute gap between the
    # per-row average of the binarized scores and the per-row average of the
    # original scores. The loop variable is named `thr` so it does not shadow
    # the selected threshold `c`.
    gaps = [np.mean(np.abs((scores_train>thr).mean(axis=1)-mean_scores_train)) for thr in tqdm(cs)]
    # Find the best threshold value that minimizes the difference between averages
    c = cs[np.argmin(gaps)]
    # Apply the threshold (chosen on the training rows only) to train and test responses
    Y_bin_train[:,ind] = (scores_train>c).astype(int)
    Y_bin_test[:,ind] = (Y_test[:,ind]>c).astype(int)

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1354.12it/s]


## Data preprocessing

Obtain the OpenAI embeddings for the 4750 test cases of LongBench, and then reduce their dimensionality (from 1024 to 10) using PCA.

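Due to the large size of the embeddings, the full data cannot be uploaded; only the results after PCA are provided here. The cell below is therefore commented out and kept only to document how the PCA file was produced.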

In [9]:
# This cell is intentionally left commented out: it needs the raw embeddings file,
# which is not provided (too large to upload). It is kept to document how the
# PCA-reduced embeddings file was produced.
# NOTE(review): the paths below use Windows-style separators (`data\\...`), unlike
# the forward-slash paths used elsewhere in this notebook — adjust if re-running
# on another OS.
# with open(f'data\\longbench_embeddings.pkl', 'rb') as f:
#     longbench_embeddings = pickle.load(f)
# print(longbench_embeddings.shape)
# from sklearn.decomposition import PCA
# pca = PCA(n_components=10)
# longbench_embeddings = pca.fit_transform(longbench_embeddings)
# print(longbench_embeddings.shape)
# with open(f'data\\longbench_embeddings_pca.pkl', 'wb') as f:
#     pickle.dump(longbench_embeddings, f)

(4750, 1024)
(4750, 10)


## Representation learning

To learn the representation for test samples, we use an adapted version of the `py-irt` code.

The following models will read `data\longbench_embeddings_pca.pkl` during initialization to set up the representations for test samples.

In [10]:
# --- IRT training configuration ---
# Dimensionality handed to the IRT model.
# NOTE(review): equals the 10 PCA components of the embeddings; presumably the two must match — confirm.
D = 10
device = 'cuda'  # Either 'cuda' or 'cpu'
epochs = 2000  # Number of epochs for IRT model training
lr = .1  # Learning rate for IRT model training

# Saving the training dataset in the needed format
irt_dataset_path = 'data/irt_dataset.jsonlines'
create_irt_dataset(Y_bin_train, irt_dataset_path)

# Train on the file written above, then read the parameters from the model directory.
train_irt_model(
    dataset_name=irt_dataset_path,
    model_name='data/new_irt_model',
    D=D,
    lr=lr,
    epochs=epochs,
    device=device,
)
A, B, Theta = load_irt_parameters('data/new_irt_model/')

# Show the parameter shapes, then the values of B.
print(A.shape, B.shape, Theta.shape)
print(B)

(1, 10, 4750) (1, 1, 4750) (20, 10, 1)
[[[0.14152852 0.64271206 0.6490193  ... 0.70366305 0.92900026 0.70741272]]]
