This notebook fits the IRT model on the two datasets (KindsOfReasoning and HELM-Lite) once and for all, and saves the results in the `data_irt` folder.

Note that running this notebook may take a long time and require a substantial amount of RAM.

In [None]:
from src.reference_benchmark import SampleSelector
from src.results_loaders import load_reasoning, load_helm_lite


def compute_irt(train_df, train_llms, n_embeddings_truncate, irt_file_prefix,
                *, n_dimensions=10, epochs=2000):
    """Fit an IRT model on ``train_df`` and store its artefacts under ``data_irt/``.

    The embeddings in the ``openai_embeddings_large`` column are truncated to
    their first ``n_embeddings_truncate`` entries, a ``SampleSelector`` is built
    on the truncated embeddings, and ``compute_IRT_tiny_benchmarks`` is called
    on it with output paths derived from ``irt_file_prefix``.

    Note: ``train_df`` is modified in place -- a new column named
    ``openai_embeddings_subset`` is added to (or overwritten in) the dataframe
    that the caller passed in.

    Args:
        train_df: dataframe with an ``openai_embeddings_large`` column whose
            cells support slicing (``x[:n]``).
        train_llms: forwarded unchanged to ``SampleSelector``.
        n_embeddings_truncate: number of leading embedding entries to keep.
        irt_file_prefix: prefix of the files written to ``data_irt/``; the
            dataset goes to ``data_irt/<prefix>_irtdataset.jsonlines`` and the
            model to ``data_irt/<prefix>_irtmodel/``.
        n_dimensions: first positional argument of
            ``compute_IRT_tiny_benchmarks`` (the IRT "d"); defaults to 10.
        epochs: forwarded as ``epochs=``; defaults to 2000.

    Returns:
        None. The function is called for its side effects only.
    """
    import os

    output_dir = "data_irt"
    # Create the output folder up front so the run does not depend on it
    # already existing (NOTE(review): whether the IRT routine creates it
    # itself is not visible from here; exist_ok makes this a no-op if so).
    os.makedirs(output_dir, exist_ok=True)

    # truncate the embeddings (adds a column to the caller's dataframe)
    train_df["openai_embeddings_subset"] = train_df["openai_embeddings_large"].apply(
        lambda x: x[:n_embeddings_truncate])

    # define the selector on the truncated embeddings
    selector = SampleSelector(train_df, "openai_embeddings_subset", train_llms)

    # compute IRT and write the dataset / model to the output folder
    selector.compute_IRT_tiny_benchmarks(
        n_dimensions, epochs=epochs,
        dataset_name=f'{output_dir}/{irt_file_prefix}_irtdataset.jsonlines',
        model_name=f'{output_dir}/{irt_file_prefix}_irtmodel/')


In [None]:
# Number of leading entries of each embedding vector that compute_irt keeps
# (it slices every embedding with x[:n_embeddings_truncate]).
n_embeddings_truncate = 1024

## KindsOfReasoning

In [None]:
from src.utils import train_llms_reasoning as train_llms

In [None]:
# Fit and save one IRT model per KindsOfReasoning split. The split value is
# also embedded in the output prefix (e.g. "reasoning_False", "reasoning_OOD_1").
# NOTE(review): `False` presumably selects the default (non-OOD) split -- confirm
# against load_reasoning.
for split in (False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"):
    train_df, validation_df, test_df = load_reasoning(
        train_llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/kindsofreasoning_embeddings/",
    )

    print("reasoning", split)

    # only the training dataframe is passed on to the IRT computation
    compute_irt(
        train_df,
        train_llms,
        n_embeddings_truncate,
        f"reasoning_{split}",
    )

## HELM-Lite


In [None]:
from src.utils import train_llms_helm as train_llms

In [None]:
# Fit and save one IRT model per HELM-Lite split. The split value is also
# embedded in the output prefix (e.g. "helm_False", "helm_OOD_1").
# NOTE(review): `False` presumably selects the default (non-OOD) split -- confirm
# against load_helm_lite.
for split in (False, "OOD_1", "OOD_2", "OOD_3"):
    train_df, validation_df, test_df = load_helm_lite(
        train_llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/helm_lite_v1.0.0_embeddings/",
    )

    print("helm OOD LLMs", split)

    # only the training dataframe is passed on to the IRT computation
    compute_irt(
        train_df,
        train_llms,
        n_embeddings_truncate,
        f"helm_{split}",
    )
