In [1]:
from tqdm.notebook import tqdm
import os
import requests
import json
import pandas as pd

# HELM Lite release to fetch; interpolated into both the download URL and the
# name of the local results directory further down in this notebook.
version_to_run = "v1.0.0"
    
###################################################################################################################
# Local directory of run listings. In the cells shown here it is only
# referenced by the commented-out line below (the run list is taken from
# `helm_lite.csv` instead).
tasks_path = "data/filtered_run_path"
#tasks_list = os.listdir(os.path.join(tasks_path, version_to_run))

def get_json_from_url(url, timeout=60):
    """Fetch `url` with an HTTP GET and return its body decoded as JSON.

    Parameters
    ----------
    url : str
        Address of the JSON document.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the caller forever.

    Returns
    -------
    The decoded JSON value, or ``None`` when the request fails, the server
    answers with a 4xx/5xx status, or the body is not valid JSON. The error is
    printed rather than raised so that a batch of downloads can continue.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
        return response.json()
    # ValueError covers invalid-JSON bodies on `requests` releases whose
    # decode error does not derive from RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"An error occurred: {e}")
        return None
        
# When True, files that already exist on disk are downloaded again instead of
# being skipped by the download loop.
overwrite = False

In [2]:
# Table of HELM Lite runs; its `Run` column supplies the run names that are
# downloaded later in this notebook.
df = pd.read_csv('helm_lite.csv')


In [3]:
df.columns

Index(['Unnamed: 0', 'Run', 'Model', 'Groups', 'Adapter method',
       'Subject / Task'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Run,Model,Groups,Adapter method,Subject / Task
0,0,"commonsense:dataset=openbookqa,method=multiple...",01-ai/yi-34b,openbookqa,multiple_choice_joint,-
1,1,"commonsense:dataset=openbookqa,method=multiple...",01-ai/yi-6b,openbookqa,multiple_choice_joint,-
2,2,"commonsense:dataset=openbookqa,method=multiple...",AlephAlpha/luminous-base,openbookqa,multiple_choice_joint,-
3,3,"commonsense:dataset=openbookqa,method=multiple...",AlephAlpha/luminous-extended,openbookqa,multiple_choice_joint,-
4,4,"commonsense:dataset=openbookqa,method=multiple...",AlephAlpha/luminous-supreme,openbookqa,multiple_choice_joint,-


In [5]:
list(df.Model.unique())

['01-ai/yi-34b',
 '01-ai/yi-6b',
 'AlephAlpha/luminous-base',
 'AlephAlpha/luminous-extended',
 'AlephAlpha/luminous-supreme',
 'ai21/j2-grande',
 'ai21/j2-jumbo',
 'anthropic/claude-2.0',
 'anthropic/claude-2.1',
 'anthropic/claude-instant-1.2',
 'anthropic/claude-v1.3',
 'cohere/command',
 'cohere/command-light',
 'google/text-bison@001',
 'google/text-unicorn@001',
 'meta/llama-2-13b',
 'meta/llama-2-70b',
 'meta/llama-2-7b',
 'meta/llama-65b',
 'mistralai/mistral-7b-v0.1',
 'mistralai/mixtral-8x7b-32kseqlen',
 'openai/gpt-3.5-turbo-0613',
 'openai/gpt-4-0613',
 'openai/gpt-4-1106-preview',
 'openai/text-davinci-002',
 'openai/text-davinci-003',
 'tiiuae/falcon-40b',
 'tiiuae/falcon-7b',
 'writer/palmyra-x-v2',
 'writer/palmyra-x-v3']

In [6]:
df.Groups.value_counts()

Groups
math_chain_of_thought          210
legalbench                     150
mmlu                           150
wmt_14                         150
openbookqa                      30
gsm                             30
med_qa                          30
narrative_qa                    30
natural_qa_closedbook           30
natural_qa_openbook_longans     30
Name: count, dtype: int64

In [7]:
df[["Groups", "Subject / Task"]].value_counts()

Groups                       Subject / Task          
wmt_14                       -                           150
legalbench                   -                           150
openbookqa                   -                            30
natural_qa_openbook_longans  -                            30
natural_qa_closedbook        -                            30
narrative_qa                 -                            30
mmlu                         us_foreign_policy            30
                             econometrics                 30
                             computer_security            30
                             college_chemistry            30
gsm                          -                            30
med_qa                       -                            30
math_chain_of_thought        precalculus                  30
                             prealgebra                   30
                             number_theory                30
                             in

For each scenario, all models are run. MMLU and MATH are split into multiple subjects/tasks (see above). Legalbench is also split into 5 subsets (not recorded in the table above) and wmt_14 is split into 5 different (source_language, target_language) pairs.

So, for each LLM, the number of runs is:

In [8]:
840/30

28.0

Helm-Lite has 10 scenarios

v1.1.0 has 1 new model from Microsoft and an additional one from Mistral (32 models in v1.1.0 and 30 in v1.0.0).

There are 840 runs in v1.0.0 and 896 in v1.1.0: the 56 additional runs are the same 28 runs as before, repeated for each of the 2 new models (2 × 28 = 56).

For now I will download the v1.0.0 results, since v1.1.0 does not change much.

In [9]:
# Run identifiers, one per row of `df`; each one becomes a path component of
# the download URL and of the local save directory.
tasks_list = list(df.Run)

In [10]:
tasks_list

['commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2',
 'commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3',
 

In [None]:
import os, sys
# Current working directory; the results directory is built relative to it.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# `wd` is required by the next cell — make sure it is run first.
wd = os.getcwd()

In [11]:
# Base URL of the public HELM Lite bucket for the selected release; the run
# name and file name are appended to it when downloading.
template_url = f"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/{version_to_run}"
# Local root under which one sub-directory per run is created.
save_dir = f"{wd}/../../results/helm_lite_{version_to_run}"

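The following cell downloads the raw data (`scenario_state.json` and `display_predictions.json`) for every run, skipping files that are already on disk.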

In [12]:
# Download the selected JSON files of every run into
# `{save_dir}/{task}/{file_type}.json`. A file is fetched when it is missing,
# when `overwrite` is set, or when the copy on disk holds no usable data.
for tasks in [tasks_list]:
    
    for task in tqdm(tasks):
        cur_save_dir = f"{save_dir}/{task}"
        os.makedirs(cur_save_dir, exist_ok=True)

        for file_type in [
                # "run_spec",
                # "stats",
                # "per_instance_stats",
                # "instances",
                "scenario_state",  # this file contains all the useful bits
                "display_predictions",
                # "display_requests",
                # "scenario",
        ]:
            save_path = f"{cur_save_dir}/{file_type}.json"
            if overwrite or not os.path.exists(save_path):
                # Nothing to inspect: avoid parsing a (possibly large) file
                # that is about to be replaced anyway.
                download = True
            else:
                try:
                    with open(save_path) as f:
                        data = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    # A truncated or corrupt file (e.g. from an interrupted
                    # run) is treated like a missing one.
                    data = None
                # A stored `null` marks an earlier failed download: retry it.
                download = data is None
                                
            #https://storage.googleapis.com/crfm-helm-public/benchmark_output/runs/v0.2.2/babi_qa:task=15,model=AlephAlpha_luminous-base/scenario_state.json
            
            if download:            
                cur_url = f"{template_url}/{task}/{file_type}.json"
                print("download ",  cur_url)
                data = get_json_from_url(cur_url)
                # A failed download yields None, which is written as JSON
                # `null` so that the check above retries it on the next run.
                # The `with` block guarantees the file is flushed and closed.
                with open(save_path, "w") as f:
                    json.dump(data, f, indent=2)

  0%|          | 0/840 [00:00<?, ?it/s]

download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=anthropic_claude-v1.3/display_predictions.json
download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=cohere_command/display_predictions.json
download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=cohere_command-light/display_predictions.json
download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=google_text-bison@001/display_predictions.json
download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=google_text-unicorn@001/display_predictions.json
download  https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/natural_qa:mode=closedbook,model=meta_llama-2-13b/disp