In [43]:
import pandas as pd
import json
import os

## Extraction Info

### raw.csv
For each benchmark there is a `raw.csv` file containing all collected metrics.
From this file, the following information can be extracted for each episode:
- game
- model
- episode
- experiment
- Aborted
- Lose
- Success

### requests.json
For each benchmark > model > game > experiment > episode there is a `requests.json` file containing the requests sent to the models and the answers received.
The following information can be extracted for each episode:
- inputs
- responses

### instance.json
For each benchmark > model > game > experiment > episode there is an `instance.json` file containing metadata for the episode,
e.g. for taboo it is the target word along with the taboo words, and for wordle it is the target word along with a clue for the word.
The metadata will be extracted completely and stored as an additional column.


In [44]:
def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    """Pivot the long-format metric table into one row per episode.

    Args:
        data: Long-format frame with the columns ``game``, ``model``,
            ``experiment``, ``episode``, ``metric`` and ``value``.
        columns_to_keep: Names of the columns to retain after pivoting
            (identifier columns as well as metric names). Names that do not
            occur in the pivoted frame are ignored.

    Returns:
        A new frame with one row per (game, model, experiment, episode)
        combination that contains only the requested columns, in the order
        produced by the pivot.
    """
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    wanted: set = set(columns_to_keep)
    columns_to_drop: list = [column for column in df.columns if column not in wanted]
    # `columns=` already names the axis; the extra `axis=0` that used to be passed
    # here contradicted it and was ignored by pandas.
    return df.drop(columns=columns_to_drop)

In [68]:
def prepare_requests_json(path: str, input_parser: callable, output_parser: callable, model_name: str) -> dict:
    """Extract the prompts and answers logged in one episode's ``requests.json``.

    Args:
        path: Location of the requests file; it is loaded with ``pd.read_json``.
        input_parser: Maps a row's ``manipulated_prompt_obj`` to the request to store.
        output_parser: Maps a row's ``raw_response_obj`` to the response to store.
        model_name: Model identifier; only used in the diagnostic output.

    Returns:
        ``{'requests': [...], 'responses': [...]}`` with one entry per row of
        the file (both lists always have the same length), or ``None`` when a
        row could not be parsed. Callers have to handle the ``None`` case.
    """
    data: pd.DataFrame = pd.read_json(path)
    parsed_requests: list = []
    parsed_responses: list = []

    for _, row in data.iterrows():
        try:
            # Parse both sides before storing either, so the two lists cannot get out of
            # step. This makes a separate length check after the loop unnecessary.
            parsed_request = input_parser(row.manipulated_prompt_obj)
            parsed_response = output_parser(row.raw_response_obj)
        except Exception as e:
            # Deliberate best effort: a parser that does not fit the logged format is
            # reported (together with the file it failed on) and the episode is given up,
            # instead of aborting the whole extraction run.
            print(e)
            print(model_name)
            print(path)
            return None
        parsed_requests.append(parsed_request)
        parsed_responses.append(parsed_response)

    return {'requests': parsed_requests, 'responses': parsed_responses}

In [69]:
def prepare_instance_data(path: str) -> dict:
    """Load the metadata stored in an episode's ``instance.json``.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON content.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which can mis-decode non-ASCII words on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [70]:
# Benchmark versions that the extraction loop below iterates over.
benchmark_versions_old: list = ['v0.9', 'v1.0']
interaction_response_token_old: list = ['completion', 'response']
# Pair every old version with the token at the same list position.
lookup_response_token_old: dict = dict(zip(benchmark_versions_old, interaction_response_token_old))
benchmark_versions_new: list = [
    'v1.5',
    'v1.5_quantized',
    'v1.6',
    'v1.6_backends',
    'v1.6_quantized',
]
# Columns retained from the pivoted raw.csv.
columns_to_keep_raw_csv: list = [
    'game', 'model', 'experiment', 'episode',
    'Aborted', 'Lose', 'Success',
]
# imagegame is not listed here.
games: list = [
    'privateshared',
    'referencegame',
    'taboo',
    'wordle',
    'wordle_withclue',
    'wordle_withcritic',
]


In [73]:
## Requests
def gpt_x_in(x):
    """Return the content of the first message whose role is 'user'."""
    user_contents = [t['content'] for t in x if t['role'] == 'user']
    return user_contents[0]


def is_as_string(x):
    """Return the request object unchanged."""
    return x


def inputs(x):
    """Return the value stored under 'inputs'."""
    return x['inputs']


def prompt(x):
    """Return the value stored under 'prompt'."""
    return x['prompt']


## Responses
def completion(x):
    """Return the value stored under 'completion'."""
    return x['completion']


def completion_luminus(x):
    """Return the 'completion' of the first entry in 'completions'."""
    first_completion = x['completions'][0]
    return first_completion['completion']


def response(x):
    """Return the value stored under 'response'."""
    return x['response']


def gpt_x_out(x):
    """Return the list of message contents of all choices whose role is 'assistant'."""
    return [t['message']['content'] for t in x['choices'] if t['message']['role'] == 'assistant']


def choices_text(x):
    """Return the 'text' of the first choice."""
    first_choice = x['choices'][0]
    return first_choice['text']


def choices_content(x):
    """Return the list of message contents of all choices whose role is 'assistant'."""
    return [t['message']['content'] for t in x['choices'] if t['message']['role'] == 'assistant']


def text(x):
    """Return the value stored under 'text'."""
    return x['text']


def content(x):
    """Return the value stored under 'content'."""
    return x['content']

# Model name as used in the `model` column of raw.csv -> parser for the logged
# request ('request') and parser for the logged answer ('response').
input_output_mapping: dict = {
    'sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-llama-2-70b-v1.1-t0.0': {'request': inputs, 'response': response},
    'gpt-4-t0.0--gpt-3.5-turbo-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'luminous-supreme-t0.0--luminous-supreme-t0.0': {'request': prompt, 'response': completion_luminus},
    'vicuna-7b-v1.5-t0.0--vicuna-7b-v1.5-t0.0': {'request': inputs, 'response': response},
    'llama-2-13b-chat-hf-t0.0--llama-2-13b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0': {'request': inputs, 'response': response},
    'gpt-3.5-turbo-t0.0--gpt-4-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'mistral-medium-t0.0--mistral-medium-t0.0': {'request': gpt_x_in, 'response': choices_content},
    'Mixtral-8x7B-Instruct-v0.1-t0.0--Mixtral-8x7B-Instruct-v0.1-t0.0': {'request': inputs, 'response': response},
    'koala-13b-t0.0--koala-13b-t0.0': {'request': inputs, 'response': response},
    'llama-2-7b-chat-hf-t0.0--llama-2-7b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'gpt-3.5-turbo-0613-t0.0--gpt-3.5-turbo-0613-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-4-0314-t0.0--gpt-4-0314-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'deepseek-llm-7b-chat-t0.0--deepseek-llm-7b-chat-t0.0': {'request': inputs, 'response': response},
    'koala-13B-HF-t0.0--koala-13B-HF-t0.0': {'request': inputs, 'response': response},
    'vicuna-33b-v1.3-t0.0--vicuna-33b-v1.3-t0.0': {'request': inputs, 'response': response},
    'claude-2.1-t0.0--claude-2.1-t0.0': {'request': is_as_string, 'response': completion},
    'Mistral-7B-Instruct-v0.1-t0.0--Mistral-7B-Instruct-v0.1-t0.0': {'request': inputs, 'response': response},
    'gpt4all-13b-snoozy-t0.0--gpt4all-13b-snoozy-t0.0': {'request': inputs, 'response': response},
    'gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'llama-2-70b-chat-hf-t0.0--llama-2-70b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'Wizard-Vicuna-13B-Uncensored-HF-t0.0--Wizard-Vicuna-13B-Uncensored-HF-t0.0': {'request': inputs, 'response': response},
    'openchat_3.5-t0.0--openchat_3.5-t0.0': {'request': inputs, 'response': response},
    'CodeLlama-34b-Instruct-hf-t0.0--CodeLlama-34b-Instruct-hf-t0.0': {'request': inputs, 'response': response},
    'sheep-duck-llama-2-13b-t0.0--sheep-duck-llama-2-13b-t0.0': {'request': inputs, 'response': response},
    'deepseek-llm-67b-chat-t0.0--deepseek-llm-67b-chat-t0.0': {'request': inputs, 'response': response},
    'vicuna-13b-v1.5-t0.0--vicuna-13b-v1.5-t0.0': {'request': inputs, 'response': response},
    'claude-2-t0.0--claude-2-t0.0': {'request': is_as_string, 'response': response},
    'zephyr-7b-alpha-t0.0--zephyr-7b-alpha-t0.0': {'request': inputs, 'response': response},
    'gpt-4-0613-t0.0--gpt-4-0613-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'SUS-Chat-34B-t0.0--SUS-Chat-34B-t0.0': {'request': inputs, 'response': response},
    'tulu-2-dpo-70b-t0.0--tulu-2-dpo-70b-t0.0': {'request': inputs, 'response': response},
    'zephyr-7b-beta-t0.0--zephyr-7b-beta-t0.0': {'request': inputs, 'response': response},
    'text-davinci-003-t0.0--text-davinci-003-t0.0': {'request': is_as_string, 'response': choices_text},
    'openchat-3.5-0106-t0.0--openchat-3.5-0106-t0.0': {'request': inputs, 'response': response},
    'Nous-Hermes-2-Mixtral-8x7B-DPO-t0.0--Nous-Hermes-2-Mixtral-8x7B-DPO-t0.0': {'request': inputs, 'response': response},
    'claude-v1.3-t0.0--claude-v1.3-t0.0': {'request': is_as_string, 'response': completion},
    'oasst-12b-t0.0--oasst-12b-t0.0': {'request': inputs, 'response': response},
    'tulu-2-dpo-7b-t0.0--tulu-2-dpo-7b-t0.0': {'request': inputs, 'response': response},
    'openchat-3.5-1210-t0.0--openchat-3.5-1210-t0.0': {'request': inputs, 'response': response},
    'command-t0.0--command-t0.0': {'request': is_as_string, 'response': text},
    'oasst-sft-4-pythia-12b-epoch-3.5-t0.0--oasst-sft-4-pythia-12b-epoch-3.5-t0.0': {'request': inputs, 'response': response},
    'WizardLM-13b-v1.2-t0.0--WizardLM-13b-v1.2-t0.0': {'request': inputs, 'response': response},
    'falcon-40b-t0.0--falcon-40b-t0.0': {'request': is_as_string, 'response': response},
    'vicuna-13b-t0.0--vicuna-13b-t0.0': {'request': inputs, 'response': response},
    'gpt-4-t0.0--gpt-4-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'falcon-7b-instruct-t0.0--falcon-7b-instruct-t0.0': {'request': is_as_string, 'response': response},
    'claude-instant-1.2-t0.0--claude-instant-1.2-t0.0': {'request': is_as_string, 'response': response},
    'Yi-34B-Chat-t0.0--Yi-34B-Chat-t0.0': {'request': inputs, 'response': response},
}


In [74]:
for benchmark_version in benchmark_versions_old:
    # Long-format metric table of this benchmark version.
    raw_csv_data: pd.DataFrame = pd.read_csv(f'./{benchmark_version}/raw.csv')

    # Pivot to one row per episode and keep only the configured columns.
    clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_csv_data, columns_to_keep=columns_to_keep_raw_csv)

    for index, row in clean_csv_data.iterrows():
        # imagegame episodes are not processed here.
        if row.game == 'imagegame':
            continue

        # Both files are expected in the episode's own directory.
        episode_dir: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}'
        path_requests_json: str = f'{episode_dir}/requests.json'
        path_instance_json: str = f'{episode_dir}/instance.json'

        # A model without an entry in the mapping raises KeyError here.
        model_parsers: dict = input_output_mapping[row.model]
        parse_request: callable = model_parsers['request']
        parse_response: callable = model_parsers['response']

        # Stop if either path was built incorrectly.
        assert os.path.isfile(path_requests_json)
        assert os.path.isfile(path_instance_json)

        # NOTE(review): the parsed interactions are not stored; only the diagnostics
        # printed by prepare_requests_json are visible. Presumably a dry run of the parsers - confirm.
        prepare_requests_json(
            path=path_requests_json,
            input_parser=parse_request,
            output_parser=parse_response,
            model_name=row.model,
        )

        instance_data: dict = prepare_instance_data(path=path_instance_json)

In [89]:
# Single requests file that the following cells inspect by hand.
sample_requests_path: str = './v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/wordle/0_high_frequency_words_no_clue_no_critic/episode_0/requests.json'
data: pd.DataFrame = pd.read_json(sample_requests_path)

In [91]:
# Collect, for every row of `data`, the prompt object as is and the
# 'completion' field of the response object.
interactions: dict = dict(requests=[], responses=[])

for index, row in data.iterrows():
    interactions['requests'].append(row.manipulated_prompt_obj)
    raw_response = row.raw_response_obj
    interactions['responses'].append(raw_response['completion'])

In [86]:
# Display the first collected response.
interactions['responses'][0]

'guess: soare'

# Request format requests

- claude:
- - raw_response_obj.completion
- Falcon:
- - raw_response_obj.response
- GPT 3.5/turbo GPT4
- - manipulated_prompt_obj[role == user]
- - raw_response_obj.choices.message.content role == assistant
- Koala / oasst / vicuna / CodeLlama / deepseek / mistral / mixtral / Nous / openchat / sheep / SUS / tulu / wizard / Yi / zephyr
- - manipulated_prompt_obj.inputs
- - raw_response_obj.response
- luminous:
- - manipulated_prompt_obj.prompt
- - raw_response_obj.completions[0].completion
- Davinci:
- - manipulated_prompt_obj
- - raw_response_obj.choices[0].text
- command:
- - manipulated_prompt_obj
- - raw_response_obj.text

# Request format interactions
- image game:
- - "from": "GM", "to": "Player 1"
- - "from": "Player 1", "to": "GM"
- - "from": "GM", "to": "Player 2"
- - "from": "Player 2", "to": "GM"

In [None]:
# NOTE: this cell re-binds `input_output_mapping`, so on "Restart & Run All" it replaces the
# mapping defined in the earlier cell. The entries below therefore mirror that earlier
# definition exactly.
# The previous content of this cell disagreed with it for eight models: luminous-supreme,
# llama-2-7b/13b/70b-chat-hf, mistral-medium, gpt4all-13b-snoozy, claude-2 and
# claude-instant-1.2. For example, it parsed luminous answers with `completion`, while the
# notes above and `completion_luminus` read them from `completions[0]`.
# TODO(review): drop this duplicate once it is no longer needed for reference.
input_output_mapping: dict = {
    'sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-llama-2-70b-v1.1-t0.0': {'request': inputs, 'response': response},
    'gpt-4-t0.0--gpt-3.5-turbo-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'luminous-supreme-t0.0--luminous-supreme-t0.0': {'request': prompt, 'response': completion_luminus},
    'vicuna-7b-v1.5-t0.0--vicuna-7b-v1.5-t0.0': {'request': inputs, 'response': response},
    'llama-2-13b-chat-hf-t0.0--llama-2-13b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0': {'request': inputs, 'response': response},
    'gpt-3.5-turbo-t0.0--gpt-4-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'mistral-medium-t0.0--mistral-medium-t0.0': {'request': gpt_x_in, 'response': choices_content},
    'Mixtral-8x7B-Instruct-v0.1-t0.0--Mixtral-8x7B-Instruct-v0.1-t0.0': {'request': inputs, 'response': response},
    'koala-13b-t0.0--koala-13b-t0.0': {'request': inputs, 'response': response},
    'llama-2-7b-chat-hf-t0.0--llama-2-7b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'gpt-3.5-turbo-0613-t0.0--gpt-3.5-turbo-0613-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'gpt-4-0314-t0.0--gpt-4-0314-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'deepseek-llm-7b-chat-t0.0--deepseek-llm-7b-chat-t0.0': {'request': inputs, 'response': response},
    'koala-13B-HF-t0.0--koala-13B-HF-t0.0': {'request': inputs, 'response': response},
    'vicuna-33b-v1.3-t0.0--vicuna-33b-v1.3-t0.0': {'request': inputs, 'response': response},
    'claude-2.1-t0.0--claude-2.1-t0.0': {'request': is_as_string, 'response': completion},
    'Mistral-7B-Instruct-v0.1-t0.0--Mistral-7B-Instruct-v0.1-t0.0': {'request': inputs, 'response': response},
    'gpt4all-13b-snoozy-t0.0--gpt4all-13b-snoozy-t0.0': {'request': inputs, 'response': response},
    'gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'llama-2-70b-chat-hf-t0.0--llama-2-70b-chat-hf-t0.0': {'request': inputs, 'response': content},
    'Wizard-Vicuna-13B-Uncensored-HF-t0.0--Wizard-Vicuna-13B-Uncensored-HF-t0.0': {'request': inputs, 'response': response},
    'openchat_3.5-t0.0--openchat_3.5-t0.0': {'request': inputs, 'response': response},
    'CodeLlama-34b-Instruct-hf-t0.0--CodeLlama-34b-Instruct-hf-t0.0': {'request': inputs, 'response': response},
    'sheep-duck-llama-2-13b-t0.0--sheep-duck-llama-2-13b-t0.0': {'request': inputs, 'response': response},
    'deepseek-llm-67b-chat-t0.0--deepseek-llm-67b-chat-t0.0': {'request': inputs, 'response': response},
    'vicuna-13b-v1.5-t0.0--vicuna-13b-v1.5-t0.0': {'request': inputs, 'response': response},
    'claude-2-t0.0--claude-2-t0.0': {'request': is_as_string, 'response': response},
    'zephyr-7b-alpha-t0.0--zephyr-7b-alpha-t0.0': {'request': inputs, 'response': response},
    'gpt-4-0613-t0.0--gpt-4-0613-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'SUS-Chat-34B-t0.0--SUS-Chat-34B-t0.0': {'request': inputs, 'response': response},
    'tulu-2-dpo-70b-t0.0--tulu-2-dpo-70b-t0.0': {'request': inputs, 'response': response},
    'zephyr-7b-beta-t0.0--zephyr-7b-beta-t0.0': {'request': inputs, 'response': response},
    'text-davinci-003-t0.0--text-davinci-003-t0.0': {'request': is_as_string, 'response': choices_text},
    'openchat-3.5-0106-t0.0--openchat-3.5-0106-t0.0': {'request': inputs, 'response': response},
    'Nous-Hermes-2-Mixtral-8x7B-DPO-t0.0--Nous-Hermes-2-Mixtral-8x7B-DPO-t0.0': {'request': inputs, 'response': response},
    'claude-v1.3-t0.0--claude-v1.3-t0.0': {'request': is_as_string, 'response': completion},
    'oasst-12b-t0.0--oasst-12b-t0.0': {'request': inputs, 'response': response},
    'tulu-2-dpo-7b-t0.0--tulu-2-dpo-7b-t0.0': {'request': inputs, 'response': response},
    'openchat-3.5-1210-t0.0--openchat-3.5-1210-t0.0': {'request': inputs, 'response': response},
    'command-t0.0--command-t0.0': {'request': is_as_string, 'response': text},
    'oasst-sft-4-pythia-12b-epoch-3.5-t0.0--oasst-sft-4-pythia-12b-epoch-3.5-t0.0': {'request': inputs, 'response': response},
    'WizardLM-13b-v1.2-t0.0--WizardLM-13b-v1.2-t0.0': {'request': inputs, 'response': response},
    'falcon-40b-t0.0--falcon-40b-t0.0': {'request': is_as_string, 'response': response},
    'vicuna-13b-t0.0--vicuna-13b-t0.0': {'request': inputs, 'response': response},
    'gpt-4-t0.0--gpt-4-t0.0': {'request': gpt_x_in, 'response': gpt_x_out},
    'falcon-7b-instruct-t0.0--falcon-7b-instruct-t0.0': {'request': is_as_string, 'response': response},
    'claude-instant-1.2-t0.0--claude-instant-1.2-t0.0': {'request': is_as_string, 'response': response},
    'Yi-34B-Chat-t0.0--Yi-34B-Chat-t0.0': {'request': inputs, 'response': response},
}
