In [52]:
import pandas as pd
import json
import os

## Extraction Info

### raw.csv
for each benchmark there is the raw.csv containing all metrics collected.
from this file, the following information can be extracted for each episode:
- game
- model
- episode
- experiment
- Aborted
- Lose
- Success

### requests.json
For each benchmark > model > game > experiment > episode there is a requests file containing the requests sent to the models and the received answers
following information can be extracted for each episode
- inputs
- responses

### instance.json
For each benchmark > model > game > experiment > episode there is a instance.json file containing metadata for the episode
e.g. for taboo it is the target word along with the taboo words or for wordle it is the target word along with a clue for the word.
the metadata will be extracted completely and stored as an additional column.


In [53]:
def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in list(df.keys()) if column not in columns_to_keep]
    return df.drop(columns=columns_to_drop, axis=0)

In [101]:
def prepare_requests_json(path: str, response_key: str) -> dict:
    data: pd.DataFrame = pd.read_json(path)
    interactions: dict = {
        'requests': [],
        'responses': []
    }
    
    for index, row in data.iterrows():
        try: 
            interactions['requests'].append(row.manipulated_prompt_obj)
            interactions['responses'].append(row.raw_response_obj['response'])
        except KeyError as e:
            interactions['requests'].append(row.manipulated_prompt_obj)
            interactions['responses'].append(row.raw_response_obj['completion'])
            
    
    try:
        assert len(interactions['requests']) == len(interactions['responses'])
    except AssertionError as e:
        print(e)
        print('length missmatch between requests and responses')
        exit(-1)
        
    return interactions

In [95]:
def prepare_instance_data(path: str) -> dict:
    with open(path, 'r') as f:
        data: dict = json.load(f)
        return data
    

In [96]:
benchmark_versions_old: list = ['v0.9', 'v1.0']
interaction_response_token_old: list = ['completion', 'response']
lookup_response_token_old: dict = {key: val for key, val in zip(benchmark_versions_old, interaction_response_token_old)}
benchmark_versions_new: list = ['v1.5', 'v1.5_quantized','v1.6','v1.6_backends','v1.6_quantized',]
columns_to_keep_raw_csv: list = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']
games: list = ['privateshared', 'referencegame', 'taboo', 'wordle', 'wordle_withclue', 'wordle_withcritic']  # imagegame


In [102]:
for benchmark_version in benchmark_versions_old:
    # read the raw_csv
    raw_csv_data: pd.DataFrame = pd.read_csv(f'./{benchmark_version}/raw.csv')
    
    # group by metric to obtain all episode information
    clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_csv_data, columns_to_keep=columns_to_keep_raw_csv)
    
    # loop over all entries and build paths
    for index, row in clean_csv_data.iterrows():
        # skip image game:
        if row.game == 'imagegame': continue
        
        # built paths
        path_requests_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/requests.json'
        path_instance_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/instance.json'
        print(path_requests_json)
        
        # check that paths are correctly built
        assert os.path.isfile(path_requests_json)
        assert os.path.isfile(path_instance_json)
        
        interaction_data: dict = prepare_requests_json(path=path_requests_json, response_key=lookup_response_token_old[benchmark_version])
        instance_data: dict = prepare_instance_data(path=path_instance_json)

./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_0/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_1/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_2/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_3/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_4/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_5/requests.json

length missmatch between requests and responses
./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared/0_travel-booking/episode_6/requests.json

length missmatch between r

KeyError: 'completion'

In [89]:
data: pd.DataFrame = pd.read_json('./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/wordle/0_high_frequency_words_no_clue_no_critic/episode_0/requests.json')

In [91]:
interactions: dict = {
    'requests': [],
    'responses': []
}

for index, row in data.iterrows():
    interactions['requests'].append(row.manipulated_prompt_obj)
    interactions['responses'].append(row.raw_response_obj['completion'])

In [86]:
interactions['responses'][0]

'guess: soare'