In [230]:
import json
import os
from json import JSONDecodeError
from typing import Callable, Optional

import pandas as pd

## Extraction Info

### raw.csv
For each benchmark there is a raw.csv containing all collected metrics.
From this file, the following information can be extracted for each episode:
- game
- model
- episode
- experiment
- Aborted
- Lose
- Success

### requests.json
For each benchmark > model > game > experiment > episode there is a requests.json file containing the requests sent to the model and the answers received.
The following information can be extracted for each episode:
- inputs
- responses

### instance.json
For each benchmark > model > game > experiment > episode there is an instance.json file containing metadata for the episode,
e.g. for taboo it is the target word along with the taboo words, and for wordle it is the target word along with a clue for the word.
The metadata is extracted completely and stored in additional columns.


In [224]:
def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    """Pivot the long-format metrics table into one row per episode.

    Every distinct entry of the ``metric`` column becomes a column of its own
    holding the matching ``value`` (duplicates are combined by
    ``pivot_table``'s default aggregation). Afterwards only the columns named
    in ``columns_to_keep`` are retained.

    Args:
        data: long-format frame with the columns ``game``, ``model``,
            ``experiment``, ``episode``, ``metric`` and ``value``.
        columns_to_keep: names of the columns that stay in the result.

    Returns:
        A frame with one row per (game, model, experiment, episode).
    """
    episode_table: pd.DataFrame = data.pivot_table(
        values='value',
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
    ).reset_index()

    # everything that was not explicitly requested is removed
    unwanted: list = [name for name in episode_table.columns if name not in columns_to_keep]
    return episode_table.drop(columns=unwanted)

In [225]:
def prepare_requests_json(path: str, input_parser: Callable, output_parser: Callable, model_name: str) -> Optional[dict]:
    """Parse the requests and responses stored in one ``requests.json`` file.

    The file is read with ``pd.read_json``. For every row, ``input_parser`` is
    applied to the ``manipulated_prompt_obj`` field and ``output_parser`` to
    the ``raw_response_obj`` field.

    Args:
        path: location of the ``requests.json`` file.
        input_parser: callable that turns one stored prompt object into the
            value kept under ``'requests'``.
        output_parser: callable that turns one stored response object into the
            value kept under ``'responses'``.
        model_name: printed next to the error when parsing fails.

    Returns:
        ``{'requests': [...], 'responses': [...]}`` with one entry per row, or
        ``None`` when any row could not be parsed (the error and the model
        name are printed in that case).
    """
    data: pd.DataFrame = pd.read_json(path)
    requests: list = []
    responses: list = []

    for _, row in data.iterrows():
        try:
            # parse both sides before storing either one, so the two lists
            # always have the same length and need no separate length check
            request = input_parser(row.manipulated_prompt_obj)
            response = output_parser(row.raw_response_obj)
        except Exception as e:
            # best effort: report the failing model and give up on this file
            print(e)
            print(model_name)
            return None

        requests.append(request)
        responses.append(response)

    return {'requests': requests, 'responses': responses}

## Prepare Taboo Data

In [226]:
def prepare_taboo_interaction_data(data: dict) -> dict:
    """Extract the initial prompts and the exchanged messages of a taboo episode.

    Args:
        data: interaction record whose ``'turns'`` entry is a list of turns,
            each turn being a list of actions with the keys ``'from'``,
            ``'to'`` and ``'action'`` (``'type'`` / ``'content'``).

    Returns:
        ``{'p1': ..., 'p2': ..., 'response': [...]}`` where ``p1`` / ``p2`` are
        the first contents addressed to Player 1 / Player 2 in the first turn
        (empty string when there is none) and ``response`` lists the content
        of every ``'get message'`` action whose sender differs from its
        receiver.
    """
    opening: dict = {'Player 1': '', 'Player 2': ''}
    replies: list = []

    for turn_index, turn in enumerate(data['turns']):
        for action in turn:
            payload = action['action']

            # the game introduction is the first message a player receives in turn 0
            if turn_index == 0:
                for player in ('Player 1', 'Player 2'):
                    if action['to'] == player and opening[player] == '':
                        opening[player] = payload['content']

            # keep only messages that actually travel between two parties
            if action['from'] != action['to'] and payload['type'] == 'get message':
                replies.append(payload['content'])

    return {'p1': opening['Player 1'], 'p2': opening['Player 2'], 'response': replies}

def update_taboo_data(_data: dict, instance_data: dict, row: pd.Series, game_data: dict) -> dict:
    """Append one taboo episode to the collector.

    All values are gathered and every target column is checked before anything
    is appended, so a missing key cannot leave the collector's lists with
    different lengths.

    Args:
        _data: collector mapping column names to lists; modified in place.
        instance_data: episode metadata providing ``game_id``, ``target_word``
            and ``related_word``.
        row: episode record exposing ``game``, ``model``, ``experiment``,
            ``episode``, ``Aborted``, ``Lose`` and ``Success`` as attributes.
        game_data: extracted interaction with the keys ``p1``, ``p2`` and
            ``response``.

    Returns:
        The same ``_data`` object that was passed in.

    Raises:
        KeyError: if a required key is missing from ``instance_data``,
            ``game_data`` or ``_data``.
    """
    record: dict = {
        'game': row.game,
        'game_id': instance_data['game_id'],
        'model': row.model,
        'experiment': row.experiment,
        'episode': row.episode,
        'Aborted': row.Aborted,
        'Lose': row.Lose,
        'Success': row.Success,
        'player_1_initial_prompt': game_data['p1'],
        'player_2_initial_prompt': game_data['p2'],
        'conversation': game_data['response'],
        'target_word': instance_data['target_word'],
        # the instance file names this entry 'related_word'; the column is plural
        'related_words': instance_data['related_word'],
    }

    # verify the schema first so a failure never leaves a half-written row behind
    missing_columns: list = [column for column in record if column not in _data]
    if missing_columns:
        raise KeyError(f'collector is missing column(s): {missing_columns}')

    for column, value in record.items():
        _data[column].append(value)
    return _data

## Prepare wordle no critic no clue Data

In [227]:
def prepare_wordle_no_clue_no_critic(data: dict) -> dict:
    """Extract the opening prompt and the guess/feedback log of a wordle episode.

    Args:
        data: interaction record whose ``'turns'`` entry is a list of turns,
            each turn being a list of actions carrying an ``'action'`` dict.

    Returns:
        ``{'initial_prompt': ..., 'responses': [...]}``. ``initial_prompt`` is
        the content of the first ``'send message'`` action of the first turn.
        ``responses`` holds, in order, one ``'guess'`` entry per ``'metadata'``
        action that carries ``game_info`` with ``guess`` and ``explanation``,
        and one ``'feedback'`` entry per remaining ``'send message'`` action.
    """
    opening_prompt: str = ''
    log: list = []

    for turn_index, turn in enumerate(data['turns']):
        for action in turn:
            payload = action['action']
            kind = payload['type']

            if kind == 'send message':
                if turn_index == 0 and opening_prompt == '':
                    # the very first message is the game introduction, not feedback
                    opening_prompt = payload['content']
                    continue
                try:
                    log.append({'type': 'feedback', 'guess_feedback': payload['content']})
                except KeyError:
                    continue

            elif kind == 'metadata':
                try:
                    info = payload['game_info']
                    guess: str = info['guess']
                    explanation: str = info['explanation']
                except KeyError:
                    # metadata without guess information is ignored
                    continue

                # an empty guess and the 'INVALID FORMAT' marker are kept unlabelled
                labelled_guess = guess if guess == '' else f'guess: {guess}'
                labelled_explanation = (
                    explanation if explanation == 'INVALID FORMAT' else f'explanation: {explanation}'
                )
                log.append({
                    'type': 'guess',
                    'guess': labelled_guess,
                    'explanation': labelled_explanation,
                })

    return {'initial_prompt': opening_prompt, 'responses': log}

def update_wordle_no_clue_no_critic_data(_data: dict, instance_data: dict, row: pd.Series, game_data: dict) -> dict:
    """Append one single-player wordle episode to the collector.

    The conversation is read from ``game_data['responses']``, the key produced
    by ``prepare_wordle_no_clue_no_critic``; the key ``'response'`` is accepted
    as a fallback. All values are gathered and every target column is checked
    before anything is appended, so a missing key cannot leave the collector's
    lists with different lengths.

    Args:
        _data: collector mapping column names to lists; modified in place.
        instance_data: episode metadata providing ``game_id``,
            ``target_word``, ``target_word_difficulty`` and
            ``target_word_clue``.
        row: episode record exposing ``game``, ``model``, ``experiment``,
            ``episode``, ``Aborted``, ``Lose`` and ``Success`` as attributes.
        game_data: extracted interaction with the keys ``initial_prompt`` and
            ``responses``.

    Returns:
        The same ``_data`` object that was passed in.

    Raises:
        KeyError: if a required key is missing from ``instance_data``,
            ``game_data`` or ``_data``.
    """
    # the extractor stores the log under 'responses'; 'response' is kept as a fallback
    conversation = game_data['responses'] if 'responses' in game_data else game_data['response']

    record: dict = {
        'game': row.game,
        'game_id': instance_data['game_id'],
        'model': row.model,
        'experiment': row.experiment,
        'episode': row.episode,
        'Aborted': row.Aborted,
        'Lose': row.Lose,
        'Success': row.Success,
        'player_1_initial_prompt': game_data['initial_prompt'],
        'conversation': conversation,
        'target_word': instance_data['target_word'],
        'target_word_difficulty': instance_data['target_word_difficulty'],
        'target_word_clue': instance_data['target_word_clue'],
    }

    # verify the schema first so a failure never leaves a half-written row behind
    missing_columns: list = [column for column in record if column not in _data]
    if missing_columns:
        raise KeyError(f'collector is missing column(s): {missing_columns}')

    for column, value in record.items():
        _data[column].append(value)
    return _data

## prepare wordle with clue and critic

In [228]:
def prepare_wordle_clue_and_critic(data: dict) -> dict:
    """Extract both opening prompts and the guess log of a wordle episode with critic.

    Args:
        data: interaction record whose ``'turns'`` entry is a list of turns,
            each turn being a list of actions with the keys ``'from'``,
            ``'to'`` and ``'action'``.

    Returns:
        ``{'initial_prompt_p1': ..., 'initial_prompt_p2': ..., 'response': [...]}``.
        The prompts are the first ``'send message'`` contents addressed to
        Player 1 / Player 2. ``response`` contains one ``'initial_guess'``
        entry for the first message received from Player 1, one
        ``'initial_critic'`` entry for the first message received from
        Player 2 (both store the raw message under ``guess`` and
        ``explanation``), and one ``'guess'`` entry per ``'metadata'`` action
        that carries ``game_info``.
    """
    opening_prompt_p1: str = ''
    opening_prompt_p2: str = ''
    log: list = []

    awaiting_first_guess: bool = True
    awaiting_first_critic: bool = True

    for turn in data['turns']:
        for action in turn:
            payload = action['action']
            kind = payload['type']

            if kind == 'send message':
                receiver = action['to']
                # only the first message to each player is the game introduction
                if receiver == 'Player 1' and opening_prompt_p1 == '':
                    opening_prompt_p1 = payload['content']
                elif receiver == 'Player 2' and opening_prompt_p2 == '':
                    opening_prompt_p2 = payload['content']

            elif kind == 'get message':
                sender = action['from']
                if sender == 'Player 1' and awaiting_first_guess:
                    awaiting_first_guess = False
                    log.append({
                        'type': 'initial_guess',
                        'guess': payload['content'],
                        'explanation': payload['content'],
                    })
                elif sender == 'Player 2' and awaiting_first_critic:
                    awaiting_first_critic = False
                    log.append({
                        'type': 'initial_critic',
                        'guess': payload['content'],
                        'explanation': payload['content'],
                    })

            elif kind == 'metadata' and 'game_info' in payload.keys():
                info = payload['game_info']
                log.append({
                    'type': 'guess',
                    'guess': info['guess'],
                    'explanation': info['explanation'],
                    'error': info['error'],
                    'critic_agreement': info['critic_agreement'],
                    'critic_explanation': info['critic_explanation'],
                })

    return {'initial_prompt_p1': opening_prompt_p1, 'initial_prompt_p2': opening_prompt_p2, 'response': log}

def update_wordle_clue_and_critic_data(_data: dict, instance_data: dict, row: pd.Series, game_data: dict) -> dict:
    """Append one wordle-with-critic episode to the collector.

    The collector is returned, as the annotation promises, so callers that
    store the result do not end up holding ``None``. All values are gathered
    and every target column is checked before anything is appended, so a
    missing key cannot leave the collector's lists with different lengths.

    Args:
        _data: collector mapping column names to lists; modified in place.
        instance_data: episode metadata providing ``game_id``,
            ``target_word``, ``target_word_difficulty`` and
            ``target_word_clue``.
        row: episode record exposing ``game``, ``model``, ``experiment``,
            ``episode``, ``Aborted``, ``Lose`` and ``Success`` as attributes.
        game_data: extracted interaction with the keys ``initial_prompt_p1``,
            ``initial_prompt_p2`` and ``response``.

    Returns:
        The same ``_data`` object that was passed in.

    Raises:
        KeyError: if a required key is missing from ``instance_data``,
            ``game_data`` or ``_data``.
    """
    record: dict = {
        'game': row.game,
        'game_id': instance_data['game_id'],
        'model': row.model,
        'experiment': row.experiment,
        'episode': row.episode,
        'Aborted': row.Aborted,
        'Lose': row.Lose,
        'Success': row.Success,
        'player_1_initial_prompt': game_data['initial_prompt_p1'],
        'player_2_initial_prompt': game_data['initial_prompt_p2'],
        'conversation': game_data['response'],
        'target_word': instance_data['target_word'],
        'target_word_difficulty': instance_data['target_word_difficulty'],
        'target_word_clue': instance_data['target_word_clue'],
    }

    # verify the schema first so a failure never leaves a half-written row behind
    missing_columns: list = [column for column in record if column not in _data]
    if missing_columns:
        raise KeyError(f'collector is missing column(s): {missing_columns}')

    for column, value in record.items():
        _data[column].append(value)
    return _data
    

## Prepare reference game data

In [229]:
def prepare_reference_game_data(data: dict) -> dict:
    """Extract the last question and answer of each player in a reference game episode.

    Args:
        data: interaction record whose ``'turns'`` entry is a list of turns,
            each turn being a list of actions with the keys ``'from'``,
            ``'to'`` and ``'action'`` (holding ``'content'``).

    Returns:
        A dict with the keys ``question_player_1``, ``question_player_2``,
        ``answer_player_1`` and ``answer_player_2``. A question is the content
        of the latest action addressed to that player, an answer the content
        of the latest action sent by that player; ``'No Data'`` is used when
        no such action exists.
    """
    summary: dict = {
        'question_player_1': 'No Data',
        'question_player_2': 'No Data',
        'answer_player_1': 'No Data',
        'answer_player_2': 'No Data',
    }

    # (action field, expected player, result slot)
    slots: tuple = (
        ('to', 'Player 1', 'question_player_1'),
        ('to', 'Player 2', 'question_player_2'),
        ('from', 'Player 1', 'answer_player_1'),
        ('from', 'Player 2', 'answer_player_2'),
    )

    for turn in data['turns']:
        for action in turn:
            for direction, player, slot in slots:
                if action[direction] == player:
                    # later actions overwrite earlier ones
                    summary[slot] = action['action']['content']

    return summary

def update_reference_game_data(_data: dict, instance_data: dict, row: pd.Series, game_data: dict) -> dict:
    """Append one reference game episode to the collector.

    Questions and answers are written to the collector columns
    ``player_1_question``, ``player_1_answer``, ``player_2_question`` and
    ``player_2_answer`` (the names used by the ``reference_game`` collector),
    while they are read from ``game_data`` under ``question_player_N`` /
    ``answer_player_N``. All values are gathered and every target column is
    checked before anything is appended, so a missing key cannot leave the
    collector's lists with different lengths.

    Args:
        _data: collector mapping column names to lists; modified in place.
        instance_data: episode metadata providing ``game_id``, the three grids
            of each player and ``target_grid_name``.
        row: episode record exposing ``game``, ``model``, ``experiment``,
            ``episode``, ``Aborted``, ``Lose`` and ``Success`` as attributes.
        game_data: extracted interaction with the keys ``question_player_1``,
            ``answer_player_1``, ``question_player_2`` and ``answer_player_2``.

    Returns:
        The same ``_data`` object that was passed in.

    Raises:
        KeyError: if a required key is missing from ``instance_data``,
            ``game_data`` or ``_data``.
    """
    record: dict = {
        'game': row.game,
        'game_id': instance_data['game_id'],
        'model': row.model,
        'experiment': row.experiment,
        'episode': row.episode,
        'Aborted': row.Aborted,
        'Lose': row.Lose,
        'Success': row.Success,
        'player_1_question': game_data['question_player_1'],
        'player_1_answer': game_data['answer_player_1'],
        'player_2_question': game_data['question_player_2'],
        'player_2_answer': game_data['answer_player_2'],
        # note the asymmetric key names: player 1 has a 'target' grid, player 2 a 'first' grid
        'player_1_grids': [
            instance_data['player_1_target_grid'],
            instance_data['player_1_second_grid'],
            instance_data['player_1_third_grid'],
        ],
        'player_2_grids': [
            instance_data['player_2_first_grid'],
            instance_data['player_2_second_grid'],
            instance_data['player_2_third_grid'],
        ],
        'target_grid_name': instance_data['target_grid_name'],
    }

    # verify the schema first so a failure never leaves a half-written row behind
    missing_columns: list = [column for column in record if column not in _data]
    if missing_columns:
        raise KeyError(f'collector is missing column(s): {missing_columns}')

    for column, value in record.items():
        _data[column].append(value)
    return _data


### Open JSON files

In [216]:
def prepare_instance_data(path: str) -> dict:
    """Load a JSON file and return its parsed content.

    The file is opened as UTF-8 explicitly so that decoding does not depend on
    the platform's locale default.

    Args:
        path: location of the JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [111]:
# benchmark versions iterated by the extraction loop, and the token paired with each of them
benchmark_versions_old: list = ['v0.9', 'v1.0']
interaction_response_token_old: list = ['completion', 'response']
# version -> token, pairing the two lists above position by position
lookup_response_token_old: dict = dict(zip(benchmark_versions_old, interaction_response_token_old))

benchmark_versions_new: list = [
    'v1.5',
    'v1.5_quantized',
    'v1.6',
    'v1.6_backends',
    'v1.6_quantized',
]

# columns of the pivoted raw.csv that are carried over into the collectors
columns_to_keep_raw_csv: list = [
    'game', 'model', 'experiment', 'episode',
    'Aborted', 'Lose', 'Success',
]

# imagegame is not listed
games: list = [
    'privateshared',
    'referencegame',
    'taboo',
    'wordle',
    'wordle_withclue',
    'wordle_withcritic',
]

In [149]:
# columns every collector starts with: episode identity and outcome
_episode_columns: list = ['game', 'game_id', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']


def _new_collector(*game_columns: str) -> dict:
    """Return an empty collector with one fresh list per shared and per game-specific column."""
    return {column: [] for column in (*_episode_columns, *game_columns)}


taboo_data: dict = _new_collector(
    'player_1_initial_prompt',
    'player_2_initial_prompt',
    'conversation',
    'target_word',
    'related_words',
)

wordle_no_clue_no_critic_data: dict = _new_collector(
    'player_1_initial_prompt',
    'conversation',
    'target_word',
    'target_word_difficulty',
    'target_word_clue',
)

# 'target_word_clue' is required here: the wordle update functions append to that column
wordle_with_clue_no_critic_data: dict = _new_collector(
    'player_1_initial_prompt',
    'conversation',
    'target_word',
    'target_word_difficulty',
    'target_word_clue',
)

wordle_with_clue_with_critic_data: dict = _new_collector(
    'player_1_initial_prompt',
    'player_2_initial_prompt',
    'conversation',
    'target_word',
    'target_word_difficulty',
    'target_word_clue',
)

reference_game: dict = _new_collector(
    'player_1_question',
    'player_2_question',
    'player_1_answer',
    'player_2_answer',
    'player_1_grids',
    'player_2_grids',
    'target_grid_name',
)

In [112]:
# game name -> function that extracts the conversation from an episode's interaction data
extract_game_instance_data: dict = dict(
    taboo=prepare_taboo_interaction_data,
    wordle=prepare_wordle_no_clue_no_critic,
    wordle_withclue=prepare_wordle_no_clue_no_critic,
    wordle_withcritic=prepare_wordle_clue_and_critic,
    referencegame=prepare_reference_game_data,
)

# game name -> function that appends one episode to that game's collector
update_game_data: dict = dict(
    taboo=update_taboo_data,
    wordle=update_wordle_no_clue_no_critic_data,
    wordle_withclue=update_wordle_no_clue_no_critic_data,
    wordle_withcritic=update_wordle_clue_and_critic_data,
    referencegame=update_reference_game_data,
)

# game name -> collector dict that accumulates the extracted episodes
data_collectors: dict = dict(
    taboo=taboo_data,
    wordle=wordle_no_clue_no_critic_data,
    wordle_withclue=wordle_with_clue_no_critic_data,
    wordle_withcritic=wordle_with_clue_with_critic_data,
    referencegame=reference_game,
)

In [115]:
# Walk over every episode listed in the raw.csv of each benchmark version, load the
# episode's JSON files and append the extracted information to the game's collector.
# NOTE: the collectors are appended to in place, so re-running this cell without
# re-creating them adds every episode a second time.

# only games that have an extractor, an update function and a collector can be processed
supported_games = data_collectors.keys() & extract_game_instance_data.keys() & update_game_data.keys()

for benchmark_version in benchmark_versions_old:
    # read the raw_csv
    raw_csv_data: pd.DataFrame = pd.read_csv(f'./{benchmark_version}/raw.csv')

    # group by metric to obtain all episode information
    clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_csv_data, columns_to_keep=columns_to_keep_raw_csv)

    # loop over all episodes and build paths
    for index, row in clean_csv_data.iterrows():
        # skip games without a handler (e.g. imagegame, privateshared);
        # looking them up further down would raise a KeyError
        if row.game not in supported_games:
            continue

        # all three files of an episode live in the same directory
        episode_dir: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}'
        path_requests_json: str = f'{episode_dir}/requests.json'
        path_instance_json: str = f'{episode_dir}/instance.json'
        path_interaction_json: str = f'{episode_dir}/interactions.json'

        # NOTE(review): input_output_mapping is not defined in the cells shown here;
        # confirm it is defined before this cell runs.
        parse_request: callable = input_output_mapping[row.model]['request']
        parse_response: callable = input_output_mapping[row.model]['response']

        # check that paths are correctly built
        assert os.path.isfile(path_requests_json), f'missing file: {path_requests_json}'
        assert os.path.isfile(path_instance_json), f'missing file: {path_instance_json}'
        assert os.path.isfile(path_interaction_json), f'missing file: {path_interaction_json}'

        # NOTE(review): the parsed requests/responses are not stored anywhere; the call is
        # kept because it prints a diagnostic when a file cannot be parsed. Confirm intent.
        prepare_requests_json(path=path_requests_json, input_parser=parse_request, output_parser=parse_response, model_name=row.model)
        instance_data: dict = prepare_instance_data(path=path_instance_json)

        try:
            interaction_data: dict = prepare_instance_data(path_interaction_json)
        except JSONDecodeError as e:
            print(e)
            # report the file that actually failed to decode
            print(path_interaction_json)
            # stops processing the remaining episodes of this benchmark version
            break

        # extract the game data
        game_data: dict = extract_game_instance_data[row.game](data=interaction_data)

        # add data to the data collector; the update functions append in place, so the
        # registry entry is not rebound to their return value
        update_game_data[row.game](_data=data_collectors[row.game], instance_data=instance_data, row=row, game_data=game_data)
            