In [2]:
import pandas as pd
import json
import os
from json import JSONDecodeError

## Extraction Info

### raw.csv
for each benchmark there is the raw.csv containing all metrics collected.
from this file, the following information can be extracted for each episode:
- game
- model
- episode
- experiment
- Aborted
- Lose
- Success

### requests.json
For each benchmark > model > game > experiment > episode there is a requests file containing the requests sent to the models and the received answers
following information can be extracted for each episode
- inputs
- responses

### instance.json
For each benchmark > model > game > experiment > episode there is a instance.json file containing metadata for the episode
e.g. for taboo it is the target word along with the taboo words or for wordle it is the target word along with a clue for the word.
the metadata will be extracted completely and stored as an additional column.


In [3]:
def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in list(df.keys()) if column not in columns_to_keep]
    return df.drop(columns=columns_to_drop, axis=0)

In [4]:
def prepare_requests_json(path: str, input_parser: callable, output_parser: callable, model_name: str) -> dict:
    data: pd.DataFrame = pd.read_json(path)
    interactions: dict = {
        'requests': [],
        'responses': []
    }
    
    for index, row in data.iterrows():
        try: 
            interactions['requests'].append(input_parser(row.manipulated_prompt_obj))
            interactions['responses'].append(output_parser(row.raw_response_obj))
        except Exception as e:
            print(e)
            print(model_name)
            return

    try:
        assert len(interactions['requests']) == len(interactions['responses'])
    except AssertionError as e:
        print(e)
        print('length missmatch between requests and responses')
        print(model_name)
        return

        
    return interactions

## Prepare Taboo Data

In [5]:
def prepare_taboo_interaction_data(data: dict) -> dict:
    p1_initial_input: str = ''
    p2_initial_input: str = ''
    responses: list = []
    
    # loop over all turns
    for i, turn in enumerate(data['turns']):
        
        # loop over all actions inside one turn
        for action in turn:
            # save the initial input string that is used to introduce the game for player 1
            if i == 0 and action['to'] == 'Player 1' and p1_initial_input == '':
                p1_initial_input = action['action']['content']
                
            # save the initial input string that is used to introduce the game for player 2
            if i == 0 and action['to'] == 'Player 2' and p2_initial_input == '':
                p2_initial_input = action['action']['content']
                
            # only save the actions content where Player x to Gamemaster
            if action['from'] != action['to'] and action['action']['type'] == 'get message':
                    responses.append(action['action']['content'])
                
    return {'p1': p1_initial_input, 'p2': p2_initial_input, 'response': responses}

def update_taboo_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict) -> dict:
    # fill in the taboo specifics
    _data['game'].append(row.game)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['player_1_initial_prompt'].append(game_data['p1'])
    _data['player_2_initial_prompt'].append(game_data['p2'])
    _data['conversation'].append(game_data['response'])
    _data['target_word'].append(instance_data['target_word'])
    _data['related_words'].append(instance_data['related_word'])
    return _data

## Prepare wordle no critic no clue Data

In [6]:
def prepare_wordle_no_clue_no_critic(data: dict) -> dict:
    initial_prompt: str = ''
    responses: list = []
    
    for i, turn in enumerate(data['turns']):
        for action in turn:
            if i == 0 and action['action']['type'] == 'send message' and initial_prompt == '':
                initial_prompt = action['action']['content']
                continue # to skip that turn
    
            if action['action']['type'] == 'metadata':
                try:
                    guess: str = action['action']['game_info']['guess']
                    explanation: str = action['action']['game_info']['explanation']
                    responses.append({
                        'type': 'guess',
                        'guess': guess if guess == '' else f'guess: {guess}',
                        'explanation': explanation if explanation == 'INVALID FORMAT' else f'explanation: {explanation}',
                    })
                except KeyError:
                    continue
            if action['action']['type'] == 'send message':
                try:
                    responses.append({'type': 'feedback', 'guess_feedback': action['action']['content']})
                except KeyError:
                    continue
                    
    return {'initial_prompt': initial_prompt, 'responses': responses}

def update_wordle_no_clue_no_critic_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict) -> dict:
    _data['game'].append(row.game)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['player_1_initial_prompt'].append(game_data['initial_prompt'])
    _data['conversation'].append(game_data['response'])
    _data['target_word'].append(instance_data['target_word'])
    _data['target_word_difficulty'].append(instance_data['target_word_difficulty'])
    _data['target_word_clue'].append(instance_data['target_word_clue'])

    return _data

## prepare wordle with clue and critic

In [91]:
def prepare_wordle_clue_and_critic(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list  = []
    
    has_answered: bool = False
    
    for i, turn in enumerate(data['turns']):
        for action in turn:
            if action['action']['type'] == 'send message' and action['to'] == 'Player 2':
                chat_p2.append( { 'role': USER,  'content': action['action']['content'] })
                continue
    
            if action['action']['type'] == 'get message' and action['from'] == 'Player 2':
                chat_p2.append( {'role': ASSISTANT, 'content': action['action']['content']})
                continue
    
            if action['action']['type'] == 'send message' and action['to'] == 'Player 1':
                chat_p1.append( { 'role': USER,  'content': action['action']['content'] })
                continue
    
            if action['action']['type'] == 'get message' and action['from'] == 'Player 1':
                has_answered = True
                continue
    
            if action['action']['type'] == 'parse' and has_answered:
                has_answered = False
                try: 
                    guess = action['action']['content']['guess']
                except KeyError:
                    guess = action['action']['content']['guess:']
                
                try:
                    explanation: str = action['action']['content']['explanation']
                except KeyError:
                    explanation: str = action['action']['content']['explanation:']
                
                
                has_error: bool = guess == ''
                chat_p1.append( {'role': ASSISTANT, 'content': f'guess: {guess}\nexplanation: {explanation}', 'has_error': has_error})
                continue
                
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_wordle_clue_and_critic_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_word'].append(instance_data['target_word'])
    _data['target_word_difficulty'].append(instance_data['target_word_difficulty'])
    
    return _data
    

## Prepare reference game data

In [92]:
def prepare_reference_game_data(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list = []
    
    USER: str = 'user'
    ASSISTANT: str = 'assistant'
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content']
                })
            if action['to'] == 'Player 2':
                chat_p2.append({
                    'role': USER,
                    'content': action['action']['content']
                })
    
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content']
                })
            if action['from'] == 'Player 2':
                chat_p2.append({
                    'role': ASSISTANT,
                    'content': action['action']['content']
                })
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_reference_game_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_grid_name'].append(instance_data['target_grid_name'])
    
    return _data


## Private Shared

In [142]:
def prepare_private_shared_data(data: dict) -> dict:
    chat_p1: list = []
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
                
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
    return {'chat_p1': chat_p1}

def update_private_shared(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['slots'].append(instance_data['slots'])
    
    return _data

## Prepare Image game

In [160]:
def prepare_image_game_data(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list = []
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
            if action['to'] == 'Player 2':
                chat_p2.append({
                    'role': USER,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
    
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                })
            if action['from'] == 'Player 2':
                chat_p2.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                })
                
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_image_game_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_grid'].append(instance_data['target_grid'])

    return _data

### Open JSON files

In [161]:
def prepare_instance_data(path: str) -> dict:
    with open(path, 'r') as f:
        data: dict = json.load(f)
        return data


In [162]:
benchmark_versions_old: list = ['v0.9', 'v1.0', 'v1.5', 'v1.5_quantized', 'v1.6', 'v1.6_quantized']
interaction_response_token_old: list = ['completion', 'response']
lookup_response_token_old: dict = {key: val for key, val in zip(benchmark_versions_old, interaction_response_token_old)}
benchmark_versions_new: list = ['v1.5', 'v1.5_quantized','v1.6','v1.6_backends','v1.6_quantized',]
columns_to_keep_raw_csv: list = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']
games: list = ['privateshared', 'referencegame', 'taboo', 'wordle', 'wordle_withclue', 'wordle_withcritic', 'imagegame']  # imagegame

In [163]:
taboo_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'player_1_initial_prompt': [],
    'player_2_initial_prompt': [],
    'conversation': [],
    'target_word': [],
    'related_words': []
}

wordle_no_clue_no_critic_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'player_1_initial_prompt': [],
    'conversation': [],
    'target_word': [],
    'target_word_difficulty': [],
    'target_word_clue': []
}

wordle_with_clue_no_critic_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'player_1_initial_prompt': [],
    'conversation': [],
    'target_word': [],
    'target_word_difficulty': [],
}

wordle_with_clue_with_critic_data: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_word': [],
    'target_word_difficulty': [],
}

reference_game: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_grid_name': []
}

privateshared: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'slots': []
}

image_game: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_grid': []
}

In [164]:
extract_game_instance_data: dict = {
    'taboo': prepare_taboo_interaction_data,
    'wordle': prepare_wordle_no_clue_no_critic,
    'wordle_withclue': prepare_wordle_no_clue_no_critic,
    'wordle_withcritic': prepare_wordle_clue_and_critic,
    'referencegame': prepare_reference_game_data,
    'privateshared': prepare_private_shared_data,
    'imagegame': prepare_image_game_data,
}

update_game_data: dict = {
    'taboo': update_taboo_data,
    'wordle': update_wordle_no_clue_no_critic_data,
    'wordle_withclue': update_wordle_no_clue_no_critic_data,
    'wordle_withcritic': update_wordle_clue_and_critic_data,
    'referencegame': update_reference_game_data,
    'privateshared': update_private_shared,
    'imagegame': update_image_game_data,
}

data_collectors: dict = {
    'taboo': taboo_data,
    'wordle': wordle_no_clue_no_critic_data,
    'wordle_withclue': wordle_with_clue_no_critic_data,
    'wordle_withcritic': wordle_with_clue_with_critic_data,
    'referencegame': reference_game,
    'privateshared': privateshared,
    'imagegame': image_game,
}

In [165]:
USER = 'user'
ASSISTANT = 'assistant'

for benchmark_version in benchmark_versions_old:
    # read the raw_csv
    raw_csv_data: pd.DataFrame = pd.read_csv(f'./{benchmark_version}/raw.csv')
    
    # group by metric to obtain all episode information
    clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_csv_data, columns_to_keep=columns_to_keep_raw_csv)
    
    # loop over all entries and build paths
    for index, row in clean_csv_data.iterrows():
        
        # built paths
        path_requests_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/requests.json'
        path_instance_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/instance.json'
        path_interaction_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/interactions.json'

        # check that paths are correctly built
        try:
            assert os.path.isfile(path_requests_json)
            assert os.path.isfile(path_instance_json)
            assert os.path.isfile(path_interaction_json)
        except AssertionError:
            print('assertion error in version', benchmark_version, ' on path_instance_json', path_instance_json, ' on path_requests_json', path_requests_json)
            continue
        
        if row.game == 'imagegame':
            print(row.game)
            instance_data: dict = prepare_instance_data(path=path_instance_json)

            try:
                interaction_data: dict = prepare_instance_data(path_interaction_json)
            except JSONDecodeError as e:
                print(e)
                print(path_instance_json)
                continue
            
            # get the data collector
            data_collector: dict = data_collectors[row.game]
            
            if not data_collector:
                print(row.game)
            
            # extract the game data
            
            game_data: dict = extract_game_instance_data[row.game](data=interaction_data)

            # add data to the data collector
            data_collectors[row.game] = update_game_data[row.game](_data=data_collector, instance_data=instance_data, row=row, game_data=game_data, bv=benchmark_version)
            

imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame
imagegame


In [166]:
d = pd.DataFrame(data=data_collectors['imagegame'])
len(d)

6920

In [167]:
d.head(1000)

Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat_p1,chat_p2,target_grid
0,imagegame,v0.9,0,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_0,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",B ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ...
1,imagegame,v0.9,1,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_1,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",S S S S S\n▢ ▢ ▢ ▢ S\n▢ ▢ ▢ ▢ S\n▢ ▢ ▢ ▢ S\n▢ ...
2,imagegame,v0.9,10,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_10,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",Y ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ...
3,imagegame,v0.9,11,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_11,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",Z Z Z Z Z\nZ ▢ ▢ ▢ Z\nZ ▢ ▢ ▢ Z\nZ ▢ ▢ ▢ Z\nZ ...
4,imagegame,v0.9,12,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_12,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",P P P P P\nP ▢ ▢ ▢ P\nP ▢ ▢ ▢ P\nP ▢ ▢ ▢ P\nP ...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,imagegame,v1.0,5,command-t0.0--command-t0.0,1_random_grids,episode_5,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...",[],▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\nA A ▢ A ▢\n▢ ▢ ▢ ▢ A\nA ...
996,imagegame,v1.0,6,command-t0.0--command-t0.0,1_random_grids,episode_6,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...",[],▢ J J ▢ ▢\n▢ ▢ ▢ ▢ J\n▢ ▢ J ▢ ▢\nJ J ▢ J ▢\n▢ ...
997,imagegame,v1.0,7,command-t0.0--command-t0.0,1_random_grids,episode_7,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...",[],C ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\nC ▢ ▢ ▢ ▢\nC ...
998,imagegame,v1.0,8,command-t0.0--command-t0.0,1_random_grids,episode_8,1.0,0.0,0.0,"[{'role': 'user', 'content': 'Let us play a ga...",[],▢ F ▢ ▢ ▢\nF ▢ F ▢ ▢\n▢ ▢ F ▢ ▢\nF ▢ F ▢ ▢\n▢ ...


In [168]:
d.to_json('./Data/imagegame.json', orient='records')

In [169]:
c = pd.read_json('./Data/imagegame.json')

In [170]:
len(c)

6920

In [171]:
c.head()

Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat_p1,chat_p2,target_grid
0,imagegame,v0.9,0,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_0,1,0,0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",B ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ▢ ▢ ▢ ▢\nB ...
1,imagegame,v0.9,1,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_1,1,0,0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",S S S S S\n▢ ▢ ▢ ▢ S\n▢ ▢ ▢ ▢ S\n▢ ▢ ▢ ▢ S\n▢ ...
2,imagegame,v0.9,10,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_10,1,0,0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",Y ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ▢ ▢ ▢ Y\nY ...
3,imagegame,v0.9,11,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_11,1,0,0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",Z Z Z Z Z\nZ ▢ ▢ ▢ Z\nZ ▢ ▢ ▢ Z\nZ ▢ ▢ ▢ Z\nZ ...
4,imagegame,v0.9,12,claude-v1.3-t0.0--claude-v1.3-t0.0,0_compact_grids,episode_12,1,0,0,"[{'role': 'user', 'content': 'Let us play a ga...","[{'role': 'user', 'content': 'Let us draw some...",P P P P P\nP ▢ ▢ ▢ P\nP ▢ ▢ ▢ P\nP ▢ ▢ ▢ P\nP ...


In [19]:
path_requests_json: str = './v1.6/Meta-Llama-3-8B-Instruct-hf-t0.0--Meta-Llama-3-8B-Instruct-hf-t0.0/privateshared/0_travel-booking/episode_0/requests.json'

In [20]:
data: pd.DataFrame = pd.read_json(path_requests_json)

## Format
### Player 1
--USER--

GENERAL PROMPT TEXT

clue:

--ASSISTANT--

guess:

explanation:

--USER--

guess:

guess_agreement:

agreement_explanation:

-- ASSISTANT --
guess: 

explanation:

If the guess here is valid continue with next guess angreement.
If the guess is not valid add the following:

-- USER --

Info what is wrong such as the word should have excatly 5 letters...

-- ASSISTANT --


### Player 2

--USER--
GENERAL 

PROMPT TEXT

clue:
 
guess:

explanation:

--ASSISTANT--

agreement: 

explanation: 

In [21]:
for i, row in data.iterrows():
    print(row.manipulated_prompt_obj['inputs'])
    break

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a customer of a travel agency. Here is a description of the details of the travel you want to make:

WHAT: Travel
FROM: London
TO: Amsterdam
BY: Plane
CLASS: The cheapest
WHEN: On March 3

The travel agent does not know about it yet. 

Questions from the travel agent will start with TRAVEL-AGENT. Please reply in the form: ANSWER: <some text>

I will also ask you questions. These questions will start with ME:. Please answer these questions with: ASIDE: <some text>

Important: Give short, direct answers!

Let us start.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ok.<|eot_id|><|start_header_id|>user<|end_header_id|>

ME:  Have you informed the travel agent about the place of departure? Answer this question with yes or no and then stop.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [13]:
for i, row in data.iterrows():
    if i == 3:
        print(row.manipulated_prompt_obj['inputs'])
        break

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a language wizard who likes to guess words by using the given rules.

Welcome to Wordle! You have six attempts to guess the target word, a valid English word of five lowercase letters (a-z). Please use the tags "guess:" and "explanation:" to provide a concise explanation for each guess.

To help you make an informed guess, you will receive a clue for the word, such as
clue: "snowy white"

Here is an example guess based on the clue:
guess: apple
explanation: In the fairy tail Snow White, the girl is killed because she eats a poisoned apple. And the word apple has 5 letters.

I will then indicate whether I agree or disagree with your guess and provide rationale, but agreeing with a guess does not confirm its correctness. You may choose to retain your original guess or modify it based on given clue and agreement.

After each guess, your answer will be validated, and you will receive feedback indicating which letters are co

In [9]:
for i, row in data.iterrows():
    if i == 1:
        print(row.manipulated_prompt_obj['inputs'])
        break

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I need your assistance with a word game in which we need to find a 5-letter word using a clue, a guess and an explanation for the guess.

For example, if the clue is "snow white" and the guess is "apple", you will get the information like this:
clue:snow white
guess:apple
explanation:the fairy tale Snow White involves the girl eating a poisoned apple, which has 5 letters.

Your task is to either agree or disagree with my guess based on the given clue.

If you disagree, I will make another guess, and if you agree, I will proceed in the game using the same guess.

Initially, guess feedback is not available, but as the game proceeds, it becomes available. The guess feedback is not for current guess, but rather an indication of what letters can be present in the current guess. A letter highlighted in green indicates that it is a correct letter in the correct position, while yellow indicates that it is a correct letter in the wrong 

In [133]:
USER: str = 'user'
ASSISTANT: str = 'assistant'

path_interactions: str = './v1.6/claude-3-5-sonnet-20240620-t0.0--claude-3-5-sonnet-20240620-t0.0/imagegame/0_compact_grids/episode_0/interactions.json'

data: dict = prepare_instance_data(path_interactions)

In [139]:
chat_p1: list = []
chat_p2: list = []

for turn in data['turns']:
    for action in turn:
        if action['to'] == 'Player 1':
            chat_p1.append({
                'role': USER,
                'content': action['action']['content'],
                'type': action['action']['type'],
            })
        if action['to'] == 'Player 2':
            chat_p2.append({
                'role': USER,
                'content': action['action']['content'],
                'type': action['action']['type'],
            })

        if action['from'] == 'Player 1':
            chat_p1.append({
                'role': ASSISTANT,
                'content': action['action']['content'],
            })
        if action['from'] == 'Player 2':
            chat_p2.append({
                'role': ASSISTANT,
                'content': action['action']['content'],
            })            

In [140]:
chat_p1

[{'role': 'user',
  'content': 'Let us do the following task. The goal is to fill an empty grid that looks like this:\n\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n\nA filled grid below is 5 by 5 and can look like this:\n\n▢ ▢ ▢ ▢ ▢\n▢ ▢ E ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\nX X X X X\n\nI want you to describe this grid to me, step by step. You don\'t need to describe the empty squares, which are denoted with "▢". Only describe the location of letters in the grid. Then you wait for me to say "What is your next instruction?", and then you continue with the next step. Take the size of the grid into consideration while giving instructions. When you have described everything, you say "DONE".\n\nFor the filled grid above, here are the example steps.\n\nWhat is your next instruction?\nInstruction: Put an E in second row third column\n\nWhat is your next instruction?\nInstruction: Fill the last row with X\n\nWhat is your next instruction?\nInstruction: DONE\n\nOk. Please do this for the fo

In [141]:
chat_p2

[{'role': 'user',
  'content': 'Let us draw something together. There is an empty grid with a size 5 by 5, like so:\n\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n\nI will give you instructions like "put an X in the top left", and you return the grid by applying the given instruction, like so:\n\nInstruction: put an X in the top left\n\nX ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n\nNow create an empty grid with a size 5 by 5 and execute the following commands at each step. Once you execute the command, return only the grid and exclude all other text from the output.\nInstruction: Put a B in the first row, fifth column',
  'type': 'send message'},
 {'role': 'assistant',
  'content': '▢ ▢ ▢ ▢ B\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢'},
 {'role': 'user',
  'content': 'Instruction: Put a B in the second row, fourth column',
  'type': 'send message'},
 {'role': 'assistant',
  'content': '▢ ▢ ▢ ▢ B\n▢ ▢ ▢ B ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢\n▢ ▢ ▢ ▢ ▢'},
 {'role': 'user',
  