In [1]:
import pandas as pd
import json
import os
from json import JSONDecodeError

## Extraction Info

### raw.csv
for each benchmark there is the raw.csv containing all metrics collected.
from this file, the following information can be extracted for each episode:
- game
- model
- episode
- experiment
- Aborted
- Lose
- Success

### requests.json
For each benchmark > model > game > experiment > episode there is a requests file containing the requests sent to the models and the received answers
following information can be extracted for each episode
- inputs
- responses

### instance.json
For each benchmark > model > game > experiment > episode there is a instance.json file containing metadata for the episode
e.g. for taboo it is the target word along with the taboo words or for wordle it is the target word along with a clue for the word.
the metadata will be extracted completely and stored as an additional column.


In [2]:
def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in list(df.keys()) if column not in columns_to_keep]
    return df.drop(columns=columns_to_drop, axis=0)

## Prepare Taboo Data

In [3]:
def prepare_taboo_interaction_data(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list = []
    
    # loop over all turns
    for i, turn in enumerate(data['turns']):
        
        # loop over all actions inside one turn
        for action in turn:
            if action['to'] == 'Player 1' and action['from'] == 'GM' and action['action']['type'] == "send message":
                chat_p1.append({'role': USER, 'content': action['action']['content']})
            elif action['to'] == 'Player 2' and action['from'] == 'GM' and action['action']['type'] == "send message":
                chat_p2.append({'role': USER, 'content': action['action']['content']})
            elif action['to'] == 'GM' and action['from'] == 'Player 1' and action['action']['type'] == "get message":
                chat_p1.append({'role': ASSISTANT, 'content': action['action']['content']})
            elif action['to'] == 'GM' and action['from'] == 'Player 2' and action['action']['type'] == "get message":
                chat_p2.append({'role': ASSISTANT, 'content': action['action']['content']})
                
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_taboo_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    # fill in the taboo specifics
    _data['game'].append(row.game)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['benchmark_version'].append(bv)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_word'].append(instance_data['target_word'])
    _data['related_words'].append(instance_data['related_word'])
    return _data

## Prepare wordle no critic no clue Data

In [15]:
def prepare_wordle_no_clue_no_critic(data: dict) -> dict:
    chat: list = []
    
    for i, turn in enumerate(data['turns']):
        for action in turn:
            if action['from'] == 'GM' and action['to'] == 'Player 1' and action['action']['type'] == "send message":
                chat.append({'role': USER, 'content': action['action']['content'], 'has_error': False})
            if action['from'] == 'Player 1' and action['to'] == 'GM' and action['action']['type'] == "get message":
                chat.append({'role': ASSISTANT, 'content': action['action']['content'], 'has_error': False})
                
            if  action['from'] == 'GM' and action['to'] == 'GM' and action['action']['type'] == "metadata":
                try:
                    error: str = action['action']['game_info']['error']
                    print(error)
                    if error:
                        chat[-1]['has_error'] = True
                except KeyError:  
                    pass
            
                    
    return {'chat': chat}

def update_wordle_no_clue_no_critic_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['benchmark_version'].append(bv)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat'].append(game_data['chat'])
    _data['target_word'].append(instance_data['target_word'])
    _data['target_word_difficulty'].append(instance_data['target_word_difficulty'])
    _data['target_word_clue'].append(instance_data['target_word_clue'])

    return _data

## prepare wordle with clue and critic

In [42]:
def prepare_wordle_clue_and_critic(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list  = []

    last_inserted: str = ''
    
    for i, turn in enumerate(data['turns']):
        for action in turn:
            if action['action']['type'] == 'send message' and action['to'] == 'Player 2':
                chat_p2.append( { 'role': USER,  'content': action['action']['content'], 'has_error': False })
                last_inserted = 'Player 2'
                continue
    
            if action['action']['type'] == 'get message' and action['from'] == 'Player 2':
                chat_p2.append( {'role': ASSISTANT, 'content': action['action']['content'], 'has_error': False })
                last_inserted = 'Player 2'
                continue
    
            if action['action']['type'] == 'send message' and action['to'] == 'Player 1':
                chat_p1.append( { 'role': USER,  'content': action['action']['content'], 'has_error': False })
                last_inserted = 'Player 1'
                continue
    
            if action['action']['type'] == 'get message' and action['from'] == 'Player 1':
                chat_p1.append( { 'role': ASSISTANT,  'content': action['action']['content'], 'has_error': False})

                last_inserted = 'Player 1'
                continue

            if  action['from'] == 'GM' and action['to'] == 'GM' and action['action']['type'] == "metadata":
                try:
                    error: str = action['action']['game_info']['error']
                    print(error)
                    if error:
                        if last_inserted == 'Player 1':
                            chat_p1[-1]['has_error'] = True
                        else:
                            chat_p2[-1]['has_error'] = False
                except KeyError:
                    pass
                
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_wordle_clue_and_critic_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_word'].append(instance_data['target_word'])
    _data['target_word_difficulty'].append(instance_data['target_word_difficulty'])
    
    return _data
    

## Prepare reference game data

In [43]:
def prepare_reference_game_data(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list = []
    
    USER: str = 'user'
    ASSISTANT: str = 'assistant'
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content']
                })
            if action['to'] == 'Player 2':
                chat_p2.append({
                    'role': USER,
                    'content': action['action']['content']
                })
    
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content']
                })
            if action['from'] == 'Player 2':
                chat_p2.append({
                    'role': ASSISTANT,
                    'content': action['action']['content']
                })
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_reference_game_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_grid_name'].append(instance_data['target_grid_name'])
    
    return _data


## Private Shared

In [44]:
def prepare_private_shared_data(data: dict) -> dict:
    chat_p1: list = []
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
                
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                    'type': action['action']['type'],
                })
    return {'chat_p1': chat_p1}

def update_private_shared(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['slots'].append(instance_data['slots'])
    
    return _data

## Prepare Image game

In [45]:
def prepare_image_game_data(data: dict) -> dict:
    chat_p1: list = []
    chat_p2: list = []
    
    for turn in data['turns']:
        for action in turn:
            if action['to'] == 'Player 1':
                chat_p1.append({
                    'role': USER,
                    'content': action['action']['content'],
                })
            if action['to'] == 'Player 2':
                chat_p2.append({
                    'role': USER,
                    'content': action['action']['content'],
                })
    
            if action['from'] == 'Player 1':
                chat_p1.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                })
            if action['from'] == 'Player 2':
                chat_p2.append({
                    'role': ASSISTANT,
                    'content': action['action']['content'],
                })
                
    return {'chat_p1': chat_p1, 'chat_p2': chat_p2}

def update_image_game_data(_data: dict, instance_data: dict, row: pd.DataFrame, game_data: dict, bv: str) -> dict:
    _data['game'].append(row.game)
    _data['benchmark_version'].append(bv)
    _data['game_id'].append(instance_data['game_id'])
    _data['model'].append(row.model)
    _data['experiment'].append(row.experiment)
    _data['episode'].append(row.episode)
    _data['Aborted'].append(row.Aborted)
    _data['Lose'].append(row.Lose)
    _data['Success'].append(row.Success)
    _data['chat_p1'].append(game_data['chat_p1'])
    _data['chat_p2'].append(game_data['chat_p2'])
    _data['target_grid'].append(instance_data['target_grid'])

    return _data

### Open JSON files

In [46]:
def prepare_instance_data(path: str) -> dict:
    with open(path, 'r') as f:
        data: dict = json.load(f)
        return data


In [55]:
benchmark_versions_old: list = ['v0.9', 'v1.0']

interaction_response_token_old: list = ['completion', 'response']
lookup_response_token_old: dict = {key: val for key, val in zip(benchmark_versions_old, interaction_response_token_old)}
benchmark_versions_new: list = ['v1.5', 'v1.5_quantized','v1.6','v1.6_backends','v1.6_quantized',]
columns_to_keep_raw_csv: list = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']
games: list = ['privateshared', 'referencegame', 'taboo', 'wordle', 'wordle_withclue', 'wordle_withcritic', 'imagegame']  # imagegame

In [56]:
taboo_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'benchmark_version': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_word': [],
    'related_words': []
}

wordle_no_clue_no_critic_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'benchmark_version': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat': [],
    'target_word': [],
    'target_word_difficulty': [],
    'target_word_clue': []
}

wordle_with_clue_no_critic_data: dict = {
    'game': [],
    'game_id': [],
    'model': [],
    'benchmark_version': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat': [],
    'target_word': [],
    'target_word_difficulty': [],
    'target_word_clue': []
}

wordle_with_clue_with_critic_data: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_word': [],
    'target_word_difficulty': [],
}

reference_game: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_grid_name': []
}

privateshared: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'slots': []
}

image_game: dict = {
    'game': [],
    'benchmark_version': [],
    'game_id': [],
    'model': [],
    'experiment': [],
    'episode': [],
    'Aborted': [],
    'Lose': [],
    'Success': [],
    'chat_p1': [],
    'chat_p2': [],
    'target_grid': []
}

In [57]:
extract_game_instance_data: dict = {
    'taboo': prepare_taboo_interaction_data,
    'wordle': prepare_wordle_no_clue_no_critic,
    'wordle_withclue': prepare_wordle_no_clue_no_critic,
    'wordle_withcritic': prepare_wordle_clue_and_critic,
    'referencegame': prepare_reference_game_data,
    'privateshared': prepare_private_shared_data,
    'imagegame': prepare_image_game_data,
}

update_game_data: dict = {
    'taboo': update_taboo_data,
    'wordle': update_wordle_no_clue_no_critic_data,
    'wordle_withclue': update_wordle_no_clue_no_critic_data,
    'wordle_withcritic': update_wordle_clue_and_critic_data,
    'referencegame': update_reference_game_data,
    'privateshared': update_private_shared,
    'imagegame': update_image_game_data,
}

data_collectors: dict = {
    'taboo': taboo_data,
    'wordle': wordle_no_clue_no_critic_data,
    'wordle_withclue': wordle_with_clue_no_critic_data,
    'wordle_withcritic': wordle_with_clue_with_critic_data,
    'referencegame': reference_game,
    'privateshared': privateshared,
    'imagegame': image_game,
}

In [58]:
USER = 'user'
ASSISTANT = 'assistant'

for benchmark_version in benchmark_versions_new:
    # read the raw_csv
    raw_csv_data: pd.DataFrame = pd.read_csv(f'./{benchmark_version}/raw.csv')
    
    # group by metric to obtain all episode information
    clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_csv_data, columns_to_keep=columns_to_keep_raw_csv)
    
    # loop over all entries and build paths
    for index, row in clean_csv_data.iterrows():
        
        # built paths
        path_requests_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/requests.json'
        path_instance_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/instance.json'
        path_interaction_json: str = f'./{benchmark_version}/{row.model}/{row.game}/{row.experiment}/{row.episode}/interactions.json'

        # check that paths are correctly built
        try:
            assert os.path.isfile(path_requests_json)
            assert os.path.isfile(path_instance_json)
            assert os.path.isfile(path_interaction_json)
        except AssertionError:
            print('assertion error in version', benchmark_version, ' on path_instance_json', path_instance_json, ' on path_requests_json', path_requests_json)
            continue
        
        instance_data: dict = prepare_instance_data(path=path_instance_json)

        try:
            interaction_data: dict = prepare_instance_data(path_interaction_json)
        except JSONDecodeError as e:
            print(e)
            print(path_instance_json)
            continue
        
        # get the data collector
        data_collector: dict = data_collectors[row.game]
        
        if not data_collector:
            print(row.game)
        
        # extract the game data
        
        game_data: dict = extract_game_instance_data[row.game](data=interaction_data)

        # add data to the data collector
        data_collectors[row.game] = update_game_data[row.game](_data=data_collector, instance_data=instance_data, row=row, game_data=game_data, bv=benchmark_version)
            

None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
None
None
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
INVALID_WORD
INVALID_WORD
INVALID_WORD
None
None
None
None
None
None
None
None
None
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
INVALID_WORD_LENGTH
INVALID_WORD
INVALID_WORD
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
No

In [59]:
d = pd.DataFrame(data=data_collectors['wordle_withcritic'])
len(d)

3791

In [61]:

base_path: str = './Data/Data_New'
for key in data_collectors.keys():
    save_dir: str = base_path + f'/{key}_new.jsonl'
    d = pd.DataFrame(data=data_collectors[key])
    print(save_dir)
    d.to_json(save_dir, orient='records', lines=True)

./Data/Data_New/taboo_new.jsonl
./Data/Data_New/wordle_new.jsonl
./Data/Data_New/wordle_withclue_new.jsonl
./Data/Data_New/wordle_withcritic_new.jsonl
./Data/Data_New/referencegame_new.jsonl
./Data/Data_New/privateshared_new.jsonl
./Data/Data_New/imagegame_new.jsonl


In [53]:
d.chat_p1.iloc[4]

[{'role': 'user',
  'content': 'You are a language wizard who likes to guess words by using the given rules.\n\nWelcome to Wordle! You have six attempts to guess the target word, a valid English word of five lowercase letters (a-z). Please use the tags "guess:" and "explanation:" to provide a concise explanation for each guess.\n\nTo help you make an informed guess, you will receive a clue for the word, such as\nclue: "snowy white"\n\nHere is an example guess based on the clue:\nguess: apple\nexplanation: In the fairy tail Snow White, the girl is killed because she eats a poisoned apple. And the word apple has 5 letters.\n\nI will then indicate whether I agree or disagree with your guess and provide rationale, but agreeing with a guess does not confirm its correctness. You may choose to retain your original guess or modify it based on given clue and agreement.\n\nAfter each guess, your answer will be validated, and you will receive feedback indicating which letters are correct (green),