In [1]:
import pandas as pd
from pandas import DataFrame

# Notes:

The file `interactions.json` contains the complete game interactions of all players in each episode.

A game is stored as a JSON object whose key **turns** contains an array of all turns.
The last turn indicates whether a game ended successfully or not.
The turn object contains an action object with the key `type`. This action type can have the following values:
- "correct guess" -> indicates that a game ended with a correct guess of the player
- "invalid format" -> indicates that a game ended because the answer format was incorrect. This can happen when a related word was used or the output is not in the CLUE: \< ... \> format
- "max turns reached" -> indicates that a game ended because the maximum number of turns was reached
- "invalid clue" -> indicates that a game ended because an invalid clue was provided

In [15]:
# Location of one example episode: benchmark run / game variant / episode folder.
# Each part ends with '/' so the parts can be joined directly.
base_path_llama3_70B: str = './v1.6/Meta-Llama-3-70B-Instruct-hf-t0.0--Meta-Llama-3-70B-Instruct-hf-t0.0/'
game: str = 'taboo/2_low_en/'
episode: str = 'episode_0/'

# Load the two JSON files of that episode into DataFrames for inspection
instance_data: DataFrame = pd.read_json(f'{base_path_llama3_70B}{game}{episode}instance.json')
request_data: DataFrame = pd.read_json(f'{base_path_llama3_70B}{game}{episode}requests.json')

In [16]:
# Preview the first five rows of the example episode's instance file
instance_data.head(n=5)

Unnamed: 0,game_id,target_word,related_word,target_word_stem,related_word_stem
0,0,autograph,signature,autograph,signatur
1,0,autograph,hand,autograph,hand
2,0,autograph,sign,autograph,sign


In [4]:
# Preview the first five logged requests of the example episode
request_data.head(n=5)

Unnamed: 0,timestamp,manipulated_prompt_obj,raw_response_obj
0,2024-05-28 09:08:55.997695,{'inputs': '<|begin_of_text|><|start_header_id...,{'response': '<|begin_of_text|><|start_header_...
1,2024-05-28 09:09:00.340713,{'inputs': '<|begin_of_text|><|start_header_id...,{'response': '<|begin_of_text|><|start_header_...


In [5]:
# Print the 'inputs' text of the prompt object logged for the second request (row index 1)
second_prompt_obj = request_data['manipulated_prompt_obj'].iloc[1]
print(second_prompt_obj['inputs'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are playing a collaborative word guessing game in which you have to guess a target word that another player describes to you.

You can make one guess at each trial. You win when you guess the target word. You lose when you cannot guess it in 3 tries.

After each trial you will get a new hint from the other player which starts with CLUE.

Make your guesses by just saying the word using the following form: GUESS: <a word>

Let us start.


CLUE: Written confirmation of identity by a famous person.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [6]:
# Print the 'response' text of the raw response object logged for the second request (row index 1)
second_response_obj = request_data['raw_response_obj'].iloc[1]
print(second_response_obj['response'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are playing a collaborative word guessing game in which you have to guess a target word that another player describes to you.

You can make one guess at each trial. You win when you guess the target word. You lose when you cannot guess it in 3 tries.

After each trial you will get a new hint from the other player which starts with CLUE.

Make your guesses by just saying the word using the following form: GUESS: <a word>

Let us start.


CLUE: Written confirmation of identity by a famous person.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

GUESS: Autograph<|eot_id|>


In [23]:
import os
import plotly.graph_objects as go
import json

In [24]:
# Benchmark result folders that are scanned (one folder per benchmark version)
top_level_directories: list = [
    'v0.9',
    'v1.0',
    'v1.5',
    'v1.5_quantized',
    'v1.6',
    'v1.6_backends',
    'v1.6_quantized',
]

# Name of the game folder inside every model directory
game: str = 'taboo'

# Sub-folders of the game folder, one per game variant
game_variants: list = [
    '0_high_en',
    '1_medium_en',
    '2_low_en',
]

In [31]:
# Store general information about the amount of episodes and how they ended.
# One counter dict per game variant; the ending keys are the action types
# read from the last turn of an episode's interactions.json.
episodes: dict = {
    'available_episodes': 0,
    **{
        variant: {
            'total_episodes': 0,
            'correct guess': 0,
            'invalid format': 0,
            'max turns reached': 0,
            'invalid clue': 0,
        }
        for variant in game_variants
    },
}

# collect information about the different game domains
unique_game_domains: dict = {}

# collect information in dataframe format (one list entry per episode and column)
df_dict: dict = {
    'benchmark_version': [],
    'model_id': [],
    'difficulty': [],
    'number_of_turns': [],
    'end_of_game_type': [],
    'episode_number': [],
    'target_word': [],
}

for top_level_directory in top_level_directories:
    # loop over all directories with all variants of the benchmark;
    # entries that are not directories are ignored
    all_available_models: list = [
        model_name
        for model_name in os.listdir(f'./{top_level_directory}')
        if os.path.isdir(f'./{top_level_directory}/{model_name}')
    ]

    for model_name in all_available_models:
        # for each model pick all episodes from the taboo game folder
        for game_variant in game_variants:
            variant_path: str = f'./{top_level_directory}/{model_name}/{game}/{game_variant}'

            # os.listdir raises FileNotFoundError for a missing directory, so a
            # model folder without this game/variant is skipped instead of
            # aborting the whole scan
            if not os.path.isdir(variant_path):
                continue

            # keep only names starting with 'episode' so other files are not read
            all_episodes: list = [
                episode for episode in os.listdir(variant_path) if episode.startswith('episode')
            ]

            # count the number of all episodes
            episodes['available_episodes'] += len(all_episodes)
            # count the number of episodes of a particular game variant
            episodes[game_variant]['total_episodes'] += len(all_episodes)

            # loop over all episodes to check whether an episode ended with success or any problems
            for episode in all_episodes:
                episode_path: str = f'{variant_path}/{episode}'
                path_interactions: str = f'{episode_path}/interactions.json'
                path_instances: str = f'{episode_path}/instance.json'

                # both files are required; episodes missing either one are skipped
                if not (os.path.isfile(path_interactions) and os.path.isfile(path_instances)):
                    continue

                # read all interactions of the episode
                with open(path_interactions, encoding="utf8") as json_file:
                    interactions: dict = json.load(json_file)

                # the action type of the last entry of the last turn tells how the game ended
                last_turn: list = interactions['turns'][-1]
                game_ending: str = last_turn[-1]['action']['type']

                # read the target word of the episode from instance.json
                instance_data: DataFrame = pd.read_json(path_instances)
                target_word: str = instance_data.target_word.iloc[0]

                # update counters and columns only after both files were read
                # successfully, so all df_dict lists always have equal length
                episodes[game_variant][game_ending] += 1

                df_dict['benchmark_version'].append(top_level_directory)
                df_dict['model_id'].append(model_name)
                df_dict['difficulty'].append(game_variant)
                df_dict['number_of_turns'].append(len(interactions['turns']))
                df_dict['end_of_game_type'].append(game_ending)
                df_dict['episode_number'].append(episode)
                df_dict['target_word'].append(target_word)

In [42]:
# Turn the collected column lists into one DataFrame (one row per episode)
data_df: DataFrame = DataFrame(data=df_dict)

# Count, per target word, the episodes that ended with a correct guess
data_df.loc[data_df['end_of_game_type'] == 'correct guess', 'target_word'].value_counts()

target_word
obvious           79
responsibility    78
shark             78
filter            77
regret            76
                  ..
slovak             4
mark               4
cameroon           2
statute            2
diy                1
Name: count, Length: 117, dtype: int64

In [50]:
# Filter once per game ending instead of re-filtering data_df inside every f-string
successful_df: DataFrame = data_df[data_df.end_of_game_type == "correct guess"]
n_invalid_format: int = len(data_df[data_df.end_of_game_type == "invalid format"])
n_max_turns: int = len(data_df[data_df.end_of_game_type == "max turns reached"])
n_invalid_clue: int = len(data_df[data_df.end_of_game_type == "invalid clue"])

# Overall numbers across all collected episodes
print(f'There is a total of: {len(data_df)} episodes available')
print(f'With a total of {len(data_df.target_word.unique())} unique target words')
print(f'and a total of {data_df.number_of_turns.sum()} played turns. Each turn is a pair of Clue and Guess.')
print()
# Breakdown by game ending
print(f'Within those {len(data_df)} episodes, only {len(successful_df)} were finished successfully')
print(f'{n_invalid_format} ended due to invalid answer formatting, {n_max_turns} ended while the maximum number of turns was reached, and {n_invalid_clue} ended because invalid clues were provided.')
print()
# Numbers restricted to episodes that ended with a correct guess
print(f'Within the {len(successful_df)} successfully played games, there is a total of {successful_df.target_word.nunique()} unique target words and a total of {successful_df.number_of_turns.sum()} played turns')

There is a total of: 9843 episodes available
With a total of 119 unique target words
and a total of 14825 played turns. Each turn is a pair of Clue and Guess.

Within those 9843 episodes, only 3481 were finished successfully
3655 ended due to invalid answer formatting, 580 ended while the maximum umber of turns was reached, and 2127 ended because invalid clues were provided.

Within the 3481 sucessfully played games, there is a total of 117 unique target words and a total of 4203 played turns


In [69]:
# Print a small report per benchmark version. The field widths are chosen so
# that label + value always span 50 characters and the numbers line up.
for benchmark_version in top_level_directories:
    all_data: DataFrame = data_df[data_df.benchmark_version == benchmark_version]
    # episodes of this version that ended with a correct guess (filtered once, reused below)
    successful_data: DataFrame = all_data[all_data.end_of_game_type == "correct guess"]

    print(f'***** Benchmark Version {benchmark_version} *****')
    print(f'Number of Episodes: {len(all_data): > 30}')
    print(f'Number of Successfully Played Episodes: {len(successful_data): > 10}')
    print(f'Number of turns: {successful_data.number_of_turns.sum(): > 33}')
    # width 26 (was 25): the label is one character shorter than needed for the 50-character line
    print(f'Number of target words: {successful_data.target_word.nunique(): > 26}')

***** Benchmark Version v0.9 *****
Number of Episodes:                            644
Number of Successfully Played Episodes:        193
Number of turns:                               226
Number of target words:                        52
***** Benchmark Version v1.0 *****
Number of Episodes:                           2419
Number of Successfully Played Episodes:        641
Number of turns:                               739
Number of target words:                        57
***** Benchmark Version v1.5 *****
Number of Episodes:                           2400
Number of Successfully Played Episodes:        909
Number of turns:                              1068
Number of target words:                        60
***** Benchmark Version v1.5_quantized *****
Number of Episodes:                            480
Number of Successfully Played Episodes:        131
Number of turns:                               163
Number of target words:                        51
***** Benchmark Version v1.6 *****
Num

| Benchmark Version          | Number of Episodes | Number of Successfully Played Episodes | Number of turns | Number of target words |
|----------------------------|--------------------|----------------------------------------|-----------------|------------------------|
| v0.9                       | 644                | 193                                    | 226             | 52                     |
| v1.0                       | 2419               | 641                                    | 739             | 57                     |
| v1.5                       | 2400               | 909                                    | 1068            | 60                     |
| v1.5_quantized             | 480                | 131                                    | 163             | 51                     |
| v1.6                       | 3180               | 1221                                   | 1488            | 60                     |
| v1.6_backends              | 360                | 224                                    | 326             | 53                     |
| v1.6_quantized             | 360                | 162                                    | 193             | 49                     |


In [70]:
# Start a fresh, empty column store: every column name maps to its own new list.
df_dict: dict = {
    column_name: []
    for column_name in (
        'game_id',
        'benchmark_version',
        'model_id',
        'difficulty',
        'number_of_turns',
        'end_of_game_type',
        'episode_number',
        'target_word',
    )
}

In [71]:
# define data
data = episodes

# Colours used in the diagram: blue for the episode/category level,
# green for the successful ending, red for the three failure endings
blue, green, red = "#a6dced", "#89c289", "#f2a1a5"

# Keys of `data` in the order in which their nodes appear in the diagram
variant_keys = ['0_high_en', '1_medium_en', '2_low_en']
ending_keys = ['correct guess', 'invalid format', 'max turns reached', 'invalid clue']
ending_colors = [green, red, red, red]

# Define nodes: index 0 is the root, 1-3 are the categories, 4-7 the game endings
nodes = {
    "label": ["Total Episodes of Taboo Played", "Category: High English", "Category: Medium English", "Category: Low English", "Game Ending: Correct Guess", "Game Ending: Invalid Format", "Game Ending: Max Turns Reached", "Game Ending: Invalid Clue"],
    "color": [blue] * 4 + ending_colors,
}
variant_nodes = [1, 2, 3]
ending_nodes = [4, 5, 6, 7]

# Define links: first root -> category (3 links), then for every category
# one link to each of the four game endings (3 x 4 links)
links = {
    "source": [0] * len(variant_nodes) + [node for node in variant_nodes for _ in ending_nodes],
    "target": variant_nodes + ending_nodes * len(variant_nodes),
    "value": (
        [data[variant]['total_episodes'] for variant in variant_keys]
        + [data[variant][ending] for variant in variant_keys for ending in ending_keys]
    ),
    "color": [blue] * len(variant_nodes) + ending_colors * len(variant_nodes),
}

# Create the Sankey diagram
sankey_node = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes["label"],
    color=nodes["color"],
)
sankey_link = dict(
    source=links["source"],  # indices correspond to labels in the nodes list
    target=links["target"],
    value=links["value"],
    color=links["color"],
)
fig = go.Figure(data=[go.Sankey(node=sankey_node, link=sankey_link)])

# Title the figure, save it as a standalone HTML file and display it inline
fig.update_layout(title_text="Sankey Diagram of Game Episodes of Taboo from Start to Ending", font_size=15)
fig.write_html("sankey_diagram_episodes_taboo.html")
fig.show()