In [1]:
import pandas as pd
from pandas import DataFrame
import os
import json

# info
Has a key that indicates whether a game was aborted or not

two staged game:
1. The questioner asks for specific information
2. the answerer is asked if the questioner knows specific key information already or not.

The main focus is on the probes (2) not the conversational stage (1).

For knowledge distillation, both sides can be considered and included in a dataset.

### Turns:
in each turn object, there is a full round of probing.
A turn begins with the questioner asking a questions to the answerer.
Afterwards there is a full round of probing all probing questions.

In [2]:
top_level_directories: list = ['v0.9', 'v1.0', 'v1.5', 'v1.5_quantized', 'v1.6', 'v1.6_backends', 'v1.6_quantized']

game: str = 'privateshared'

# Change the variants according to the game above
game_variants: list = ['3_things-places',  '2_restaurant',  '0_travel-booking', '1_job-interview', '4_letter-number']

In [3]:
os.listdir('./v0.9/claude-v1.3-t0.0--claude-v1.3-t0.0/privateshared')

['3_things-places',
 '2_restaurant',
 '0_travel-booking',
 '1_job-interview',
 '4_letter-number']

In [4]:
df_dict: dict = {
    'benchmark_version': [],
    'model_id': [],
    'difficulty': [],
    'number_of_turns': [],
    'end_of_game_type': [],
    'episode_number': [],
    'accuracy': [],
}

In [5]:
for top_level_directory in top_level_directories:
    # loop over all directories with all variants of the benchmark
    all_available_models: list = os.listdir(f'./{top_level_directory}')
    # filter all files that are not a directory
    all_available_models = [model_name for model_name in all_available_models if os.path.isdir(f'./{top_level_directory}/{model_name}')]
    for model_name in all_available_models:
        # for each model pick all episodes from the taboo game folder
        for game_variant in game_variants:
            path_to_episodes: str = f'./{top_level_directory}/{model_name}/{game}/{game_variant}'
            if not  os.path.isdir(path_to_episodes):
                continue

            all_episodes: list =  os.listdir(path_to_episodes)
            # filter all file names that do not start with episode to ensure to not read files without episodes
            all_episodes = [episode for episode in all_episodes if episode.startswith('episode')]

            # loop over all episodes to check whether an episode ended with success or any problems
            for episode in all_episodes:
                path_interactions: str = f'./{top_level_directory}/{model_name}/{game}/{game_variant}/{episode}/interactions.json'
                path_episode_scores: str = f'./{top_level_directory}/{model_name}/{game}/{game_variant}/{episode}/scores.json'

                if os.path.isfile(path_interactions) and os.path.isfile(path_episode_scores):
                    # open the interaction.json file to read all interactions
                    with open(path_interactions,  encoding="utf8") as json_file:
                        interactions: dict = json.load(json_file)
                        episode_scores: DataFrame = pd.read_json(path_episode_scores)

                        # get game ending
                        game_ending: str =  'aborted' if interactions['Aborted'] else 'success'

                        # For privateshared, there is no target word.
                        accuracy: float = episode_scores['episode scores'].loc['Accuracy']

                        # add information to df_dict
                        df_dict['benchmark_version'].append(top_level_directory)
                        df_dict['model_id'].append(model_name)
                        df_dict['difficulty'].append(game_variant)
                        df_dict['number_of_turns'].append(len(interactions['turns']))
                        df_dict['end_of_game_type'].append(game_ending)
                        df_dict['episode_number'].append(episode)
                        df_dict['accuracy'].append(accuracy)

  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_json(path_episode_scores)
  episode_scores: DataFrame = pd.read_js

In [6]:
for key in df_dict:
    print(key, len(df_dict[key]))

benchmark_version 8697
model_id 8697
difficulty 8697
number_of_turns 8697
end_of_game_type 8697
episode_number 8697
accuracy 8697


In [7]:
data_df: DataFrame = DataFrame(data=df_dict)

In [8]:
data_df.head()

Unnamed: 0,benchmark_version,model_id,difficulty,number_of_turns,end_of_game_type,episode_number,accuracy
0,v0.9,luminous-supreme-t0.0--luminous-supreme-t0.0,3_things-places,1,aborted,episode_5,
1,v0.9,luminous-supreme-t0.0--luminous-supreme-t0.0,3_things-places,1,aborted,episode_2,
2,v0.9,luminous-supreme-t0.0--luminous-supreme-t0.0,3_things-places,1,aborted,episode_3,
3,v0.9,luminous-supreme-t0.0--luminous-supreme-t0.0,3_things-places,1,aborted,episode_4,
4,v0.9,luminous-supreme-t0.0--luminous-supreme-t0.0,3_things-places,1,aborted,episode_1,


In [9]:
print('| Benchmark Version          | Number of Episodes | Number of Successfully Played Episodes | Number of turns | Accuracy | Perfect Episodes (acc 1.0) | Perfect Turns (Episode acc=1.0) |')
print('|----------------------------|--------------------|----------------------------------------|-----------------|------------------------|---------------|---------------|')
for benchmark in top_level_directories:
    data: DataFrame = data_df[data_df.benchmark_version == benchmark]
    num_episodes: int = len(data)
    number_wins: int = len(data[data.end_of_game_type == 'success'])
    number_turns: int = data[data.end_of_game_type == 'success'].number_of_turns.sum()
    accuracy: int = data[data.end_of_game_type == 'success'].accuracy.mean()
    perfect_episodes: int = len(data[(data.end_of_game_type == 'success') & (data.accuracy == 1.0)])
    perfect_turns: int = data[(data.end_of_game_type == 'success') & (data.accuracy == 1.0)].number_of_turns.sum()
    print(f'|{benchmark}|{num_episodes}|{number_wins}|{number_turns}|{accuracy*100:.2f}|{perfect_episodes}|{perfect_turns}|')

| Benchmark Version          | Number of Episodes | Number of Successfully Played Episodes | Number of turns | Accuracy | Perfect Episodes (acc 1.0) | Perfect Turns (Episode acc=1.0) |
|----------------------------|--------------------|----------------------------------------|-----------------|------------------------|---------------|---------------|
|v0.9|449|139|1234|85.99|4|24|
|v1.0|2048|695|6485|74.76|46|366|
|v1.5|1900|761|7056|78.16|69|529|
|v1.5_quantized|400|112|1212|73.13|5|30|
|v1.6|3300|1537|14062|79.87|146|1101|
|v1.6_backends|300|291|2646|78.93|21|126|
|v1.6_quantized|300|279|2469|73.38|11|66|
