# Experiment 5 - Ephemerality in complex games

In [None]:
# Bundle everything matched by `*` in the current directory into one archive
# (tar flags: c = create, z = gzip-compress, f = write to the named file).
# NOTE(review): because of `z` the output is gzip-compressed even though its name ends in plain `.tar`.
! tar czf Experiments.tar *

In [None]:
# Shell out to `nvidia-smi` to print its GPU status report (requires the NVIDIA tooling on the host).
! nvidia-smi

In [1]:
import pickle
import random

In [2]:
import sys
sys.path.append("../Self-evaluation")

from self_evaluation import play, LLMAgentSelfEvaluate

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Notebook variables

In [None]:
# Experiment identifier; interpolated into the "./Experiment {exp_number}/..." output paths.
exp_number = 5

# Seeds 1..50 (inclusive); each seed selects one pre-generated game file.
seeds = range(1, 51)

# Passed straight through to play(): episodes per game and step budget per episode.
n_episodes, max_steps = 1, 100

In [None]:
# Load the maximum scores pickled under "Experiment 1" and keep only as many
# entries as there are seeds in this notebook (they are later used as the
# element-wise divisor for the "simple" final scores).
with open(f"./Experiment 1/max_scores.pickle", "rb") as scores_file:
    max_scores_simple = pickle.load(scores_file)
print("Data loaded.")
max_scores_simple = max_scores_simple[:len(seeds)]

# Divisor used to normalize the "cooking" final scores.
max_score_cooking = 5

## Game running

In [None]:
# Values used as `selfeval_turns` by the n-think runs below.
n_think_indices = [4, 10]

# Passed straight through to play(): episodes per game and step budget per episode.
n_episodes, max_steps = 1, 100

### Baselines: 0-think and e1-think

In [None]:
# Baseline runs on the "simple" games, one result file per configuration:
#   "0think"  -> selfeval_turns=0
#   "e1think" -> selfeval_turns=1 with reads_own_reasoning=False
# Original author's note on this cell: "these are not needed".
# NOTE(review): presumably these runs are optional here; confirm before skipping,
# since the aggregation cell reads the pickles this cell writes.
game = "simple"

baseline_configs = [
    ("0think", {"selfeval_turns": 0}),
    ("e1think", {"selfeval_turns": 1, "reads_own_reasoning": False}),
]

for run_name, agent_options in baseline_configs:
    results_all_seeds = []
    for seed in seeds:
        agent = LLMAgentSelfEvaluate(
            verbose=False,
            log=f"./Experiment {exp_number}/logs/{game}/{run_name}-seed{seed}.log",
            **agent_options
        )
        results = play(
            agent,
            f"games/{game}/seed{seed}.z8",
            max_steps=max_steps,
            n_episodes=n_episodes,
        )
        # n_episodes is 1, so only the first (and only) episode is kept.
        results_all_seeds.append(results[0])
    # The dumped list is already grouped by seed: one entry per seed, in seed order.
    with open(f"./Experiment {exp_number}/{run_name}_{game}.pickle", "wb") as f:
        pickle.dump(results_all_seeds, f)
        print("Data pickled.")

In [None]:
# Baseline runs on the level-1 "cooking" games, one result file per configuration:
#   "0think"  -> selfeval_turns=0
#   "e1think" -> selfeval_turns=1 with reads_own_reasoning=False
game = "cooking"

baseline_configs = [
    ("0think", {"selfeval_turns": 0}),
    ("e1think", {"selfeval_turns": 1, "reads_own_reasoning": False}),
]

for run_name, agent_options in baseline_configs:
    results_all_seeds = []
    for seed in seeds:
        agent = LLMAgentSelfEvaluate(
            verbose=False,
            log=f"./Experiment {exp_number}/logs/{game}/{run_name}-level1-seed{seed}.log",
            **agent_options
        )
        results = play(
            agent,
            f"games/{game}/level1-seed{seed}.z8",
            max_steps=max_steps,
            n_episodes=n_episodes,
        )
        # n_episodes is 1, so only the first (and only) episode is kept.
        results_all_seeds.append(results[0])
    # The file name carries the "_level1" suffix because the aggregation cell
    # loads "0think_cooking_level1.pickle" / "e1think_cooking_level1.pickle";
    # without the suffix those loads fail with FileNotFoundError.
    with open(f"./Experiment {exp_number}/{run_name}_{game}_level1.pickle", "wb") as f:
        pickle.dump(results_all_seeds, f)
        print("Data pickled.")

### Remaining configurations: random n-think, with and without ephemerality

In [None]:
# Random-self-evaluation runs on the "simple" games: for every n in
# n_think_indices, one run set without and one with ephemeral reasoning.
# Result/log names are "r{n}think" and, for the ephemeral variant, "er{n}think".
game = "simple"

for n in n_think_indices:
    for ephemeral in [False, True]:
        run_name = f"{'e' if ephemeral else ''}r{n}think"
        results_all_seeds = []
        for seed in seeds:
            results = play(LLMAgentSelfEvaluate(selfeval_turns=n,
                                                # The "e" (ephemeral) runs must NOT read their own
                                                # reasoning, matching the e1think baseline which
                                                # passes reads_own_reasoning=False; passing
                                                # `ephemeral` directly inverted the two conditions.
                                                reads_own_reasoning=not ephemeral,
                                                random_selfeval=True,
                                                verbose=False,
                                                log=f"./Experiment {exp_number}/logs/{game}/{run_name}-seed{seed}.log"
                                                ),
                           f"games/{game}/seed{seed}.z8", max_steps=max_steps, n_episodes=n_episodes)
            results_all_seeds.append(results[0])  # only one episode
        # One pickle per (n, ephemeral) pair, holding one entry per seed in seed order.
        with open(f"./Experiment {exp_number}/{run_name}_{game}.pickle", "wb") as f:
            pickle.dump(results_all_seeds, f)
            print("Data pickled.")

In [None]:
# Random-self-evaluation runs on the level-1 "cooking" games: for every n in
# n_think_indices, one run set without and one with ephemeral reasoning.
# Result/log names are "r{n}think" and, for the ephemeral variant, "er{n}think".
game = "cooking"

for n in n_think_indices:
    for ephemeral in [False, True]:
        run_name = f"{'e' if ephemeral else ''}r{n}think"
        results_all_seeds = []
        for seed in seeds:
            results = play(LLMAgentSelfEvaluate(selfeval_turns=n,
                                                # The "e" (ephemeral) runs must NOT read their own
                                                # reasoning, matching the e1think baseline which
                                                # passes reads_own_reasoning=False; passing
                                                # `ephemeral` directly inverted the two conditions.
                                                reads_own_reasoning=not ephemeral,
                                                random_selfeval=True,
                                                verbose=False,
                                                log=f"./Experiment {exp_number}/logs/{game}/{run_name}-level1-seed{seed}.log"
                                                ),
                           f"games/{game}/level1-seed{seed}.z8", max_steps=max_steps, n_episodes=n_episodes)
            results_all_seeds.append(results[0])  # only one episode
        # One pickle per (n, ephemeral) pair, holding one entry per seed in seed order.
        with open(f"./Experiment {exp_number}/{run_name}_{game}_level1.pickle", "wb") as f:
            pickle.dump(results_all_seeds, f)
            print("Data pickled.")

# Data aggregation for visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import bootstrap

In [None]:
# Collect, per game, one array of normalized final scores for every run
# configuration. Arrays are appended in the order: 0think, e1think, then for
# each n in n_think_indices the r{n}think run followed by the er{n}think run.
final_arrays = {"simple": [], "cooking": []}

# Result-file name prefixes, in the order described above.
run_names = ["0think", "e1think"]
for n in n_think_indices:
    for ephemeral in [False, True]:
        run_names.append(f"{'e' if ephemeral else ''}r{n}think")

# game key -> (suffix of the pickle file name, divisor used for normalization).
# "simple" is divided element-wise by max_scores_simple; "cooking" by the
# single value max_score_cooking.
score_sources = {
    "simple": ("simple", np.array(max_scores_simple)),
    "cooking": ("cooking_level1", max_score_cooking),
}

for game_key, (file_suffix, max_score) in score_sources.items():
    for run_name in run_names:
        with open(f"./Experiment {exp_number}/{run_name}_{file_suffix}.pickle", "rb") as f:
            results_all_seeds = pickle.load(f)
        # Second field of the last entry of every run.
        # NOTE(review): presumably the score reached at the final step; confirm against play().
        final_scores = [run[-1][1] for run in results_all_seeds]
        final_scores_normalized = np.array(final_scores) / max_score
        final_arrays[game_key].append(final_scores_normalized)

# Visualization

In [27]:
# Figure size as a (12, 20) tuple.
# NOTE(review): the plotting cell visible in this notebook passes its own literal figsize instead.
figsize = (12, 20)

In [None]:
# Ten hex colour codes (original author's tag: "matplotlib").
colors3 = [
    '#4c72b0',
    '#dd8452',
    '#55a868',
    '#c44e52',
    '#8172b3',
    '#937860',
    '#da8bc3',
    '#8c8c8c',
    '#ccb974',
    '#64b5cd',
]

In [None]:
# Palette handed to the boxplot in the plotting cell.
# NOTE(review): it is left empty here; confirm whether colours were meant to be filled in.
custom_palette1 = []

In [None]:
# Grouped swarm/box plot of the normalized final scores: one x-group per game,
# one hue level per run configuration.
fig, ax = plt.subplots(figsize=(12, 6))

# only for reference
grouping_labels = ["Simple", "Cooking"]

# One label per array stored in final_arrays[<game>], in the order the
# aggregation cell appends them. The n-think labels are derived from
# n_think_indices so they cannot drift from the runs that were executed.
subgrouping_labels = ["0-think", "e1-think"]
for n in n_think_indices:
    subgrouping_labels += [f"{n}-think", f"e{n}-think"]

array_length = len(final_arrays["simple"][0])

# Build the flat score vector together with its per-point group / hue labels.
# Collecting the parts and concatenating once is required: np.concatenate takes
# a *sequence* of arrays as its first argument (the second positional argument
# is the axis).
data_parts = []
grouping = []
subgrouping = []
for key, arrays in final_arrays.items():
    if len(arrays) != len(subgrouping_labels):
        raise ValueError(
            f"final_arrays[{key!r}] holds {len(arrays)} arrays, "
            f"expected {len(subgrouping_labels)}"
        )
    for label, array in zip(subgrouping_labels, arrays):
        data_parts.append(array)
        # Label lengths follow each array's own length, so the groups stay
        # aligned with the data even if the games have different sample counts.
        grouping += [key.capitalize()] * len(array)
        subgrouping += [label] * len(array)
data_combined = np.concatenate(data_parts)

# NOTE(review): custom_palette2, custom_palette3, grouping_means_medians,
# subgrouping_medians, medians, subgrouping_means and means are not defined in
# the part of the notebook visible here; they must be provided by another cell
# before this one runs.
swarm = sns.swarmplot(
    x=grouping, hue=subgrouping, y=data_combined,
    ax=ax,
    palette=custom_palette2,
    size=3,
    dodge=True,
    legend=False
)

box = sns.boxplot(x=grouping, hue=subgrouping, y=data_combined,
                  ax=ax,
                  boxprops={"alpha": 0.1},
                  showfliers=False,
                  width=0.8, gap=0.2,
                  palette=custom_palette1,
                  showmeans=True,
                  medianprops={"color": "black", "linewidth": 2, "alpha": 0.5},
                  legend=True
                  )

# Overlay per-configuration medians ("X" markers) and means ("^" markers).
scatter1 = sns.swarmplot(x=grouping_means_medians, hue=subgrouping_medians, y=medians,
                         ax=ax,
                         legend=True,
                         dodge=True,
                         palette=custom_palette3,
                         marker="X",
                         size=10
                         )
scatter2 = sns.swarmplot(x=grouping_means_medians, hue=subgrouping_means, y=means,
                         ax=ax,
                         legend=True,
                         dodge=True,
                         palette=custom_palette3,
                         marker="^",
                         size=10
                         )

# ax.set_title(f"Score comparison between fixed and random $n$-think")
ax.set_ylabel("normalized final score")
plt.show()