# Test 4 - Comparison vs FM paper: CoinCollector, CookingGame, TreasureHunter

In [1]:
# Archive everything in the current directory into Testing.tar.
# NOTE(review): the `z` flag gzip-compresses the archive even though the file name ends in plain `.tar`.
! tar czf Testing.tar *

In [None]:
# Print the current NVIDIA GPU / driver status before loading the model.
! nvidia-smi

In [2]:
import pickle
import random

In [3]:
import sys
sys.path.append("../Self-evaluation")

from self_evaluation import play, LLMAgentSelfEvaluate

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Game generation

Idea:
- two models used: no-think and 10-think
- different difficulty levels: 6 for coin collector, ~~5 for common sense,~~ 5 for cooking, 8 for treasure hunter
- 5 seeds per case (vs 40 in FM paper), 1 episode per seed
- total episodes: 240 (190 without commonsense)

In [4]:
# Number of difficulty levels evaluated for each game family.
levels = {
    "coin_collector": 6,
    "cooking": 5,
    "treasure_hunter": 8,
}

# Seeds 1..5: one game file is generated and played per (game, level, seed).
seeds = range(1, 6)

In [None]:
import subprocess

# Generate one game file per (game, level, seed) with `tw-make`.
# Cooking games are skipped here because they are generated separately with
# explicit --go/--take/--recipe options instead of a --level.
failed_outputs = []
for game, n_levels in levels.items():
    if game == "cooking":
        continue
    for level in range(1, n_levels + 1):
        for seed in seeds:
            output_path = f"games/{game}/level{level}-seed{seed}.z8"
            completed = subprocess.run(
                ["tw-make", f"tw-{game}", "--level", f"{level}", "--seed", f"{seed}", "--output", output_path]
            )
            # Keep going on failure (best effort), but remember what went wrong so a
            # missing game file is noticed here rather than later while playing.
            if completed.returncode != 0:
                failed_outputs.append(output_path)

if failed_outputs:
    print(f"tw-make returned a non-zero exit code for {len(failed_outputs)} game(s): {failed_outputs}")

In [8]:
!tw-make tw-cooking --seed 1 --go 1 --take 1 --recipe 1 --cut --cook --output games/cooking/level1-seed1.z8
!tw-make tw-cooking --seed 2 --go 1 --take 1 --recipe 1 --cut --cook --output games/cooking/level1-seed2.z8
!tw-make tw-cooking --seed 3 --go 1 --take 1 --recipe 1 --cut --cook --output games/cooking/level1-seed3.z8
!tw-make tw-cooking --seed 4 --go 1 --take 1 --recipe 1 --cut --cook --output games/cooking/level1-seed4.z8
!tw-make tw-cooking --seed 5 --go 1 --take 1 --recipe 1 --cut --cook --output games/cooking/level1-seed5.z8

!tw-make tw-cooking --seed 1 --go 1 --take 2 --recipe 2 --cut --cook --output games/cooking/level2-seed1.z8
!tw-make tw-cooking --seed 2 --go 1 --take 2 --recipe 2 --cut --cook --output games/cooking/level2-seed2.z8
!tw-make tw-cooking --seed 3 --go 1 --take 2 --recipe 2 --cut --cook --output games/cooking/level2-seed3.z8
!tw-make tw-cooking --seed 4 --go 1 --take 2 --recipe 2 --cut --cook --output games/cooking/level2-seed4.z8
!tw-make tw-cooking --seed 5 --go 1 --take 2 --recipe 2 --cut --cook --output games/cooking/level2-seed5.z8

!tw-make tw-cooking --seed 1 --go 1 --take 3 --recipe 3 --cut --cook --output games/cooking/level3-seed1.z8
!tw-make tw-cooking --seed 2 --go 1 --take 3 --recipe 3 --cut --cook --output games/cooking/level3-seed2.z8
!tw-make tw-cooking --seed 3 --go 1 --take 3 --recipe 3 --cut --cook --output games/cooking/level3-seed3.z8
!tw-make tw-cooking --seed 4 --go 1 --take 3 --recipe 3 --cut --cook --output games/cooking/level3-seed4.z8
!tw-make tw-cooking --seed 5 --go 1 --take 3 --recipe 3 --cut --cook --output games/cooking/level3-seed5.z8

!tw-make tw-cooking --seed 1 --go 6 --take 1 --recipe 1 --cut --cook --output games/cooking/level4-seed1.z8
!tw-make tw-cooking --seed 2 --go 6 --take 1 --recipe 1 --cut --cook --output games/cooking/level4-seed2.z8
!tw-make tw-cooking --seed 3 --go 6 --take 1 --recipe 1 --cut --cook --output games/cooking/level4-seed3.z8
!tw-make tw-cooking --seed 4 --go 6 --take 1 --recipe 1 --cut --cook --output games/cooking/level4-seed4.z8
!tw-make tw-cooking --seed 5 --go 6 --take 1 --recipe 1 --cut --cook --output games/cooking/level4-seed5.z8

!tw-make tw-cooking --seed 1 --go 6 --take 2 --recipe 2 --cut --cook --output games/cooking/level5-seed1.z8
!tw-make tw-cooking --seed 2 --go 6 --take 2 --recipe 2 --cut --cook --output games/cooking/level5-seed2.z8
!tw-make tw-cooking --seed 3 --go 6 --take 2 --recipe 2 --cut --cook --output games/cooking/level5-seed3.z8
!tw-make tw-cooking --seed 4 --go 6 --take 2 --recipe 2 --cut --cook --output games/cooking/level5-seed4.z8
!tw-make tw-cooking --seed 5 --go 6 --take 2 --recipe 2 --cut --cook --output games/cooking/level5-seed5.z8

Global seed: 1
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level1-seed1.z8
Global seed: 2
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level1-seed2.z8
Global seed: 3
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level1-seed3.z8
Global seed: 4
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level1-seed4.z8
Global seed: 5
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level1-seed5.z8
Global seed: 1
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level2-seed1.z8
Global seed: 2
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level2-seed2.z8
Global seed: 3
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level2-seed3.z8
Global seed: 4
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/games/cooking/level2-seed4.z8
Global seed: 5
Game generated: /TextWorld/notebooks/LLM-PTG/Code/Testing/

## Game running

In [9]:
# Values passed as `selfeval_turns` to the agent: 0 ("no-think") and 10 ("10-think").
n_think_indices = [0, 10]
# Episodes played per game file; the run loop keeps only the first one (results[0]).
n_episodes = 1
# Step budget handed to play() through its max_steps argument.
max_steps = 100

In [10]:
# Pair every game name with its number of difficulty levels and show the pairs.
zipped_levels = [(game_name, n_levels) for game_name, n_levels in levels.items()]
print(zipped_levels)

[('coin_collector', 6), ('cooking', 5), ('treasure_hunter', 8)]


In [None]:
import os

# Make sure the output directory exists up front: otherwise the first open()
# below fails only after five (expensive) games have already been played.
os.makedirs("./Testing 4", exist_ok=True)

# Play every (n-think model, game, level) combination on all seeds and store one
# pickle per combination, containing the single episode of each seed in seed order.
for n in n_think_indices:
    for game in levels.keys():
        for level in range(1, levels[game]+1):
            results_all_seeds = []
            for seed in seeds:
                agent = LLMAgentSelfEvaluate(selfeval_turns=n, verbose=False)
                results = play(agent, f"games/{game}/level{level}-seed{seed}.z8",
                               max_steps=max_steps, n_episodes=n_episodes)
                results_all_seeds.append(results[0]) # only one episode
            # The `with` block closes the file; no explicit close() is needed.
            with open(f"./Testing 4/{n}think_{game}_level{level}", "wb") as f:
                pickle.dump(results_all_seeds, f) # so we dump already with grouping by seed
            print("Data pickled.")

level1-seed1.z8.  	avg. steps:   1.0; avg. score:  1.0 / 1.
level1-seed2.z8.  	avg. steps:   1.0; avg. score:  1.0 / 1.
level1-seed3.z8.  	avg. steps:   1.0; avg. score:  1.0 / 1.
level1-seed4.z8.  	avg. steps:   1.0; avg. score:  1.0 / 1.
level1-seed5.z8.  	avg. steps:   1.0; avg. score:  1.0 / 1.
Data pickled.
level2-seed1.z8.  	avg. steps:   2.0; avg. score:  1.0 / 1.
level2-seed2.z8.  	avg. steps:   2.0; avg. score:  1.0 / 1.
level2-seed3.z8.  	avg. steps:   2.0; avg. score:  1.0 / 1.
level2-seed4.z8.  	avg. steps:   2.0; avg. score:  1.0 / 1.
level2-seed5.z8.  	avg. steps:   2.0; avg. score:  1.0 / 1.
Data pickled.
level3-seed1.z8.  	avg. steps:  88.0; avg. score:  1.0 / 1.
level3-seed2.z8

# Data aggregation for visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import bootstrap

In [None]:
# Placeholders for the per-game aggregates of this test (not filled in yet).
# np.array() requires an argument, so start from explicit empty arrays.
coin_nothink = np.array([])
coin_10think = np.array([])
treasure_nothink = np.array([])
treasure_10think = np.array([])
cooking_nothink = np.array([])
cooking_10think = np.array([])

# NOTE(review): the loop below still reads the "./Testing 3" pickles (ephemeral vs.
# non-ephemeral runs), not the "./Testing 4" files written by the game-running
# loop of this notebook - confirm which data set is intended.
# NOTE(review): `max_scores` is not defined in the cells shown here; it has to hold
# one maximum score per seed (same order as `seeds`) before this cell is run.

# One entry per value in n_think_indices; each CI is stored as a (high, low) tuple.
avg_final_scores = []
avg_final_scores_ci = []
avg_final_scores_blind = []
avg_final_scores_blind_ci = []

for n in n_think_indices:
    for read_bool in (True, False):
        final_scores = []
        for seed_idx, seed in enumerate(seeds):
            # Only load pickles produced by our own runs: unpickling untrusted files can execute code.
            with open(f'./Testing 3/{n}think_{"blind_" if not read_bool else ""}_seed{seed}.pickle', 'rb') as f:
                results = pickle.load(f)
            print("Data loaded.")
            final_scores.append(results[0][-1][1] / max_scores[seed_idx]) # run 1, last step, score normalized

        avg_final_score = np.mean(final_scores)
        # 90% bootstrap confidence interval of the mean normalized final score.
        bootstrap_results = bootstrap(data=(final_scores,),
                                      statistic=np.mean,
                                      method="basic",
                                      n_resamples=1000,
                                      confidence_level=0.9)
        ci = (bootstrap_results.confidence_interval.high, bootstrap_results.confidence_interval.low)
        if read_bool:
            avg_final_scores.append(avg_final_score)
            avg_final_scores_ci.append(ci)
        else:
            avg_final_scores_blind.append(avg_final_score)
            avg_final_scores_blind_ci.append(ci)

# Visualization

In [None]:
# Figure size passed to plt.subplots(figsize=...) in the plotting cell.
figsize=(10,7)

In [None]:
# x-axis: the evaluated n-think settings. Use `x` consistently below; `n` is only
# the leftover loop variable of earlier cells (a single int), so it can neither be
# plotted against the score lists nor indexed.
x = n_think_indices # n-think

fig, ax = plt.subplots(figsize=figsize)

# Mean normalized final score per n-think setting.
ax.plot(x, avg_final_scores, label="non-ephemeral self-evaluation", marker=".", linestyle="-", color="red")
ax.plot(x, avg_final_scores_blind, label="ephemeral self-evaluation", marker=".", linestyle="-", color="blue")

# Shaded bootstrap confidence bands, clipped to the valid normalized score range [0, 1].
ax.fill_between(x,
                np.clip([ci[0] for ci in avg_final_scores_ci], 0, 1),
                np.clip([ci[1] for ci in avg_final_scores_ci], 0, 1),
                alpha=0.1, color="red")
ax.fill_between(x,
                np.clip([ci[0] for ci in avg_final_scores_blind_ci], 0, 1),
                np.clip([ci[1] for ci in avg_final_scores_blind_ci], 0, 1),
                alpha=0.1, color="blue")


ax.set_title("Average final score of an ephemeral/non-ephemeral $n$-think model with $n=0,...,10$")
ax.set_xlabel('n (number non-self-evaluating turns for every self-evaluating turn)')
ax.set_ylabel('average final score, normalized')

# One vertical grid line per integer n between the smallest and largest setting.
gridlines = np.arange(x[0], x[-1]+1, 1)
ax.set_xticks(gridlines)
ax.grid(axis="x", alpha=0.2)

# ax.set_ylim(0.2, 1.1)

ax.legend()
plt.show()