# Test 4 - Comparison vs FM paper: CoinCollector, CookingGame, TreasureHunter

In [22]:
# Archive everything matched by `*` in the current directory into Testing.tar.
# NOTE(review): the `z` flag gzip-compresses the archive although the name ends
# in plain ".tar", and `*` also matches a Testing.tar left over from a previous
# run - confirm both are intended.
! tar czf Testing.tar *

In [None]:
# Run nvidia-smi to check GPU availability before starting the experiments.
! nvidia-smi

In [None]:
import textworld
import textworld.gym

In [None]:
import pickle
import random

In [None]:
import sys
sys.path.append("../Self-evaluation")

from self_evaluation import play, LLMAgentSelfEvaluate

ModuleNotFoundError: No module named 'textworld'

## Game generation

Idea:
- two models are compared: no-think (n = 0) and 10-think (n = 10)
- number of difficulty levels per game: 6 for CoinCollector, 5 for CommonSense, 5 for CookingGame, 8 for TreasureHunter
- 5 episodes per case (vs. 40 in the FM paper)
- total episodes: 2 models × 24 levels × 5 episodes = 240 (190 without CommonSense, i.e. 2 × 19 × 5)

In [None]:
# Generate the tw-coin_collector games with tw-make: one game per difficulty
# level 1-6, all with seed 1, written to games/coin/seed1-level<k>.z8.
!tw-make tw-coin_collector --seed 1 --level 1 --output games/coin/seed1-level1.z8
!tw-make tw-coin_collector --seed 1 --level 2 --output games/coin/seed1-level2.z8
!tw-make tw-coin_collector --seed 1 --level 3 --output games/coin/seed1-level3.z8
!tw-make tw-coin_collector --seed 1 --level 4 --output games/coin/seed1-level4.z8
!tw-make tw-coin_collector --seed 1 --level 5 --output games/coin/seed1-level5.z8
!tw-make tw-coin_collector --seed 1 --level 6 --output games/coin/seed1-level6.z8

In [None]:
# Generate the tw-cooking games with tw-make: one game per difficulty
# level 1-5, all with seed 1, written to games/cooking/seed1-level<k>.z8.
!tw-make tw-cooking --seed 1 --level 1 --output games/cooking/seed1-level1.z8
!tw-make tw-cooking --seed 1 --level 2 --output games/cooking/seed1-level2.z8
!tw-make tw-cooking --seed 1 --level 3 --output games/cooking/seed1-level3.z8
!tw-make tw-cooking --seed 1 --level 4 --output games/cooking/seed1-level4.z8
!tw-make tw-cooking --seed 1 --level 5 --output games/cooking/seed1-level5.z8

In [None]:
# Generate the tw-treasure_hunter games with tw-make: one game per difficulty
# level 1-8, all with seed 1, written to games/treasure/seed1-level<k>.z8.
!tw-make tw-treasure_hunter --seed 1 --level 1 --output games/treasure/seed1-level1.z8
!tw-make tw-treasure_hunter --seed 1 --level 2 --output games/treasure/seed1-level2.z8
!tw-make tw-treasure_hunter --seed 1 --level 3 --output games/treasure/seed1-level3.z8
!tw-make tw-treasure_hunter --seed 1 --level 4 --output games/treasure/seed1-level4.z8
!tw-make tw-treasure_hunter --seed 1 --level 5 --output games/treasure/seed1-level5.z8
!tw-make tw-treasure_hunter --seed 1 --level 6 --output games/treasure/seed1-level6.z8
!tw-make tw-treasure_hunter --seed 1 --level 7 --output games/treasure/seed1-level7.z8
!tw-make tw-treasure_hunter --seed 1 --level 8 --output games/treasure/seed1-level8.z8

## Game running

In [3]:
# Experiment grid shared by the cells below.
# n_think_indices: values passed as `selfeval_turns` to the agent (0 and 10).
# levels: number of difficulty levels generated for each game family.
# n_episodes / max_steps: per-game budget handed to `play`.
n_think_indices = [0, 10]
levels = dict(coin=6, cooking=5, treasure=8)
n_episodes, max_steps = 5, 100


In [13]:
# Show the (game name, number of levels) pairs of the experiment grid.
zipped_levels = list(levels.items())
print(zipped_levels)

[('coin', 6), ('cooking', 5), ('treasure', 8)]


In [None]:
# Play every generated game with each n-think agent and pickle the results,
# one file per (agent, game, level) combination.
for n in n_think_indices:
    for game_name, n_levels in levels.items():
        # `levels` stores the *number* of difficulty levels per game, so walk
        # 1..n_levels to match the seed1-level<k> files created by tw-make.
        for level in range(1, n_levels + 1):
            agent = LLMAgentSelfEvaluate(selfeval_turns=n, verbose=False)
            # NOTE(review): the game files were written with a ".z8" suffix;
            # confirm that `play` resolves this suffix-less path.
            results = play(agent,
                           f"games/{game_name}/seed1-level{level}",
                           max_steps=max_steps, n_episodes=n_episodes)
            # The `with` block closes the file; no explicit close() is needed.
            with open(f'./Testing 4/{n}think_{game_name}_level{level}.pickle', 'wb') as f:
                pickle.dump(results, f)
            print("Data pickled.")

## Data aggregation for visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import bootstrap

In [None]:
# Aggregate the normalized final scores per n-think setting and attach a
# bootstrap confidence interval to each mean.
#
# NOTE(review): the loop below still reads the "./Testing 3" pickles and relies
# on `seeds` and `max_scores`, neither of which is defined in this notebook. It
# looks carried over from the previous test and still has to be adapted to the
# "./Testing 4/{n}think_{game}_level{level}.pickle" files written above.

# Per-game result holders (not filled yet). np.array() needs an argument, so
# start from explicit empty arrays.
coin_nothink = np.array([])
coin_10think = np.array([])
treasure_nothink = np.array([])
treasure_10think = np.array([])
cooking_nothink = np.array([])
cooking_10think = np.array([])

# Accumulators consumed by the visualization cell; (re)initialized here so the
# cell can be re-run without appending duplicates.
avg_final_scores = []
avg_final_scores_ci = []
avg_final_scores_blind = []
avg_final_scores_blind_ci = []

for n in n_think_indices:
    for read_bool in (True, False):
        final_scores = []
        for seed_idx, seed in enumerate(seeds):
            with open(f'./Testing 3/{n}think_{"blind_" if not read_bool else ""}_seed{seed}.pickle', 'rb') as f:
                # pickle.load takes only the file object and returns the data.
                results = pickle.load(f)
            print("Data loaded.")
            # run 1, last step, score normalized by this seed's maximum score
            final_scores.append(results[0][-1][1] / max_scores[seed_idx])

        avg_final_score = np.mean(final_scores)
        bootstrap_results = bootstrap(data=(final_scores,),
                                      statistic=np.mean,
                                      method="basic",
                                      n_resamples=1000,
                                      confidence_level=0.9)
        # Confidence intervals are stored as (high, low) tuples.
        ci = (bootstrap_results.confidence_interval.high,
              bootstrap_results.confidence_interval.low)
        if read_bool:
            avg_final_scores.append(avg_final_score)
            avg_final_scores_ci.append(ci)
        else:
            avg_final_scores_blind.append(avg_final_score)
            avg_final_scores_blind_ci.append(ci)

## Visualization

In [None]:
# Figure size passed to plt.subplots by the plotting cell.
figsize = (10, 7)

In [None]:
# Plot the average normalized final score against n for both agent variants,
# with the bootstrap confidence intervals drawn as shaded bands.
# NOTE(review): title, labels and legend look carried over from the previous
# test (ephemeral vs. non-ephemeral) - confirm they still describe this data.
x = n_think_indices # n-think

fig, ax = plt.subplots(figsize=figsize)

# Plot against the list `x`; `n` is only the scalar left over from earlier loops.
ax.plot(x, avg_final_scores, label="non-ephemeral self-evaluation", marker=".", linestyle="-", color="red")
ax.plot(x, avg_final_scores_blind, label="ephemeral self-evaluation", marker=".", linestyle="-", color="blue")

# Each CI entry is a (high, low) tuple; clip both bounds to the [0, 1] score range.
ax.fill_between(x,
                np.clip([i[0] for i in avg_final_scores_ci], 0, 1),
                np.clip([i[1] for i in avg_final_scores_ci], 0, 1),
                alpha=0.1, color="red")
ax.fill_between(x,
                np.clip([i[0] for i in avg_final_scores_blind_ci], 0, 1),
                np.clip([i[1] for i in avg_final_scores_blind_ci], 0, 1),
                alpha=0.1, color="blue")


ax.set_title("Average final score of an ephemeral/non-ephemeral $n$-think model with $n=0,...,10$")
ax.set_xlabel('n (number non-self-evaluating turns for every self-evaluating turn)')
ax.set_ylabel('average final score, normalized')

# One tick per integer between the smallest and largest n.
gridlines = np.arange(x[0], x[-1]+1, 1)
ax.set_xticks(gridlines)
ax.grid(axis="x", alpha=0.2)

# ax.set_ylim(0.2, 1.1)

ax.legend()
plt.show()