# Test 4 - Comparison vs FM paper: CoinCollector, CookingGame, TreasureHunter

In [22]:
# Archive everything in the current directory into Testing.tar.
# NOTE: the `z` flag gzip-compresses the archive even though the name ends in plain `.tar`.
! tar czf Testing.tar *

In [None]:
# Print the NVIDIA driver's GPU status report for this machine.
! nvidia-smi

In [None]:
import textworld
import textworld.gym

In [None]:
import pickle
import random

In [None]:
import sys
sys.path.append("../Self-evaluation")

from self_evaluation import play, LLMAgentSelfEvaluate

ModuleNotFoundError: No module named 'textworld'

## Game generation

Idea:
- two models used: no-think and 10-think
- different difficulty levels: 6 for coin collector, ~~5 for common sense,~~ 5 for cooking, 8 for treasure hunter
- 5 episodes per case (one episode for each of 5 seeds; vs 40 in the FM paper)
- total episodes: ~~240~~ (190 without commonsense)

In [None]:
# Number of difficulty levels to generate/evaluate per game; keys are the `tw-make` challenge
# names without the "tw-" prefix (see the generation cell, which builds f"tw-{game}").
levels = {"coin_collector": 6, "cooking": 5, "treasure_hunter": 8}
# Seeds 1..5: one generated game (and one evaluation run) per seed for every game/level pair.
seeds = range(1,6)

In [None]:
import subprocess

# Generate one TextWorld game file per (game, level, seed) combination by calling the
# `tw-make` command-line tool. Output paths follow the layout the game-running cell reads:
# games/{game}/level{level}-seed{seed}.z8
for game, n_levels in levels.items():
    for level in range(1, n_levels + 1):
        for seed in seeds:
            subprocess.run(
                [
                    "tw-make", f"tw-{game}",
                    # Every argv entry must be a string; passing the ints directly raises TypeError.
                    "--level", str(level),
                    "--seed", str(seed),
                    "--output", f"games/{game}/level{level}-seed{seed}.z8",
                ],
                check=True,  # stop immediately if a game fails to generate instead of continuing silently
            )

Or, manually from the shell (seed 1 only):

In [None]:
!tw-make tw-coin_collector --seed 1 --level 1 --output games/coin/seed1-level1.z8
!tw-make tw-coin_collector --seed 1 --level 2 --output games/coin/seed1-level2.z8
!tw-make tw-coin_collector --seed 1 --level 3 --output games/coin/seed1-level3.z8
!tw-make tw-coin_collector --seed 1 --level 4 --output games/coin/seed1-level4.z8
!tw-make tw-coin_collector --seed 1 --level 5 --output games/coin/seed1-level5.z8
!tw-make tw-coin_collector --seed 1 --level 6 --output games/coin/seed1-level6.z8

In [None]:
!tw-make tw-cooking --seed 1 --level 1 --output games/cooking/seed1-level1.z8
!tw-make tw-cooking --seed 1 --level 2 --output games/cooking/seed1-level2.z8
!tw-make tw-cooking --seed 1 --level 3 --output games/cooking/seed1-level3.z8
!tw-make tw-cooking --seed 1 --level 4 --output games/cooking/seed1-level4.z8
!tw-make tw-cooking --seed 1 --level 5 --output games/cooking/seed1-level5.z8

In [None]:
!tw-make tw-treasure_hunter --seed 1 --level 1 --output games/treasure/seed1-level1.z8
!tw-make tw-treasure_hunter --seed 1 --level 2 --output games/treasure/seed1-level2.z8
!tw-make tw-treasure_hunter --seed 1 --level 3 --output games/treasure/seed1-level3.z8
!tw-make tw-treasure_hunter --seed 1 --level 4 --output games/treasure/seed1-level4.z8
!tw-make tw-treasure_hunter --seed 1 --level 5 --output games/treasure/seed1-level5.z8
!tw-make tw-treasure_hunter --seed 1 --level 6 --output games/treasure/seed1-level6.z8
!tw-make tw-treasure_hunter --seed 1 --level 7 --output games/treasure/seed1-level7.z8
!tw-make tw-treasure_hunter --seed 1 --level 8 --output games/treasure/seed1-level8.z8

## Game running

In [None]:
# Values passed as `selfeval_turns` to LLMAgentSelfEvaluate: 0 ("no-think") and 10 ("10-think").
n_think_indices = [0, 10]
# Episodes played per game file; the running cell keeps only results[0], so it relies on this being 1.
n_episodes = 1
# Forwarded to play() as `max_steps`.
max_steps = 100

In [None]:
import os

# Play every (think budget, game, level, seed) combination and pickle the results.
# One file is written per (think budget, game, level); it holds a list with one entry per seed.
results_dir = "./Testing 4"
# open(..., "wb") does not create missing directories; make sure the target exists up front
# so a long batch of runs is not lost at the moment of saving.
os.makedirs(results_dir, exist_ok=True)

for n in n_think_indices:
    for game, n_levels in levels.items():
        for level in range(1, n_levels + 1):
            results_all_seeds = []
            for seed in seeds:
                results = play(LLMAgentSelfEvaluate(selfeval_turns=n, verbose=False),
                               f"games/{game}/level{level}-seed{seed}.z8",
                               max_steps=max_steps, n_episodes=n_episodes)
                # Only the first episode is kept (n_episodes is expected to be 1).
                results_all_seeds.append(results[0])
            # The `with` block closes the file; no explicit close() is needed.
            with open(f"{results_dir}/{n}think_{game}_level{level}", "wb") as f:
                pickle.dump(results_all_seeds, f)  # already grouped by seed
            print("Data pickled.")

# Data aggregation for visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import bootstrap

In [None]:
# Per-level aggregates, filled in level order by the loop below.
# `*_nothink` / `*_10think` hold the mean final score per level; the matching `*_ci` lists hold
# one (high, low) bootstrap confidence-interval tuple per level.
# Plain lists are used because the loop appends to them: `np.array()` without an argument
# raises TypeError, and ndarrays have no `.append` method.
coin_nothink = []
coin_nothink_ci = []
coin_10think = []
coin_10think_ci = []
treasure_nothink = []
treasure_nothink_ci = []
treasure_10think = []
treasure_10think_ci = []
cooking_nothink = []
cooking_nothink_ci = []
cooking_10think = []
cooking_10think_ci = []

# game -> think budget -> (mean scores per level, CI tuples per level)
final_arrays = {"coin_collector": {0: (coin_nothink, coin_nothink_ci), 10: (coin_10think, coin_10think_ci)},
                "treasure_hunter": {0: (treasure_nothink, treasure_nothink_ci), 10: (treasure_10think, treasure_10think_ci)},
                "cooking": {0: (cooking_nothink, cooking_nothink_ci), 10: (cooking_10think, cooking_10think_ci)}}

for n in n_think_indices:
    for game, n_levels in levels.items():
        for level in range(1, n_levels + 1):
            # pickle.load takes only the file object and returns the stored object.
            with open(f"./Testing 4/{n}think_{game}_level{level}", "rb") as f:
                results_all_seeds = pickle.load(f)
            final_scores = [run[-1][1] for run in results_all_seeds]  # last step, score
            avg_final_score = np.mean(final_scores)  # presumably already normalized to 1 -- TODO confirm
            bootstrap_results = bootstrap(data=(final_scores,),
                                          statistic=np.mean,
                                          method="basic",
                                          n_resamples=1000,
                                          confidence_level=0.9)
            means, intervals = final_arrays[game][n]
            means.append(avg_final_score)
            # Stored as (high, low); the plotting cell reads the two bounds by index.
            intervals.append((bootstrap_results.confidence_interval.high,
                              bootstrap_results.confidence_interval.low))


# Visualization

In [None]:
# Size passed as `figsize` to plt.subplots for the comparison figure (matplotlib interprets it as inches).
figsize=(11,7)

In [None]:
# One subplot per game: mean final score per level for the 0-think and 10-think agents,
# with the bootstrap confidence interval drawn as a shaded band.
fig, axes = plt.subplots(1,3, figsize=figsize)
prettified_names = {"coin_collector": "Coin Collector", "treasure_hunter": "Treasure Hunter", "cooking": "Cooking Game"}

for ax, game in zip(axes, final_arrays):
    # x axis is the difficulty level; the earlier code plotted against `n`, a stale loop
    # variable left over from the aggregation cell.
    x = range(1, levels[game]+1)
    y_0, y_0_ci = final_arrays[game][0]
    y_10, y_10_ci = final_arrays[game][10]

    ax.plot(x, y_0, label="0-think", marker=".", linestyle="-", color="red")
    ax.plot(x, y_10, label="10-think", marker=".", linestyle="-", color="blue")

    # Each CI entry is a pair of bounds; clip both to the [0, 1] range before shading.
    ax.fill_between(x,
                    np.clip([ci[0] for ci in y_0_ci], 0, 1),
                    np.clip([ci[1] for ci in y_0_ci], 0, 1),
                    alpha=0.1, color="red")
    ax.fill_between(x,
                    np.clip([ci[0] for ci in y_10_ci], 0, 1),
                    np.clip([ci[1] for ci in y_10_ci], 0, 1),
                    alpha=0.1, color="blue")

    # Report the real number of seeds instead of a hard-coded count.
    ax.set_title(f"Average final score over {len(seeds)} seeds in {prettified_names[game]}")
    ax.set_xlabel('level')
    ax.set_ylabel('average final score')
    ax.legend()

plt.show()