# Testing envs

In [16]:
import os

# Make sure the notebook runs from the project's `app` directory
# (presumably so the `src` package imported by later cells resolves — confirm).
print(os.getcwd())
# Compare the last path component exactly: a plain suffix test would also
# accept unrelated directories whose name merely ends in "app" (e.g. "webapp").
if os.path.basename(os.getcwd()) != "app":
    # Relative hop: assumes the kernel was started in a sibling of `app`.
    os.chdir("../app")
    print(os.getcwd())

import pandas as pd
# Raise pandas' display limits to 500 rows / 500 columns for this session.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2
%matplotlib inline

/home/mique/Desktop/Code/rl-module/app
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2048

In [24]:
from src.custom_envs import Env2048
from maikol_utils.print_utils import print_separator

# Build the environment, reset it with the argument 42 (presumably the seed —
# confirm against Env2048.reset) and print every array found in the initial
# observation, transposed, with a separator line before each one.
env = Env2048(new_boxes_per_step=1)
state, info = env.reset(42)
for board in state:
    print_separator()
    print(board.T)


________________________________________________________________
[[2 0 0 0]
 [4 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
________________________________________________________________
[[2 0 0 0]
 [4 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
________________________________________________________________
[[2 0 0 0]
 [4 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
________________________________________________________________
[[0 0 0 2]
 [0 0 0 4]
 [0 0 0 0]
 [0 0 0 0]]
________________________________________________________________
[[0 0 0 0]
 [0 0 0 0]
 [2 0 0 0]
 [4 0 0 0]]


In [27]:
# Advance the environment with action 2 and report the step results.
# NOTE(review): which move action index 2 maps to is defined by Env2048.
state, reward, terminated, truncated, info = env.step(2)
print(f"reward = {reward!r}")
print(f"terminated = {terminated!r}")
print(f"truncated = {truncated!r}")
# Show each array of the new observation, transposed, one separator per array.
for board in state:
    print_separator()
    print(board.T)


reward = np.float64(0.00244140625)
terminated = False
truncated = False
________________________________________________________________
[[0 0 0 2]
 [0 0 2 4]
 [0 2 0 0]
 [0 0 0 2]]
________________________________________________________________
[[2 0 0 0]
 [2 4 0 0]
 [2 0 0 0]
 [2 0 0 0]]
________________________________________________________________
[[0 2 2 2]
 [0 0 0 4]
 [0 0 0 2]
 [0 0 0 0]]
________________________________________________________________
[[0 0 0 2]
 [0 0 2 4]
 [0 0 0 2]
 [0 0 0 2]]
________________________________________________________________
[[0 0 0 0]
 [0 0 0 2]
 [0 0 0 4]
 [0 2 2 2]]


In [62]:
# Advance the environment with action 3 and show the transposed observation
# as the cell's rich output (it must stay the last expression of the cell).
# NOTE(review): which move action index 3 maps to is defined by Env2048.
state, reward, terminated, truncated, info = env.step(3)
state.T

array([[16,  0,  0,  2],
       [ 8,  0,  0,  0],
       [ 2,  2,  2,  0],
       [ 4,  4,  8,  4]], dtype=int16)

### Playing with a trained agent

In [None]:
from src.functions import evaluate_agent
from src.utils import load_agent, set_seed

from src.config import Configuration

from maikol_utils.file_utils import clear_directories, make_dirs
from maikol_utils.print_utils import print_separator

import torch
import numpy as np
from tqdm import tqdm
from time import time

from maikol_utils.print_utils import print_separator
from maikol_utils.time_tracker import print_time

from src.models import get_envs, handle_states


# Evaluation settings shared by the cells below: environment id and experiment
# name are both "Env2048", and video recording is switched on.
CONFIG = Configuration(env_id="Env2048", exp_name="Env2048", record_video=True)

In [12]:
# Prepare everything the evaluation cells need: output directory, environments,
# the loaded agent, and the bookkeeping variables.
print_separator("CONFIGURATION", sep_type="LONG")
make_dirs(CONFIG.videos_path)

envs = get_envs(CONFIG, evaluating=True)
agent = load_agent(CONFIG)

print(" - Creating vars...")
# One reward array is appended here per executed evaluation step.
episode_rewards = []
# Only the first element of the reset result (the observations) is kept.
# NOTE(review): reset is called without a seed, so the start boards may differ
# between runs — confirm whether get_envs seeds the environments.
initial_observations = envs.reset()[0]
states = handle_states(CONFIG, initial_observations)
dones = torch.zeros(CONFIG.n_envs, dtype=torch.bool)
start_time = time()

states

________________________________________________________________________________________________________________________________
                                                         CONFIGURATION                                                          

 - Loading agent at ../models/Env2048/Env2048-v0.pt
 - Creating vars...


tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 2.],
         [0., 0., 0., 0.],
         [0., 0., 0., 2.]],

        [[0., 4., 0., 0.],
         [0., 0., 0., 0.],
         [0., 4., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 2., 0., 0.],
         [0., 0., 0., 0.],
         [0., 4., 0., 0.]],

        [[0., 0., 4., 0.],
         [4., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]], device='cuda:0')

In [113]:
# ================================================================
#                       EVALUATING LOOP
# ================================================================
# This cell performs ONE evaluation step; there is no loop here, so each
# execution of the cell advances `envs` by a single step and appends one more
# entry to `episode_rewards` (re-running it is therefore not idempotent).
print_separator("EVALUATING", sep_type="SUPER")
# Only reports the configured value; it does not drive any iteration here.
print(f" - Evaluating for {CONFIG.n_eval_steps}.")

# Inference only: no gradients are needed to pick actions.
with torch.no_grad():
    # get_action_value returns four values; only the first (the actions) is used.
    actions, _, _, _ = agent.get_action_value(states)

print(f"{actions = }")

# The environments are stepped with the actions moved to CPU as a NumPy array.
next_states, rewards, terms, truncs, infos = envs.step(actions.cpu().numpy())
states = handle_states(CONFIG, next_states)
# Element-wise OR of the terminated/truncated flags.
# NOTE(review): this rebinds `dones`, which the setup cell created as a torch
# tensor; the printed value below is a NumPy array.
dones = terms | truncs

episode_rewards.append(np.array(rewards))
print(f"{dones = }")

if all(dones): # every env reported done on this step; this only prints a notice (nothing is stopped)
    print("DONE!")

# Raw (unprocessed) observations returned by this step, shown as cell output.
next_states

                                                           EVALUATING                                                           

 - Evaluating for 20000.
actions = tensor([1, 1, 1, 1], device='cuda:0')
dones = array([False, False, False, False])


array([[[ 8,  2,  8,  0],
        [ 2, 32,  2,  4],
        [16, 32,  0,  0],
        [ 4,  0,  0,  0]],

       [[ 4,  2,  0,  0],
        [ 4,  8,  0,  0],
        [ 0,  0,  2,  0],
        [ 0,  0,  0,  0]],

       [[ 2, 16,  2, 16],
        [16,  4, 16, 32],
        [16,  8,  2,  0],
        [ 4,  4,  0,  0]],

       [[ 2,  8,  4,  2],
        [ 8,  4, 32,  4],
        [ 2,  8,  2,  4],
        [ 4,  2,  0,  0]]], dtype=int16)

In [None]:
# ================================================================
#                            DONE
# ================================================================
# Close the environments and summarise the rewards recorded so far.
print_separator("RESUME", sep_type="LONG")
envs.close()

if episode_rewards:
    # `episode_rewards` is a list with one reward array per recorded step.
    # Stacking puts the steps on axis 0; summing over that axis totals the
    # reward over all recorded steps for each remaining index (presumably one
    # per env — confirm). Episode boundaries are not taken into account here.
    rewards_arr = np.stack(episode_rewards, axis=0)
    per_env_returns = rewards_arr.sum(axis=0)
    mean_reward = float(np.mean(per_env_returns))
    std_reward = float(np.std(per_env_returns))
else:
    # Nothing was recorded: report NaN rather than failing on an empty stack.
    mean_reward = std_reward = float("nan")

print(f" - Mean rewards: {mean_reward:.4f}+-{std_reward:.4f}")
print_time(time() - start_time, prefix=" - ")
