In [1]:
import gym
from gym_minigrid.wrappers import ImgObsWrapper
from mini_behavior.utils.wrappers import MiniBHFullyObsWrapper
from mini_behavior.register import register
from algorithms.APT_PPO import APT_PPO
import mini_behavior
from stable_baselines3 import PPO
import numpy as np
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
import torch.nn as nn
import torch
from env_wrapper import CustomObservationWrapper
import argparse

class MinigridFeaturesExtractor(BaseFeaturesExtractor):
    """CNN feature extractor: three 2x2 convolutions plus a linear projection.

    The convolutional stack maps the observation's channels to 32, 32 and then
    64 feature maps (ReLU after each) and flattens the result. A final
    ``Linear`` + ``ReLU`` layer projects it to ``features_dim`` values.

    Args:
        observation_space: Space whose ``shape[0]`` is taken as the number of
            input channels (presumably a channel-first image space -- confirm
            against whatever wrapper feeds this extractor).
        features_dim: Length of the feature vector produced by ``forward``.
        normalized_image: Accepted in the signature but not used by this class.
    """

    def __init__(self, observation_space: gym.Space, features_dim: int = 512, normalized_image: bool = False) -> None:
        super().__init__(observation_space, features_dim)

        in_channels = observation_space.shape[0]
        kernel = (2, 2)
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # The flattened width depends on the observation size, so measure it by
        # pushing one sampled observation (with a leading batch axis) through
        # the convolutional stack.
        with torch.no_grad():
            probe = torch.as_tensor(observation_space.sample()[None]).float()
            flat_width = self.cnn(probe).shape[1]

        projection = nn.Linear(flat_width, features_dim)
        self.linear = nn.Sequential(projection, nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Run the observations through the CNN and then the linear head."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

# --- Environment -----------------------------------------------------------
# Task name; it is embedded in the environment id and in the entry-point
# class name (`mini_behavior.envs:{TASK}Env`).
TASK = 'MultiToy'
# Passed to the environment as `room_size` and embedded in the environment id.
ROOM_SIZE = 10
# Passed to the environment as `max_steps`.
MAX_STEPS = 1000
# Not referenced by any cell visible in this notebook.
PARTIAL_OBS = True
DENSE_REWARD = False

# --- Training --------------------------------------------------------------
# Only referenced from a disabled snippet in the visible cells.
POLICY_TYPE = 'CnnPolicy'
# Number of parallel environments; also forwarded to APT_PPO.
NUM_ENVS = 8
# Forwarded to APT_PPO as `num_steps`.
NUM_STEPS = 125
# Forwarded to APT_PPO as `total_timesteps`; note this literal is a float.
TOTAL_TIMESTEPS = 1e5

# Environment id; also used as the checkpoint file name when saving.
env_name = f"MiniGrid-{TASK}-{ROOM_SIZE}x{ROOM_SIZE}-N2-v0"

In [2]:
# Keyword arguments handed to `model.save` together with the trained model.
env_kwargs = {
    "room_size": ROOM_SIZE,
    "max_steps": MAX_STEPS,
    "exploration_type": "ATP",
}
def get_single_env() -> gym.Env:
    """Register (when needed) and build one wrapped environment for ``TASK``.

    The environment id is derived from the module-level ``TASK`` and
    ``ROOM_SIZE`` constants, and ``ROOM_SIZE`` / ``MAX_STEPS`` are supplied as
    the environment's keyword arguments. Registration is skipped when the id is
    already present in gym's registry, so the function can be called more than
    once in the same process (for example when a cell is re-run).

    Returns:
        The environment created by ``gym.make``, wrapped in
        ``CustomObservationWrapper``.
    """
    env_id = f"MiniGrid-{TASK}-{ROOM_SIZE}x{ROOM_SIZE}-N2-v0"

    # NOTE(review): the module-level `env_kwargs` additionally carries
    # "exploration_type"; it is not passed to the environment here -- confirm
    # that this difference is intentional.
    kwargs = {"room_size": ROOM_SIZE, "max_steps": MAX_STEPS}

    # gym's registry is a plain dict in newer releases and an object exposing
    # `env_specs` in older ones; both support a membership test on the id.
    registry = gym.envs.registry
    registered_ids = getattr(registry, "env_specs", registry)
    if env_id not in registered_ids:
        register(
            id=env_id,
            entry_point=f'mini_behavior.envs:{TASK}Env',
            kwargs=kwargs
        )

    # Pass the kwargs explicitly so the current constant values are used even
    # when the id was registered by an earlier call.
    env = gym.make(env_id, **kwargs)
    return CustomObservationWrapper(env)


def init_env(num_envs: int):
    """Create a vectorized environment holding ``num_envs`` task environments.

    Args:
        num_envs: Number of environment copies to create; must be at least 1.

    Returns:
        A ``DummyVecEnv`` when ``num_envs == 1``, otherwise a ``SubprocVecEnv``.
        Each copy is produced by ``get_single_env``.

    Raises:
        ValueError: If ``num_envs`` is smaller than 1.
    """
    # Fail early with a clear message instead of handing an empty factory list
    # to the vectorized-env constructor.
    if num_envs < 1:
        raise ValueError(f"num_envs must be >= 1, got {num_envs}")

    # The factory is already a zero-argument callable; no lambda wrapper needed.
    env_fns = [get_single_env for _ in range(num_envs)]
    if num_envs == 1:
        return DummyVecEnv(env_fns)
    return SubprocVecEnv(env_fns)
    




In [3]:
# Build a single (non-vectorized) environment for interactive inspection.
# NOTE: the training cell further down rebinds `env` to a vectorized environment.
env = get_single_env()


=== Observation Space ===
Shape: (59,)
Type: float32


In [14]:
# NOTE(review): in the recorded run this call raised
# "ValueError: setting an array element with a sequence ... detected shape was (68,)",
# so this cell does not survive Restart & Run All. `gen_obs` is not defined in
# this notebook -- investigate in the environment / observation wrapper.
print(env.gen_obs())

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (68,) + inhomogeneous part.

In [None]:
# Vectorized environment used for training; this rebinds `env`, which an
# earlier cell set to a single environment.
env = init_env(NUM_ENVS)

print('begin training')

# Policy training
model = APT_PPO(
    env,
    num_envs=NUM_ENVS,
    total_timesteps=TOTAL_TIMESTEPS,
    num_steps=NUM_STEPS,
    save_freq=10,
)
model.train()

In [4]:
import os

# Directory that receives the saved model for this trial (relative to the
# current working directory).
save_dir = "models/ATP_PPO_Trial"

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

# The file is named after the environment id; `env_kwargs` is handed to
# `model.save` alongside the path.
model.save(f"{save_dir}/{env_name}", env_kwargs= env_kwargs)

In [5]:
# NOTE(review): hard-coded absolute, user-specific path; it also names the
# PlayAlligator checkpoint while TASK in the config cell is 'MultiToy' --
# confirm this is the file that is meant to be inspected.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
checkpoint = torch.load("/Users/kevinhan/mini_behavior/models/ATP_PPO_Trial/MiniGrid-PlayAlligator-10x10-N2-v0")

In [6]:
# List the top-level entries stored in the loaded checkpoint.
print(checkpoint.keys())

dict_keys(['env_kwargs', 'model_saves', 'final_model_state_dict', 'final_optimizer_state_dict', 'learning_rate', 'total_timesteps', 'num_envs', 'num_steps', 'curiosity_rewards', 'actions', 'observations'])


In [7]:
# Show the environment keyword arguments stored in the checkpoint.
print(checkpoint['env_kwargs'])


{'room_size': 10, 'max_steps': 1000, 'exploration_type': 'ATP'}


In [9]:
# Show the first element of the first entry of the stored observations.
# NOTE(review): what the two leading indices denote is not visible here
# (presumably save point and step) -- confirm against APT_PPO's save logic.
print(checkpoint['observations'][0][0])


tensor([[4., 7., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [1., 7., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [6., 8., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [3., 8., 3., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [5., 5., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [5., 7., 3., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0.],
        [4., 8., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
        [5., 7., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.]])


In [8]:
# NOTE(review): `rewards` is not defined in any cell visible here, and the
# recorded run raised NameError. Possibly checkpoint['curiosity_rewards'] was
# intended -- confirm, or remove this cell.
print(rewards)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8b4849dd60>>
Traceback (most recent call last):
  File "/Users/kevinhan/opt/anaconda3/envs/babyRL/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


NameError: name 'rewards' is not defined

In [41]:
# NOTE(review): neither `supports` nor `armies` is defined in any cell visible
# here; this looks like a leftover from unrelated work and will fail on a fresh
# kernel -- confirm and remove or move to its own notebook.
support_cache = supports(armies)

In [42]:
# Depends on `support_cache`, which is computed from names not defined in the
# cells visible here -- see whether this cell still belongs in the notebook.
print(support_cache)

{'B': 1, 'C': 1, 'D': 1}


In [13]:
from numpy import linalg as LA

# Sanity check: the Euclidean distance between (0, 0) and (4, 4) should match
# sqrt(32); both values are printed for comparison.
p1 = np.array([0, 0])
p2 = np.array([4, 4])
offset = p1 - p2
dp = LA.norm(offset)
print(np.sqrt(32))
print(dp)

5.656854249492381
5.656854249492381
