In [1]:
import gym
from gym_minigrid.wrappers import ImgObsWrapper
from mini_behavior.utils.wrappers import MiniBHFullyObsWrapper
from mini_behavior.register import register
from algorithms.APT_PPO import APT_PPO
import mini_behavior
from stable_baselines3 import PPO
import numpy as np
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
import torch.nn as nn
import torch
import argparse

class MinigridFeaturesExtractor(BaseFeaturesExtractor):
    """Small convolutional feature extractor for image-like observations.

    The network stacks three 2x2 convolutions (32, 32 and 64 output channels),
    each followed by a ReLU, flattens the result, and projects it to
    ``features_dim`` through a linear layer followed by a ReLU.
    """

    def __init__(self, observation_space: gym.Space, features_dim: int = 512, normalized_image: bool = False) -> None:
        """Build the convolutional stack and the linear projection head.

        Args:
            observation_space: Space whose ``shape[0]`` is taken as the number
                of input channels and whose samples are used to size the
                linear layer.
            features_dim: Width of the feature vector produced by ``forward``.
            normalized_image: Accepted in the signature but not read by this
                constructor.
        """
        super().__init__(observation_space, features_dim)

        # Channel widths along the conv stack: input -> 32 -> 32 -> 64.
        channel_plan = (observation_space.shape[0], 32, 32, 64)
        conv_stack = []
        for in_ch, out_ch in zip(channel_plan[:-1], channel_plan[1:]):
            conv_stack.append(nn.Conv2d(in_ch, out_ch, (2, 2)))
            conv_stack.append(nn.ReLU())
        conv_stack.append(nn.Flatten())
        self.cnn = nn.Sequential(*conv_stack)

        # Push one sampled observation (with a leading batch axis) through the
        # conv stack to discover the flattened width.
        probe = torch.as_tensor(observation_space.sample()[None]).float()
        with torch.no_grad():
            flat_width = self.cnn(probe).shape[1]

        self.linear = nn.Sequential(nn.Linear(flat_width, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the projected features for a batch of observations."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

# Run configuration shared by the cells below.

# Task name; interpolated into the environment id and into the
# `mini_behavior.envs:{TASK}Env` entry point.
TASK = 'PlayAlligator'
# NOTE(review): not read anywhere in the visible cells — confirm whether it is still needed.
PARTIAL_OBS = True
# Forwarded to the environment as `room_size` and embedded in the environment id.
ROOM_SIZE = 10
# Forwarded to the environment as `max_steps`.
MAX_STEPS = 1000
# Passed to APT_PPO as `total_timesteps`; note this literal is a float (512000.0).
TOTAL_TIMESTEPS = 512e3
# When True, `dense_reward` is added to the environment kwargs.
DENSE_REWARD = False
# NOTE(review): presumably a stable-baselines3 style policy name — confirm where it is consumed.
POLICY_TYPE = 'CnnPolicy'
# Number of environments handed to init_env and to APT_PPO.
NUM_ENVS = 8
# NOTE(review): presumably the per-environment rollout length per update — confirm against APT_PPO.
NUM_STEPS = 128
# Environment id; also used as the checkpoint file name in the save/load cells.
env_name = f"MiniGrid-{TASK}-{ROOM_SIZE}x{ROOM_SIZE}-N2-v0"

In [None]:

def get_single_env() -> gym.Env:
    """Register the configured task and return one instance of it.

    Reads the module-level ``TASK``, ``ROOM_SIZE``, ``MAX_STEPS`` and
    ``DENSE_REWARD`` settings to build the environment id and its kwargs.

    Returns:
        The environment produced by ``gym.make`` for the registered id.

    Raises:
        AssertionError: If ``DENSE_REWARD`` is set while ``TASK`` is not one
            of the two tasks listed in the check below.
    """
    env_name = f"MiniGrid-{TASK}-{ROOM_SIZE}x{ROOM_SIZE}-N2-v0"

    # NOTE(review): the trainer is named APT_PPO while this value is spelled
    # "ATP" — confirm the environment expects exactly this string.
    kwargs = {"room_size": ROOM_SIZE, "max_steps": MAX_STEPS, "exploration_type": "ATP"}

    if DENSE_REWARD:
        assert TASK in ["PuttingAwayDishesAfterCleaning", "WashingPotsAndPans"]
        kwargs["dense_reward"] = True

    # NOTE(review): registration runs on every call; confirm that registering
    # the same id twice in one process is tolerated before calling this
    # function more than once per process.
    register(
        id=env_name,
        entry_point=f'mini_behavior.envs:{TASK}Env',
        kwargs=kwargs
    )

    return gym.make(env_name)


def init_env(num_envs: int):
    """Create a vectorized environment made of ``num_envs`` copies of the task.

    Args:
        num_envs: Number of environment copies; must be at least 1.

    Returns:
        A ``DummyVecEnv`` when a single environment is requested, otherwise a
        ``SubprocVecEnv`` with one factory per environment.

    Raises:
        ValueError: If ``num_envs`` is smaller than 1.
    """
    if num_envs < 1:
        # Fail early with a clear message instead of handing an empty factory
        # list to the vectorized-environment constructor.
        raise ValueError(f"num_envs must be at least 1, got {num_envs}")

    # get_single_env is itself a zero-argument factory, so no wrapper is needed.
    env_fns = [get_single_env] * num_envs
    if num_envs == 1:
        return DummyVecEnv(env_fns)
    return SubprocVecEnv(env_fns)
    
# Build the vectorized environment from the configured number of copies.
env = init_env(NUM_ENVS)

print('begin training')
# Policy training: every hyperparameter comes from the configuration constants
# so that editing the config cell is enough to change the run.
model = APT_PPO(env, num_envs=NUM_ENVS, total_timesteps=TOTAL_TIMESTEPS, num_steps=NUM_STEPS)

model.train()



begin training
UPDATE: 1/500
Average reward: 5.3681674
UPDATE: 2/500
Average reward: 11.178738
UPDATE: 3/500
Average reward: 12.385855
UPDATE: 4/500
Average reward: 12.00108
UPDATE: 5/500
Average reward: 11.236308
UPDATE: 6/500
Average reward: 10.41835
UPDATE: 7/500
Average reward: 9.630388
UPDATE: 8/500
Average reward: 8.89255
UPDATE: 9/500
Average reward: 8.20793
UPDATE: 10/500
Average reward: 7.574774
UPDATE: 11/500
Average reward: 6.9899683
UPDATE: 12/500
Average reward: 6.4501133
UPDATE: 13/500
Average reward: 5.951871
UPDATE: 14/500
Average reward: 5.4920826
UPDATE: 15/500
Average reward: 5.0678
UPDATE: 16/500
Average reward: 4.67629
UPDATE: 17/500
Average reward: 4.3150234
UPDATE: 18/500
Average reward: 3.9816651
UPDATE: 19/500
Average reward: 3.6740608
UPDATE: 20/500
Average reward: 3.3902202
UPDATE: 21/500
Average reward: 3.1283078
UPDATE: 22/500
Average reward: 2.8866298
UPDATE: 23/500
Average reward: 2.6636226
UPDATE: 24/500
Average reward: 2.4578438
UPDATE: 25/500
Average r

In [4]:
import os

# Directory that holds the checkpoint for this trial.
save_dir = "models/ATP_PPO_Trial"

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# replacing the separate existence check.
os.makedirs(save_dir, exist_ok=True)

# Build the path from save_dir so the save and load cells cannot drift apart.
model.save(f"{save_dir}/{env_name}")

In [5]:
# Read the checkpoint back from the path the save cell wrote to.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(f"{save_dir}/{env_name}")


In [6]:
# List the top-level entries stored in the loaded checkpoint.
print(checkpoint.keys())

dict_keys(['model_state_dict', 'optimizer_state_dict', 'learning_rate', 'total_timesteps', 'num_envs', 'num_steps'])


In [9]:
# Show the value stored under 'num_envs' in the loaded checkpoint.
print(checkpoint['num_envs'])

8
