# Explore the Nav2D environment

In [None]:
import mujoco as mj
import gymnasium as gym
import nav2d
import pyautogui

# Build the Nav2D environment with an on-screen ("human") viewer and apply the
# viewer settings that can only be toggled through keypresses.
# TODO - test the rendering in "rgb_array"
width, height = 1920, 1080
default_camera_config = {
    "azimuth": 90,
    "elevation": -90.0,
    "distance": 3,
    "lookat": [0.0, 0.0, 0.0],
}

# Visual flags are keyed by their mjtVisFlag index, see
# https://mujoco.readthedocs.io/en/stable/APIreference/APItypes.html#mjtvisflag
# e.g., index 2 (mjVIS_JOINT) = True visualizes the joints.
visual_options = {2: True, 8: True}

# A few visualization settings cannot be passed when making the env; in the
# gymnasium MuJoCo rendering they can only be changed via keypresses.
# Ref - https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/mujoco/mujoco_rendering.py
# The flags below select which keys get pressed; the presses themselves are
# simulated with `pyautogui` (a very rough approach, but it works).
DEFAULT_CAMERA = "overhead_camera"
ENABLE_FRAME = True          # enable the body frames
RENDER_EVERY_FRAME = True    # similar sim speed as MuJoCo rendering when set to False, else slower

env = gym.make(
    "Nav2D-v0",
    render_mode="human",
    width=width,
    height=height,
    default_camera_config=default_camera_config,
    visual_options=visual_options,
)
obs, info = env.reset()

# (condition, key) pairs, pressed in this order once the env has been reset.
viewer_keypresses = (
    (DEFAULT_CAMERA == "overhead_camera", 'tab'),
    (ENABLE_FRAME, 'e'),
    (not RENDER_EVERY_FRAME, 'd'),
)
for wanted, key in viewer_keypresses:
    if wanted:
        pyautogui.press(key)

# Roll out a fixed action in the rendered environment to eyeball the dynamics.
# TODO - test resetting with the randomize flags
# TODO - test reward (esp when terminated due to 1. goal and 2. obstacle)
# TODO - pressing Esc in the rendering window prints "Pressed ESC. Quitting."
#        but the cell never finishes.
# NOTE(review): presumably the loop keeps stepping because closing the viewer
#        does not set `term`/`trunc` - confirm against gymnasium's mujoco_rendering.
try:
    for episode in range(1):
        done = False

        while not done:
            # TODO - when setting the action as env.action_space.sample, the motion
            # is very slow. Need to find a good action_space bound in nav2d.py
            action = [1.0, 0, 1.0]
            nobs, rew, term, trunc, info = env.step(action)

            # An episode ends on either termination or truncation.
            done = term or trunc
            obs = nobs if not done else env.reset()[0]
except KeyboardInterrupt:
    # Stopping the cell from the notebook is the expected way to end the
    # rollout, so report it instead of dumping a long traceback.
    print("Rollout interrupted by user.")
finally:
    # Always release the viewer, even when the loop is interrupted.
    env.close()

# Custom TD3 Training

# SB3 Training

In [1]:
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
import nav2d        # Have to import the nav2d Python script, else we can't make env
import os, re

In [2]:
# Train an SB3 TD3 agent on 8 copies of Nav2D-v0 stepped through a DummyVecEnv.
td3_kwargs = dict(
    learning_rate=5e-3,        # lr for all networks - Q-values, Actor, Value function
    buffer_size=10_000,        # replay buffer size
    learning_starts=1_000,     # number of data collection steps before training
    batch_size=256,
    tau=5e-3,                  # polyak update coefficient
    gamma=0.99,
    train_freq=1,
    gradient_steps=4,
    action_noise=None,
    n_steps=1,                 # n-step TD learning
    policy_delay=2,            # the policy and target networks are updated every policy_delay steps
    target_policy_noise=0.05,  # stdev of noise added to target policy
    target_noise_clip=0.1,     # limit of absolute value of noise
    verbose=2,
)

env = make_vec_env("Nav2D-v0", n_envs=8, vec_env_cls=DummyVecEnv)
model = TD3("MlpPolicy", env, **td3_kwargs)
model.learn(total_timesteps=100_000)

Using cuda device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 557       |
|    ep_rew_mean     | -3.59e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 1090      |
|    time_elapsed    | 4         |
|    total_timesteps | 4496      |
| train/             |           |
|    actor_loss      | 26.5      |
|    critic_loss     | 12.2      |
|    learning_rate   | 0.005     |
|    n_updates       | 1744      |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 565       |
|    ep_rew_mean     | -3.66e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 1083      |
|    time_elapsed    | 4         |
|    total_timesteps | 4632      |
| train/             |           |
|    actor_loss      | 27.4      |
|    critic_loss     | 2.17      |
|    learning_rate   | 0.005     |
| 

<stable_baselines3.td3.td3.TD3 at 0x79e721a5b160>

# Save the Trained Model

In [3]:
# Save the trained model as `run_NNNNN` inside the results folder, where NNNNN
# is one more than the highest run number already present (starting at 00000).
BASE_DIR = os.getcwd()
RESULT_FOLDER = 'Nav2D_TD3_results'
RESULT_DIR = os.path.join(BASE_DIR, RESULT_FOLDER)

# The folder does not exist before the first save; create it so that listing
# its contents below cannot raise FileNotFoundError.
os.makedirs(RESULT_DIR, exist_ok=True)

# Accept five or more digits so run numbers past 99999 are parsed in full.
run_pattern = re.compile(r'run_(\d{5,})')
existing_runs = os.listdir(RESULT_DIR)
run_numbers = [
    int(found.group(1))
    for found in map(run_pattern.match, existing_runs)
    if found
]

trial_number = max(run_numbers, default=-1) + 1
model.save(os.path.join(RESULT_DIR, f'run_{trial_number:05d}'))

# Load and Simulate the Model

In [6]:
# Load a saved TD3 policy and play it back for a few rendered episodes.
model_load = TD3.load('Nav2D_TD3_results/run_00005')

width = 1920
height = 1080
default_camera_config = {"azimuth" : 90.0, "elevation" : -90.0, "distance" : 3, "lookat" : [0.0, 0.0, 0.0]}
camera_id = 2

test_env = gym.make("Nav2D-v0", render_mode='human',
                    width=width, height=height,
                    default_camera_config=default_camera_config,
                    camera_id=camera_id,
                    # frame_skip=2,
                    # camera_name="camera",
                    # max_episode_steps=100
                    )

try:
    for eps in range(5):
        # reset() returns (observation, info); only the observation feeds the policy.
        obs, info = test_env.reset()
        done = False

        while not done:
            action, _state = model_load.predict(obs, deterministic=True)
            # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
            obs, reward, terminated, truncated, info = test_env.step(action)
            # Stop on truncation as well, otherwise a time-limited episode never ends.
            done = terminated or truncated
finally:
    # Always release the viewer, even if the rollout is interrupted or fails.
    test_env.close()