# Explore the Nav2D environment

In [None]:
import mujoco as mj
import gymnasium as gym
import nav2d
import pyautogui

# Manually drive the Nav2D environment with a fixed action in a "human"
# render window, to eyeball the dynamics and the rendering settings.

# TODO - test the rendering in "rgb_array"
width = 1920
height = 1080
default_camera_config = {"azimuth" : 90, "elevation" : -90.0, "distance" : 3, "lookat" : [0.0, 0.0, 0.0]}

# Reference for setting visual flags https://mujoco.readthedocs.io/en/stable/APIreference/APItypes.html#mjtvisflag
visual_options = {2: True, 8: True}      # e.g., visualize the joints by setting mjVIS_JOINT (index 2) = True

# There are a few visualization things that cannot be set when making the env
# Ref - https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/mujoco/mujoco_rendering.py
# They can only be changed via keypresses in the gymnasium mujoco rendering.
# Here are some flags to specify what key to press. The key presses are simulated using `pyautogui`
# (Super rough approach but oh well)
DEFAULT_CAMERA = "overhead_camera"
ENABLE_FRAME = True                     # enable the body frames
RENDER_EVERY_FRAME = True              # similar sim speed as MuJoCo rendering when set to False, else slower

env = gym.make("Nav2D-v0",
               render_mode="human",
               width=width, height=height,
               default_camera_config=default_camera_config,
               visual_options=visual_options
               )

# Everything after env creation runs under try/finally so the render window is
# closed even when the cell is interrupted from the notebook.
try:
    obs, info = env.reset()

    # Simulate keypress for visualization elements in gymnasium MuJoCo rendering
    if DEFAULT_CAMERA=="overhead_camera": pyautogui.press('tab')
    if ENABLE_FRAME: pyautogui.press('e')
    if not RENDER_EVERY_FRAME: pyautogui.press('d')

    # TODO - test resetting with the randomize flags
    # TODO - test reward (esp when terminated due to 1. goal and 2. obstacle)
    for i in range(1):
        done = False

        while not done:
            # TODO - when setting the action as env.action_space.sample, the motion is very slow. Need to find a good action_space bound in nav2d.py
            action = [1.0, 0, 1.0]
            nobs, rew, term, trunc, info = env.step(action)

            # An episode ends on either termination or truncation.
            done = term or trunc
            obs = nobs if not done else env.reset()[0]
            # if done: print(nobs, info)
finally:
    # TODO - quitting does not stop very gracefully. Why?
    # When Esc from the rendering window, it says "Pressed ESC. Quitting." but never finish
    # Stopping from the notebook throw a huge KeyboardInterrupt error
    env.close()

# Custom TD3 Training

In [1]:
# --- import the custom-made TD3 algorithm
import gymnasium as gym
import nav2d        # Have to import the nav2d Python script, else we can't make env
import numpy as np
import os, re
import sys
sys.path.insert(0,'..')
from algorithms import TD3

In [None]:
# --- Network architectures, looked up by model name.
# The lists are presumably hidden-layer widths - confirm against algorithms.TD3.
model_registry = dict(
    TD3_v0=dict(
        actor_config=[256, 256],
        critic_config=[256, 256],
    ),
)
MODEL_NAME = 'TD3_v0'

# --- Optimisation hyperparameters, forwarded to TD3 under the lower-case names.
# NOTE(review): alpha1/alpha2/beta look like learning rates and tau_c/tau_a like
# soft-update coefficients - confirm against the TD3 implementation.
ALPHA1 = ALPHA2 = BETA = 1e-3
GAMMA = 0.99
TAU_C = TAU_A = 5e-3
SIGMA, CLIP = 0.2, 0.5

# --- Replay buffer sizing
BUFFER_SIZE, BUFFER_INIT, BATCH_SIZE = 10_000, 1_000, 512

# --- Training schedule and stopping criteria
UPDATE_FREQ = UPDATE_STEP = 2
TRAIN_ITER = 100_000
TRAIN_CRIT = dict(pass_limit=3, pass_score=-10, coeff_var_limit=1.0)
RESULT_FOLDER = 'Nav2D_TD3_results'
CUDA_ENABLED = EARLY_STOP = True

In [None]:
# Training environment; "human" render mode opens a viewer window while training.
env = gym.make("Nav2D-v0", render_mode="human")

for i in range(1):
    # Fresh random seed for each run, drawn from [1, 100).
    seed = np.random.randint(1,100)

    # Collect the constructor arguments in one place, then build the experiment.
    td3_settings = dict(
        model_name=MODEL_NAME,
        model_registry=model_registry,
        env=env,
        alpha1=ALPHA1,
        alpha2=ALPHA2,
        beta=BETA,
        gamma=GAMMA,
        tau_c=TAU_C,
        tau_a=TAU_A,
        sigma=SIGMA,
        clip=CLIP,
        buffer_size=BUFFER_SIZE,
        buffer_init=BUFFER_INIT,
        batch_size=BATCH_SIZE,
        update_f=UPDATE_FREQ,
        update_step=UPDATE_STEP,
        iter=TRAIN_ITER,
        seed=seed,
        train_crit=TRAIN_CRIT,
        result_folder=RESULT_FOLDER,
        cuda_enabled=CUDA_ENABLED,
    )
    TD3_experiment = TD3(**td3_settings)
    TD3_experiment.train(early_stop=EARLY_STOP,verbose=True)

run_00007:   1%|[38;2;46;111;64m▎                                            [0m| 560/100000 [00:11<2:09:38, 12.78it/s][0m

Good training at episode    0 with reward of 728.220. Evaluation results μ=-202.567, σ=59.582, CV= 0.294


run_00007:   1%|[38;2;46;111;64m▌                                             [0m| 1119/100000 [00:17<50:06, 32.89it/s][0m

Good training at episode    1 with reward of 1774.696. Evaluation results μ=1844.106, σ=44.054, CV= 0.024


run_00007:   2%|[38;2;46;111;64m▊                                             [0m| 1643/100000 [00:22<50:36, 32.39it/s][0m

Good training at episode    2 with reward of 1848.317. Evaluation results μ=1858.790, σ=51.306, CV= 0.028


run_00007:   2%|[38;2;46;111;64m█                                             [0m| 2198/100000 [00:28<49:25, 32.98it/s][0m

Good training at episode    3 with reward of 1790.798. Evaluation results μ=1847.488, σ=48.958, CV= 0.026


run_00007:   3%|[38;2;46;111;64m█▏                                            [0m| 2707/100000 [00:34<53:19, 30.41it/s][0m

Good training at episode    4 with reward of 1849.939. Evaluation results μ=1847.059, σ=48.377, CV= 0.026


run_00007:   3%|[38;2;46;111;64m█▌                                            [0m| 3274/100000 [00:39<53:36, 30.07it/s][0m

Good training at episode    5 with reward of 1829.112. Evaluation results μ=1881.734, σ=48.458, CV= 0.026


run_00007:   4%|[38;2;46;111;64m█▋                                          [0m| 3810/100000 [00:45<1:09:54, 22.93it/s][0m

Good training at episode    6 with reward of 1836.558. Evaluation results μ=1853.905, σ=53.680, CV= 0.029


run_00007:   4%|[38;2;46;111;64m█▉                                          [0m| 4340/100000 [00:51<1:09:26, 22.96it/s][0m

Good training at episode    7 with reward of 1872.901. Evaluation results μ=1867.525, σ=47.499, CV= 0.025


run_00007:   5%|[38;2;46;111;64m██▎                                           [0m| 4894/100000 [00:57<53:13, 29.78it/s][0m

Good training at episode    8 with reward of 1881.086. Evaluation results μ=1828.246, σ=41.726, CV= 0.023


run_00007:   5%|[38;2;46;111;64m██▌                                           [0m| 5449/100000 [01:03<50:31, 31.19it/s][0m

Good training at episode    9 with reward of 1807.877. Evaluation results μ=1842.286, σ=46.888, CV= 0.025


run_00007:   6%|[38;2;46;111;64m██▊                                           [0m| 5987/100000 [01:08<48:32, 32.27it/s][0m

Good training at episode   10 with reward of 1843.657. Evaluation results μ=1842.375, σ=44.877, CV= 0.024


run_00007:   7%|[38;2;46;111;64m██▉                                           [0m| 6520/100000 [01:13<54:18, 28.68it/s][0m

Good training at episode   11 with reward of 1873.170. Evaluation results μ=1866.524, σ=56.542, CV= 0.030


run_00007:   7%|[38;2;46;111;64m███▎                                          [0m| 7075/100000 [01:19<43:04, 35.95it/s][0m

Good training at episode   12 with reward of 1880.577. Evaluation results μ=1868.379, σ=50.601, CV= 0.027


run_00007:   8%|[38;2;46;111;64m███▌                                          [0m| 7621/100000 [01:25<50:32, 30.47it/s][0m

Good training at episode   13 with reward of 1767.724. Evaluation results μ=1835.451, σ=40.721, CV= 0.022


run_00007:   8%|[38;2;46;111;64m███▊                                          [0m| 8170/100000 [01:31<51:03, 29.98it/s][0m

Good training at episode   14 with reward of 1782.939. Evaluation results μ=1850.162, σ=39.324, CV= 0.021


run_00007:   9%|[38;2;46;111;64m████                                          [0m| 8712/100000 [01:36<52:21, 29.06it/s][0m

Good training at episode   15 with reward of 1867.613. Evaluation results μ=1856.829, σ=56.080, CV= 0.030


run_00007:   9%|[38;2;46;111;64m████▎                                         [0m| 9264/100000 [01:42<49:58, 30.26it/s][0m

Good training at episode   16 with reward of 1855.215. Evaluation results μ=1862.158, σ=52.471, CV= 0.028


run_00007:  10%|[38;2;46;111;64m████▌                                         [0m| 9815/100000 [01:48<49:41, 30.25it/s][0m

Good training at episode   17 with reward of 1812.580. Evaluation results μ=1846.688, σ=46.125, CV= 0.025


run_00007:  10%|[38;2;46;111;64m████▋                                        [0m| 10335/100000 [01:54<49:53, 29.95it/s][0m

Good training at episode   18 with reward of 1881.030. Evaluation results μ=1850.014, σ=55.288, CV= 0.030


run_00007:  10%|[38;2;46;111;64m████▋                                        [0m| 10451/100000 [01:54<15:36, 95.65it/s][0m

In [None]:
# Display the reward history stored on the experiment object created by the
# training cell (presumably one entry per training episode - confirm in algorithms.TD3).
TD3_experiment.reward_hist

# SB3 Training

In [None]:
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
import nav2d        # Have to import the nav2d Python script, else we can't make env
import os, re

In [None]:
# Eight copies of the Nav2D environment, stepped sequentially in one process.
env = make_vec_env("Nav2D-v0", n_envs=8, vec_env_cls=DummyVecEnv)

# Hyperparameters for the Stable-Baselines3 TD3 agent.
sb3_td3_hyperparams = dict(
    learning_rate=5e-3,         # lr for all networks - Q-values, Actor, Value function
    buffer_size=10_000,         # replay buffer size
    learning_starts=1_000,      # number of data collection steps before training
    batch_size=1_000,
    tau=5e-3,                   # polyak update coefficient
    gamma=0.99,
    train_freq=1,
    gradient_steps=4,
    action_noise=None,
    n_steps=1,                  # n-step TD learning
    policy_delay=2,             # the policy and target networks are updated every policy_delay steps
    target_policy_noise=0.05,   # stdev of noise added to target policy
    target_noise_clip=0.1,      # limit of absolute value of noise
    verbose=2,
)

model = TD3("MlpPolicy", env, **sb3_td3_hyperparams)
model.learn(total_timesteps=100_000)

# Save the Trained Model

In [None]:
# Save the trained SB3 model under the next free `run_NNNNN` name in the
# results folder.
# NOTE: this rebinds RESULT_FOLDER, which the custom TD3 configuration cell
# also defines; re-run that cell before launching another custom TD3 training.
BASE_DIR = os.getcwd()
RESULT_FOLDER = 'Nav2D_TD3_SB3_results'
RESULT_DIR = os.path.join(BASE_DIR, RESULT_FOLDER)

# Create the folder on first use; listing a missing directory would raise
# FileNotFoundError before anything could be saved.
os.makedirs(RESULT_DIR, exist_ok=True)

# Collect the numbers of existing runs (entries whose name starts with run_NNNNN).
existing_runs = os.listdir(RESULT_DIR)
run_matches = (re.match(r'run_(\d{5})', d) for d in existing_runs)
run_numbers = [int(m.group(1)) for m in run_matches if m]

# The first run gets number 0; afterwards continue from the highest existing number.
trial_number = max(run_numbers, default=-1) + 1
model.save(os.path.join(RESULT_DIR, f'run_{trial_number:05d}'))

# Load and Simulate the Model

In [None]:
import pyautogui

# Load a saved SB3 TD3 policy and roll it out for a few rendered episodes.
# NOTE: `TD3` must be the stable_baselines3 class here. The custom-training
# section's `from algorithms import TD3` binds the same name, so run the SB3
# import cell before this one.
model_load = TD3.load('Nav2D_TD3_SB3_results/run_00005')

width = 1920
height = 1080
default_camera_config = {"azimuth" : 90.0, "elevation" : -90.0, "distance" : 3, "lookat" : [0.0, 0.0, 0.0]}
camera_id = 2

# Viewer options that are toggled via simulated keypresses after the window opens.
DEFAULT_CAMERA = "overhead_camera"
ENABLE_FRAME = True                     # enable the body frames
RENDER_EVERY_FRAME = True              # similar sim speed as MuJoCo rendering when set to False, else slower

test_env = gym.make("Nav2D-v0", render_mode='human',
                    width=width,height=height,
                    default_camera_config=default_camera_config,
                    camera_id=camera_id,
                    # frame_skip=2,
                    # camera_name="camera",
                    # max_episode_steps=100
                    )

# try/finally guarantees the render window is closed even if the rollout is interrupted.
try:
    obs, info = test_env.reset()

    if DEFAULT_CAMERA=="overhead_camera": pyautogui.press('tab')
    if ENABLE_FRAME: pyautogui.press('e')
    if not RENDER_EVERY_FRAME: pyautogui.press('d')

    for eps in range(5):
        # reset() returns (observation, info); only the observation is fed to the policy.
        obs, _ = test_env.reset()
        dones = False

        while not dones:
            action, _ = model_load.predict(obs, deterministic=True)
            # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
            obs, rewards, terminated, truncated, info = test_env.step(action)
            # The episode is over on either termination or truncation (time limit).
            dones = terminated or truncated
finally:
    test_env.close()