# **Introduction**

This notebook serves as an implementation of Soft Actor-Critic (SAC) on the custom-developed 2D navigation environment, titled ``Nav2D-v0``. The goal of this implementation is to quantify the performance of SAC in a simple 2D navigational  task, such that it can be used for incremental learning within subsequent environments.

# **Imports**

This section imports the necessary packages for this implementation.

In [45]:
# import gymnasium related packages:
import gymnasium as gym
from gymnasium.utils.env_checker import check_env

# import custom environments and wrappers:
import nav2d

# import stablebaselines stuff:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import Monitor, make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

# other necessary imports:
from tqdm import tqdm
import pyautogui
import numpy as np
import pandas as pd
import os

# **Function Definitions**

This section defines the functions required for this implementation.

In [46]:
# evaluation function:
def eval(env: gym.Env, 
         num_evals: int, 
         model):
    # reward list:
    eval_rew_hist = []

    # for each episode in the num_evals:
    for _ in range(num_evals):
        obs, _ = env.reset()
        done = False

        # initialize episodic reward:
        eval_rew = 0

        # while False:
        while not done:
            # get action and step:
            action, _ = model.predict(obs, deterministic = True)
            nobs, reward, term, trunc, _ = env.step(action)
            done = term or trunc
            
            # advance reward:
            eval_rew += reward

            # advance observation, reset if not:
            obs = nobs if not done else env.reset()
    
        # append:
        eval_rew_hist.append(eval_rew)

    return np.mean(eval_rew_hist).round(3)

# **Environment Definition and Hyperparameters**

This section defines and verifies the environment, defines the hyperparameters for the model, and creates a model.

In [47]:
# make the environment:
env = gym.make("Nav2D-v0")

# check the environment:
try: 
    check_env(env.unwrapped)
    print(f"Environment passes all checks!")
except Exception as e:
    print(f"Environment has the following issues: \n{e}")

Environment has the following issues: 9.33 | diff: 26.56 | rew_head: -0.15 | rew_dist: -0.33 | total: -0.72                                              
The `.np_random` is not properly been updated after step.


Define hyperparameters:

In [None]:
# hyperparameters:
policy = "MlpPolicy"
gamma = 0.99
learning_rate = 3e-4
buffer_size = int(1e6)
batch_size = 64
tau = 5e-3
ent_coef = "auto_0.1"
train_freq = 1
learning_starts = int(0)
target_update_interval = 1
gradient_steps = 4
target_entropy = "auto"
action_noise = None
verbose = 0

# scaling on rewards:
rew_head_scale = 2.5
rew_dist_scale = 5.0
rew_goal_scale = 2000.0
rew_obst_scale = -1000.0

# vectorize or nah:
vectorize = True
n_envs = 64
render_mode = "human"

Make envs:

In [None]:
# if using a vectorized environment:
if vectorize:
    # make the vectorized environments:
    env = make_vec_env("Nav2D-v0", n_envs = n_envs, env_kwargs = {"reward_scale_options" : {"rew_head_scale" : rew_head_scale, 
                                                                                            "rew_dist_scale" : rew_dist_scale, 
                                                                                            "rew_goal_scale" : rew_goal_scale, 
                                                                                            "rew_obst_scale" : rew_obst_scale},
                                                                  "max_episode_steps" : 1000,
                                                                  "render_mode" : "rgb_array"}, vec_env_cls = DummyVecEnv)
else:
    # make a single environment:
    env = gym.make("Nav2D-v0", 
                    reward_scale_options = {"rew_head_scale" : rew_head_scale, 
                                            "rew_dist_scale" : rew_dist_scale,
                                            "rew_goal_scale" : rew_goal_scale,
                                            "rew_obst_scale" : rew_obst_scale}, 
                    max_episode_steps = 1000, 
                    render_mode = render_mode)

# evaluation environment:
eval_env = gym.make("Nav2D-v0", max_episode_steps = 1000, render_mode = "rgb_array")

Create model:

In [50]:
# model creation using SB3:
model = SAC(policy = policy, 
            env = env,
            learning_rate = learning_rate,
            buffer_size = buffer_size,
            batch_size = batch_size,
            tau = tau,
            ent_coef = ent_coef,
            train_freq = train_freq,
            learning_starts = learning_starts,
            target_update_interval = target_update_interval,
            gradient_steps = gradient_steps,
            target_entropy = target_entropy,
            action_noise = action_noise, 
            verbose = verbose)

print(type(model))

<class 'stable_baselines3.sac.sac.SAC'>


# **Train the model**

Using the instantiated SB3 model, train on the ``Nav2D-v0`` environment.

In [51]:
# run parameters:
number_of_runs = 100
steps_per_run = 25000
num_evals = 10

# initialize the total reward:
total_reward = []

# model saving parameters:
base_path = os.path.join(os.getcwd(), "results/Nav2D_SAC_SB3_results")
results_path = os.path.join(base_path, f"result_{len(os.listdir(base_path)) + 1}")
os.makedirs(results_path, exist_ok = True)

# using model.learn approach:
for run in tqdm(range(number_of_runs), ncols = 100, colour = "#33FF00", desc = "training progress"):
    # learn every run:
    model.learn(total_timesteps = steps_per_run, reset_num_timesteps = False)

    # evaluate and save every 10th run:
    if run % 10 == 0:
        # after learning:
        eval_reward = eval(eval_env, num_evals = num_evals, model = model)

        # append the eval reward to the total reward:
        total_reward.append(eval_reward)

        # save the model to this directory:
        model.save(os.path.join(results_path, f"run_{run+1}"))

# close environment when done:
env.close()

training progress:   0%|[38;2;51;255;0m                                                    [0m| 0/100 [00:00<?, ?it/s][0m

ep: 1 | required: 297.17 | current: 244.93 | diff: 52.23 | rew_head: -0.73 | rew_dist: -0.97 | total: -1.95                                              

training progress:   0%|[38;2;51;255;0m                                                    [0m| 0/100 [00:02<?, ?it/s][0m


KeyboardInterrupt: 

In [None]:
# if mujoco is angry:
env.close()

# **Visualization**

This section visualizes the learned policy.

In [None]:
visualize = True
testing_length = 10

if visualize:
    # render settings:
    width = 1280
    height = 1280
    default_camera_config = {"azimuth" : 90.0, "elevation" : -90.0, "distance" : 3, "lookat" : [0.0, 0.0, 0.0]}
    camera_id = 2

    DEFAULT_CAMERA = "overhead_camera"
    ENABLE_FRAME = True
    RENDER_EVERY_FRAME = True 

    # make a single environment:
    env = gym.make("Nav2D-v0", 
                render_mode = "human", 
                width = width, 
                height = height,
                default_camera_config = default_camera_config, 
                camera_id = camera_id, 
                max_episode_steps = 1000)

    if DEFAULT_CAMERA=="overhead_camera": pyautogui.press('tab')
    if ENABLE_FRAME: pyautogui.press('e') 
    if not RENDER_EVERY_FRAME: pyautogui.press('d') 

    # for every test episode:
    for eps in range(testing_length):
        obs, _ = env.reset()
        done = False

        # while not done:
        while not done:
            action, _ = model.predict(obs, deterministic = True)
            nobs, reward, term, trunc, _ = env.step(action)
            done = term or trunc

            # advance observation, reset if not:
            obs = nobs if not done else env.reset()
            
            # render for user:
            env.render()

    # close when done:
    env.close()

ep: 1 | required: 133.43 | current: 241.57 | diff: 108.14 | rew_head: -0.60 | rew_dist: -0.34 | total: -1.19                                             

  logger.warn(f"{pre} is not within the observation space.")


ep: 19 | required: 338.43 | current: 346.73 | diff: 8.30 | rew_head: -0.05 | rew_dist: -0.02 | total: -0.32                                               