# **Introduction**

This notebook implements Soft Actor-Critic (SAC) on the custom-developed 2D navigation environment ``Nav2D-v0``. The goal is to quantify the performance of SAC on a simple 2D navigation task so that it can be used for incremental learning in subsequent environments.

# **Imports**

This section imports the necessary packages for this implementation.

In [6]:
# import gymnasium related packages:
import gymnasium as gym
from gymnasium.utils.env_checker import check_env

# import custom environments and wrappers:
import nav2d

# import stablebaselines stuff:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import Monitor, make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

# other necessary imports:
import torch
from tqdm import tqdm
import pyautogui
import numpy as np
import pandas as pd
import os

# **Function Definitions**

This section defines the functions required for this implementation.

In [7]:
# evaluation function:
def eval(env: "gym.Env", 
         num_evals: int, 
         model):
    """Roll out ``model`` deterministically and return its mean episodic reward.

    NOTE: this name shadows Python's builtin ``eval``; it is kept because
    other cells refer to it by this name.

    Args:
        env: environment exposing the gymnasium API used here, i.e.
            ``reset() -> (obs, info)`` and
            ``step(action) -> (obs, reward, terminated, truncated, info)``.
        num_evals: number of complete episodes to run.
        model: object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.

    Returns:
        Mean of the summed per-episode rewards, rounded to 3 decimals.
    """
    # summed reward of every finished episode:
    eval_rew_hist = []

    # for each episode in the num_evals:
    for _ in range(num_evals):
        # each episode starts from exactly one reset:
        obs, _ = env.reset()
        done = False

        # initialize episodic reward:
        eval_rew = 0

        while not done:
            # get action and step; the next observation simply replaces the
            # current one (no reset here: the next episode resets at the top
            # of the outer loop, and reset() returns a tuple, not an obs):
            action, _ = model.predict(obs, deterministic = True)
            obs, reward, term, trunc, _ = env.step(action)
            done = term or trunc

            # advance reward:
            eval_rew += reward

        # append:
        eval_rew_hist.append(eval_rew)

    return np.mean(eval_rew_hist).round(3)

# **Environment Definition and Hyperparameters**

This section defines and verifies the environment, defines the hyperparameters for the model, and creates a model.

In [8]:
# instantiate Nav2D with its default settings for an API sanity check:
env = gym.make("Nav2D-v0")

# run gymnasium's checker on the unwrapped environment and report the outcome
# instead of letting a failed check stop the notebook:
try:
    check_env(env.unwrapped)
except Exception as check_error:
    print(f"Environment has the following issues: \n{check_error}")
else:
    print("Environment passes all checks!")

Environment has the following issues: 
The `.np_random` is not properly been updated after step.


Define hyperparameters:

In [9]:
# --- algorithm settings ---
policy = "MlpPolicy"
gamma = 0.99
tau = 0.005
ent_coef = "auto_0.1"
action_noise = None
verbose = 0

# learning rates used for the actor and critic Adam optimizers:
actor_lr = 0.0001
critic_lr = 0.00001

# replay / update schedule:
buffer_size = 1_000_000
batch_size = 4096
learning_starts = 0
train_freq = 1
gradient_steps = 4
target_update_interval = 1

# entropy target: negative of the first dimension of the action space
target_entropy = -env.action_space.shape[0]

# --- reward scaling passed to the environment ---
rew_head_scale = 2.5
rew_head_approach_scale = 50
rew_dist_scale = 250.0
rew_goal_scale = 5_000.0
rew_obst_scale = -1_000.0

# --- environment / runtime switches ---
vectorize = False            # True -> build n_envs vectorized environments
n_envs = 4
render_mode = "human"
max_episode_steps = 1000
gpu = True                   # True -> request "cuda" as the model device

Make envs:

In [10]:
# reward scaling for the training environment(s); built once so the
# vectorized and single-environment branches cannot drift apart:
reward_scale_options = {"rew_head_scale" : rew_head_scale,
                        "rew_head_approach_scale" : rew_head_approach_scale,
                        "rew_dist_scale" : rew_dist_scale,
                        "rew_goal_scale" : rew_goal_scale,
                        "rew_obst_scale" : rew_obst_scale}

# if using a vectorized environment:
if vectorize:
    # make the vectorized environments (always rendered off-screen):
    print("making vectorized environments!")
    env = make_vec_env("Nav2D-v0",
                       n_envs = n_envs,
                       env_kwargs = {"reward_scale_options" : reward_scale_options,
                                     "max_episode_steps" : max_episode_steps,
                                     "render_mode" : "rgb_array"},
                       vec_env_cls = DummyVecEnv)
else:
    # make a single environment using the configured render mode:
    print("making single environment!")
    env = gym.make("Nav2D-v0",
                   reward_scale_options = reward_scale_options,
                   max_episode_steps = max_episode_steps,
                   render_mode = render_mode)

# evaluation environment:
# NOTE(review): no reward_scale_options are passed here, so evaluation rewards
# use whatever scaling Nav2D-v0 defaults to -- confirm this is intended.
eval_env = gym.make("Nav2D-v0", max_episode_steps = max_episode_steps, render_mode = "rgb_array", is_eval = True)

making single environment!


Create model:

In [11]:
# model creation using SB3 (gamma is passed explicitly so the value set in the
# hyperparameter cell is the one actually used):
model = SAC(policy = policy, 
            env = env,
            gamma = gamma,
            buffer_size = buffer_size,
            batch_size = batch_size,
            tau = tau,
            ent_coef = ent_coef,
            train_freq = train_freq,
            learning_starts = learning_starts,
            target_update_interval = target_update_interval,
            gradient_steps = gradient_steps,
            target_entropy = target_entropy,
            action_noise = action_noise, 
            verbose = verbose,
            device = "cuda" if gpu else "cpu")

# replace the default optimizers so the actor and critic use separate learning rates:
model.actor.optimizer = torch.optim.Adam(model.actor.parameters(), lr = actor_lr)
model.critic.optimizer = torch.optim.Adam(model.critic.parameters(), lr = critic_lr)

# SB3's train() calls model._update_learning_rate(optimizers), which rewrites
# the lr of every optimizer from model.lr_schedule and would silently discard
# actor_lr / critic_lr on the first update. Shadow it on this instance with a
# no-op so the rates set above are kept (the entropy-coefficient optimizer then
# also keeps its initial learning rate).
def _keep_custom_learning_rates(optimizers):
    """Leave the learning rates of all optimizers untouched."""
    return None

model._update_learning_rate = _keep_custom_learning_rates

print(f"created model using: {model.device} as device")

created model using: cuda as device


# **Train the model**

Using the instantiated SB3 model, train on the ``Nav2D-v0`` environment.

In [12]:
# run parameters:
number_of_runs = 10
steps_per_run = 25000
num_evals = 10

# initialize the total reward:
total_reward = []

# model saving parameters:
base_path = os.path.join(os.getcwd(), "results/Nav2D_SAC_SB3_results")
# make sure the results root exists, otherwise os.listdir raises on a fresh checkout:
os.makedirs(base_path, exist_ok = True)
results_path = os.path.join(base_path, f"result_{len(os.listdir(base_path)) + 1}")

# using model.learn approach; the try/finally guarantees the environment is
# closed even when training is interrupted (e.g. KeyboardInterrupt) or fails:
try:
    for run in tqdm(range(number_of_runs), ncols = 100, colour = "#33FF00", desc = "training progress"):
        # learn every run, continuing the timestep counter across runs:
        model.learn(total_timesteps = steps_per_run, reset_num_timesteps = False)

        # # evaluate and save every 10th run:
        # if run % max(int(number_of_runs/10), 1) == 0:
        #     # NOT SURE IF THIS SHOULD BE IN HERE: os.makedirs(results_path, exist_ok = True)
        #     # after learning:
        #     eval_reward = eval(eval_env, num_evals = num_evals, model = model)

        #     # append the eval reward to the total reward:
        #     total_reward.append(eval_reward)

        #     # save the model to this directory:
        #     model.save(os.path.join(results_path, f"run_{run}"))
finally:
    # close environment when done:
    env.close()

training progress:   0%|[38;2;51;255;0m                                                     [0m| 0/10 [00:00<?, ?it/s][0m

 @ episode 30 | rew_head: 0.0000 | head_diff: 0.00527 | rew_head_approach: 0.2637 | vel: 0.04616 | pos_diff: 0.00000 | rew_approach: 0.0000 | total: 0.21374                                                                                  

c:\Users\mtidd2\Desktop\ROS2_DRL_Navigation\.venv\Lib\site-packages\glfw\__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'


 @ episode 30 | rew_head: 0.0000 | head_diff: 0.00000 | rew_head_approach: 0.0000 | vel: 0.24776 | pos_diff: 0.00000 | rew_approach: 0.0000 | total: -0.04999                                                                              

training progress:   0%|[38;2;51;255;0m                                                     [0m| 0/10 [19:24<?, ?it/s][0m

 @ episode 30 | rew_head: 0.0000 | head_diff: 0.00469 | rew_head_approach: 0.2346 | vel: 0.07002 | pos_diff: 0.00000 | rew_approach: 0.0000 | total: 0.18461                                                                              




KeyboardInterrupt: 

In [None]:
# if mujoco is angry: manual cleanup cell -- run this to close the environment
# by hand, e.g. when the training cell stopped before reaching its env.close():
env.close()

# **Visualization**

This section visualizes the learned policy.

In [None]:
visualize = True
testing_length = 10

if visualize:
    # render settings:
    width = 1280
    height = 1280
    default_camera_config = {"azimuth" : 90.0, "elevation" : -90.0, "distance" : 3, "lookat" : [0.0, 0.0, 0.0]}
    camera_id = 2

    DEFAULT_CAMERA = "overhead_camera"
    ENABLE_FRAME = True
    RENDER_EVERY_FRAME = True 

    # make a single environment:
    env = gym.make("Nav2D-v0", 
                render_mode = "human", 
                width = width, 
                height = height,
                default_camera_config = default_camera_config, 
                camera_id = camera_id, 
                max_episode_steps = max_episode_steps, 
                is_eval = False)

    # the try/finally guarantees the viewer/environment is closed even if a
    # rollout raises or the cell is interrupted:
    try:
        # NOTE(review): pyautogui sends these key presses to whichever window
        # currently has focus; presumably they are viewer shortcuts (camera,
        # frame, render-every-frame) -- confirm the viewer is focused here.
        if DEFAULT_CAMERA=="overhead_camera": pyautogui.press('tab')
        if ENABLE_FRAME: pyautogui.press('e') 
        if not RENDER_EVERY_FRAME: pyautogui.press('d') 

        # for every test episode:
        for _ in range(testing_length):
            # each episode starts from exactly one reset:
            obs, _ = env.reset()
            done = False

            while not done:
                # act greedily; the next observation replaces the current one
                # (no reset here: reset() returns a tuple, and the next episode
                # resets at the top of the outer loop):
                action, _ = model.predict(obs, deterministic = True)
                obs, _, term, trunc, _ = env.step(action)
                done = term or trunc

                # render for user:
                env.render()
    finally:
        # close when done:
        env.close()