# **Introduction**

This notebook tests the Stable-Baselines3 implementation of Soft Actor-Critic (SAC) on a Gymnasium environment.

# **Import Packages**

Import the necessary packages for this implementation.


In [None]:
# import these:
import gymnasium as gym
import numpy as np
import pandas as pd
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from tqdm import trange

# **Define Hyperparameters**

This section creates the monitored training environment and defines the SAC hyperparameters.

In [None]:
# ---- training environment ----
# Gymnasium task id; the visualization cell also branches on this value.
env_name = "Reacher-v5"
# Wrap the raw environment in a Monitor, handing it "./monitor_logs" as its log filename.
env = Monitor(env = gym.make(env_name), filename = "./monitor_logs")

# ---- SAC settings (each one is forwarded by name to the SAC constructor) ----
policy = "MlpPolicy"
verbose = 0

# optimisation:
learning_rate = 3.0e-4
gamma = 0.99
tau = 0.005
batch_size = 64

# data collection / update schedule:
buffer_size = 1_000_000
learning_starts = 10_000
train_freq = 1
gradient_steps = 1
target_update_interval = 1

# entropy handling and exploration:
ent_coef = "auto"
target_entropy = "auto"
action_noise = None

# NOTE(review): defined here but not passed to SAC(...) in the model cell; confirm whether it is still needed.
random_exploration = 0.0

# **Create and Use Model**

This section first creates the model using the defined hyperparameters, and then learns a policy.

In [None]:
# Gather every keyword setting in one mapping so the constructor call stays short;
# the keys are exactly the keyword names accepted by the call below.
sac_settings = {
    "gamma": gamma,
    "learning_rate": learning_rate,
    "buffer_size": buffer_size,
    "learning_starts": learning_starts,
    "train_freq": train_freq,
    "batch_size": batch_size,
    "tau": tau,
    "ent_coef": ent_coef,
    "target_update_interval": target_update_interval,
    "gradient_steps": gradient_steps,
    "target_entropy": target_entropy,
    "action_noise": action_noise,
    "verbose": verbose,
}

# Build the SAC agent on the monitored training environment.
model = SAC(policy, env, **sac_settings)

Train the model:

In [None]:
# Train in 100 chunks of 1000 steps so that progress can be reported after every chunk.
progress = trange(100, ncols = 100, colour = "#33FF00", desc = "training progress")

for _ in progress:
    # train for 1000 more steps; reset_num_timesteps = False asks learn() not to
    # reset its timestep counter, so successive calls continue the same run:
    model.learn(total_timesteps = 1000, reset_num_timesteps = False)

    # reload the episode log (presumably the file produced by the Monitor wrapper
    # created with "./monitor_logs"); skiprows = 1 drops the first line of the file
    # so that the following line is used as the column header:
    df = pd.read_csv("monitor_logs.monitor.csv", skiprows = 1)

    # report only once at least one episode has been logged:
    if not df.empty:
        last_reward = df["r"].iloc[-1]
        mean_last_10 = df["r"].tail(10).mean()
        # go through the bar's own writer: a bare print() inside an active tqdm
        # loop interleaves with the bar's redraws and leaves broken bar fragments.
        progress.write(f"current episodic reward: {last_reward:.2f} | "
                       f"mean of last 10: {mean_last_10:.2f}")

# **Visualization**

This section visualizes the learned policy by rolling out one deterministic episode in a rendered environment.

In [None]:
# render settings:
width = 1280
height = 1280

match env_name:
    case "InvertedPendulum-v5":
        default_camera_config = {"azimuth" : 90.0, "elevation" : 0.0, "distance" : 3.5, "lookat" : [0.0, 0.0, 0.25]}
        env = gym.make(env_name,
                    healthy_reward = 10.0,
                    render_mode = "human", 
                    width = width,
                    height = height,
                    default_camera_config = default_camera_config)

    case "InvertedDoublePendulum-v5":
        default_camera_config = {"azimuth" : 90.0, "elevation" : 0.0, "distance" : 3.5, "lookat" : [0.0, 0.0, 0.25]}
        env = gym.make(env_name,
                    healthy_reward = 10.0,
                    render_mode = "human", 
                    width = width,
                    height = height,
                    default_camera_config = default_camera_config)
    case "Reacher-v5":
        default_camera_config = {"azimuth" : 90.0, "elevation" : -90.0, "distance" : 1.5, "lookat" : [0.0, 0.0, 0.25]}
        env = gym.make(env_name,
                        render_mode = "human",
                        reward_dist_weight = 1.0,
                        width = width, 
                        height = height,
                        default_camera_config = default_camera_config, 
                        max_episode_steps = 50)  

obs, _ = env.reset()
done = False

while not done:
    action, _ = model.predict(obs, deterministic = True)
    obs, reward, term, trunc, _ = env.step(action)
    done = term or trunc
    env.render()

env.close()