In [106]:
%load_ext autoreload
%autoreload 2
from stable_baselines3 import PPO
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.monitor import Monitor
from wrapper.agent_sb import SB3Wrapper
from stable_baselines3.common.callbacks import CheckpointCallback
from wrapper import ROOT
from wrapper.training import EvalCheckpointCallback
import gymnasium as gym
import numpy as np 

# Gymnasium id of the task used throughout this notebook.
ENV_ID = "InvertedDoublePendulum-v5"

# The raw Gymnasium environment is wrapped in SB3's Monitor wrapper before use.
env = Monitor(gym.make(ENV_ID))

# Experiment artifacts are grouped under <ROOT>/runs/<exp_tag>/.
exp_tag = "rl_test"
runs_root = ROOT / "runs"
exp_dir = runs_root / exp_tag
# NOTE(review): log_dir is not referenced by any cell visible in this notebook
# excerpt -- confirm whether it is still needed.
log_dir = exp_dir / ".logs"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Evaluate on a dedicated environment instance rather than the training `env`.
# Sharing one instance lets evaluation rollouts step/reset the environment the
# learner is collecting from, which desynchronises the learner's cached
# observation and mixes evaluation episodes into the training Monitor stats.
# The id below must match the environment created for training.
eval_env = Monitor(gym.make("InvertedDoublePendulum-v5"))

# Periodic evaluation + checkpointing: every `save_freq` steps the callback
# runs `n_eval_episodes` evaluation episodes and writes into `save_path`.
# NOTE(review): exact checkpoint naming (e.g. best_model.zip) is defined by the
# project-local EvalCheckpointCallback -- confirm there.
callback = EvalCheckpointCallback(
    eval_env=eval_env,
    save_freq=20000,
    save_path=exp_dir,
    n_eval_episodes=10,
    verbose=1,
)

# Train a PPO agent with the default MLP policy for 100k environment steps.
# `learn` returns the model itself, so `agent` is the trained PPO instance.
agent = PPO("MlpPolicy", env).learn(total_timesteps=100000, callback=callback)

[Eval] Step 20000 | Reward: 179.38 +/- 66.93 | Length: 20.50 +/- 7.23
[Eval] Step 40000 | Reward: 318.48 +/- 141.71 | Length: 35.50 +/- 15.19
[Eval] Step 60000 | Reward: 538.20 +/- 398.22 | Length: 59.00 +/- 42.71
[Eval] Step 80000 | Reward: 848.97 +/- 465.19 | Length: 92.30 +/- 49.90
[Eval] Step 100000 | Reward: 2195.16 +/- 1676.99 | Length: 236.70 +/- 179.92


In [108]:
# Restore the checkpoint stored at <exp_dir>/best_model.zip.
# `load` is a classmethod that builds and returns a *new* model; it does not
# modify the instance it is called on. Calling it on the class avoids creating
# a throwaway randomly-initialised PPO and keeps `agent` bound to the loaded
# weights (previously `agent` stayed untrained while only the wrapper held the
# loaded model).
agent = PPO.load(exp_dir / "best_model.zip")
agent_wrapper = SB3Wrapper(agent)

# Sanity check: print the shape of every weight tensor the wrapper exposes.
for weight in agent_wrapper.get_weights():
    print(weight.shape)

torch.Size([1])
torch.Size([64, 9])
torch.Size([64])
torch.Size([64, 64])
torch.Size([64])
torch.Size([64, 9])
torch.Size([64])
torch.Size([64, 64])
torch.Size([64])
torch.Size([1, 64])
torch.Size([1])
torch.Size([1, 64])
torch.Size([1])


In [None]:
# Rollout settings handed to compute_surface: the environment to roll out in,
# plus step and episode counts.
# NOTE(review): how n_steps / n_episodes are interpreted is defined by the
# project-local SB3Wrapper -- confirm there.
rollout_kwargs = dict(env=env, n_steps=100, n_episodes=10)

# Evaluate the wrapped agent over an 11-point grid per axis; the result is
# plotted as the z-values of the surface below.
surface = agent_wrapper.compute_surface(rollout_kwargs, grid_size=11)

(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -1.0)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.8)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.6)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.4)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, -0.2)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)
(-1.0, 0.0)

In [112]:
import plotly.graph_objects as go

# Hover text: bold Z value, then X and Y on their own lines; the empty
# <extra></extra> suppresses the secondary trace-name box.
hovertemplate = "<b>Z %{z:.2f}</b> <br>X %{x:.2f}</br>Y %{y:.2f} <extra></extra>"

# The same offsets are used for both horizontal axes of the surface.
offsets = agent_wrapper.offsets

# NOTE(review): go.Surface reads z[i][j] as the height at (x[j], y[i]);
# confirm that compute_surface lays out its grid in that orientation.
surface_trace = go.Surface(
    x=offsets,
    y=offsets,
    z=surface,
    colorscale="RdBu_r",
    hovertemplate=hovertemplate,
    hoverlabel=dict(
        bgcolor="rgba(255, 255, 255, 0.9)",  # white at 90% opacity
        font=dict(color="black"),  # black hover text
    ),
)

fig = go.Figure(data=[surface_trace])

# Compact fixed-size figure with tight margins; the z axis is drawn at half
# the relative length of x and y.
fig.update_layout(
    width=600,
    height=400,
    margin=dict(l=5, r=5, b=5, t=5),
    scene=dict(aspectratio=dict(x=1, y=1, z=0.5)),
)

fig

In [88]:
import itertools

# Scratch check of how itertools.product ordering maps onto a reshaped grid:
# the first factor of each pair varies slowest, so it indexes the rows and
# the second factor indexes the columns.
test = np.array(list("ABCD"))
side = len(test)

# Concatenate every ordered pair, then fold the flat list into a square grid.
pair_labels = ["".join(pair) for pair in itertools.product(test, test)]
out = np.array(pair_labels).reshape(side, side)
out

array([['AA', 'AB', 'AC', 'AD'],
       ['BA', 'BB', 'BC', 'BD'],
       ['CA', 'CB', 'CC', 'CD'],
       ['DA', 'DB', 'DC', 'DD']], dtype='<U2')