### XRL Baby Steps Notebook

In this notebook, we take first "baby steps" toward combining three explainable
reinforcement learning (XRL) methods in order to explain deep RL agents:

- SVERL
- Group-SHAPLEY
- Shapley Explainability on Data Manifold

In [13]:
import gymnasium as gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback


In [14]:

# PPO hyperparameters, forwarded verbatim to the PPO constructor below.
hyperparams = {
    "policy": 'MlpPolicy',
    "n_steps": 1024,
    "batch_size": 64,
    "gae_lambda": 0.98,
    "gamma": 0.999,
    "n_epochs": 4,
    "ent_coef": 0.01,
}

# Training parameters
n_envs = 16
n_timesteps = 1_000_000  # kept as an int: learn() is annotated to take an integer step budget
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# Checkpoint roughly ten times per run (at least every 1000 timesteps).
# CheckpointCallback counts callback calls, and each call corresponds to one
# step of the *vectorized* env, i.e. n_envs timesteps. Dividing by n_envs
# converts the timestep interval into a call interval; without it, no
# checkpoint would be written within the n_timesteps budget.
checkpoint_every_timesteps = max(n_timesteps // 10, 1000)
save_freq = max(checkpoint_every_timesteps // n_envs, 1)

# Create vectorized environment (one subprocess per env; episode stats are
# written to log_dir through monitor_dir).
env = make_vec_env(
    "LunarLander-v3",
    n_envs=n_envs,
    vec_env_cls=SubprocVecEnv,
    monitor_dir=log_dir
)

# Create model
model = PPO(
    env=env,
    **hyperparams,
    verbose=1,
    tensorboard_log=log_dir,
    device='cpu' # PPO is intended to be run on CPU
)

# Callback for saving checkpoints into log_dir as "rl_model_<steps>_steps".
checkpoint_callback = CheckpointCallback(
    save_freq=save_freq,
    save_path=log_dir,
    name_prefix="rl_model"
)


Using cpu device


In [15]:

# Train the LunarLander agent, save the final weights, and always release the
# vectorized environment afterwards.
savepath = os.path.join(log_dir, "ppo_lunarlander_final")

try:
    model.learn(
        # int() guards against a float budget such as 1e6.
        total_timesteps=int(n_timesteps),
        callback=checkpoint_callback,
        tb_log_name="PPO"
    )

    # Save the final model
    model.save(savepath)
finally:
    # Close the environment even if training fails or is interrupted, so the
    # SubprocVecEnv worker processes do not outlive this cell.
    env.close()

Logging to logs/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.8     |
|    ep_rew_mean     | -195     |
| time/              |          |
|    fps             | 5644     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 87.3         |
|    ep_rew_mean          | -135         |
| time/                   |              |
|    fps                  | 4134         |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0060139652 |
|    clip_fraction        | 0.0486       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.000637

### Visualize LunarLander

In [None]:
# Reload the trained LunarLander agent and watch it act for T timesteps.
model = PPO.load(savepath, device='cpu')

env = gym.make("LunarLander-v3", render_mode="human")
obs, _ = env.reset()

T = 400 # number of timesteps
try:
    for _ in range(T):
        action, _ = model.predict(obs)
        # The new observation must be fed back into `obs`; otherwise the policy
        # keeps acting on the stale observation from the last reset.
        obs, reward, terminated, truncated, _ = env.step(action)
        # No explicit env.render() call: with render_mode="human" the
        # environment is rendered as part of step().
        if terminated or truncated:
            obs, _ = env.reset()
finally:
    # Always close the render window, even if the rollout is interrupted.
    env.close()

### CartPole Balancing Agent (PPO, 8 parallel environments)

In [None]:
# Parallel environments
vec_env = make_vec_env("CartPole-v1", n_envs=8)


"""
# Set hyperparameters
CartPole-v1:
  n_envs: 8
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 32
  batch_size: 256
  gae_lambda: 0.8
  gamma: 0.98
  n_epochs: 20
  ent_coef: 0.0
  learning_rate: lin_0.001
  clip_range: lin_0.2
"""

# Create the agent
model = PPO("MlpPolicy", vec_env, verbose=1)
model.gamma = 0.98
model.gae_lambda = 0.8
model.learning_rate = 0.001
model.learn(total_timesteps=float(1e5))


In [None]:
# Persist the CartPole agent under the name "ppo_cartpole".
model.save(path="ppo_cartpole")

In [None]:
# Reload the saved CartPole agent and roll it out in the vectorized
# environment for a fixed number of steps, rendering after every step.
model = PPO.load("ppo_cartpole")
obs = vec_env.reset()

T = 100 # number of timesteps
step = 0
while step < T:
    # predict() returns (actions, recurrent states); only the actions are used.
    action, _states = model.predict(obs)
    # No manual reset on `dones` is performed here.
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")
    step += 1
vec_env.close()