In [1]:
from time import sleep
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy

from grid_environments.windy_gridworld import WindyGridWOrldEnv
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from algorithms.sarsa import Sarsa
from algorithms.n_step_sarsa import NStepSarsa
from grid_environments.cliff_env import CliffEnv

In [2]:
# env = gym.make("CliffWalking-v0")
env = make_vec_env("CliffWalking-v0", n_envs=4)

In [9]:
agent_n_step = NStepSarsa(
        gamma=0.9,
        alpha=0.2,
        epsilon_0=1.0,
        epsilon_min=0.01,
        decay_rate=0.0005,
        action_space=env.action_space.n,
        observation_space=env.observation_space.n,
        n=3
    )

In [12]:
history = agent_n_step.train(env, episodes=100_000, decay_epsilon=True, plot_every=1000)

  0%|          | 0/100000 [00:01<?, ?it/s]


TypeError: 'int' object is not subscriptable

In [3]:
agent = Sarsa(
        gamma=0.9,
        alpha=0.1,
        epsilon_0=1.0,
        epsilon_min=0.01,
        decay_rate=0.0005,
        action_space=env.action_space.n,
        observation_space=env.observation_space.n,
    )

In [4]:
history = agent.train(env, episodes=100_000, decay_epsilon=True, plot_every=1000)

Episode: 0, Score: -7765.0
Episode: 1000, Score: -11244.185
Episode: 2000, Score: -1515.784
Episode: 3000, Score: -435.987
Episode: 4000, Score: -29.311
Episode: 5000, Score: -22.71
Episode: 6000, Score: -21.418
Episode: 7000, Score: -19.133
Episode: 8000, Score: -18.33
Episode: 9000, Score: -17.925
Episode: 10000, Score: -17.471
Episode: 11000, Score: -17.429
Episode: 12000, Score: -17.638
Episode: 13000, Score: -17.38
Episode: 14000, Score: -17.599
Episode: 15000, Score: -17.324
Episode: 16000, Score: -17.6
Episode: 17000, Score: -17.46
Episode: 18000, Score: -17.381
Episode: 19000, Score: -17.379
Episode: 20000, Score: -17.359
Episode: 21000, Score: -17.268
Episode: 22000, Score: -17.288
Episode: 23000, Score: -17.685
Episode: 24000, Score: -17.397
Episode: 25000, Score: -17.274
Episode: 26000, Score: -17.695
Episode: 27000, Score: -17.291
Episode: 28000, Score: -17.354
Episode: 29000, Score: -17.773
Episode: 30000, Score: -17.379
Episode: 31000, Score: -17.595
Episode: 32000, Score

In [9]:
policy_sarsa_n_step = np.array(
        [
            np.argmax(agen_n_step.q_table[key]) 
            for key in np.arange(48)
        ]
).reshape(4, 12)

In [10]:
policy_sarsa_n_step

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1],
       [3, 1, 1, 1, 0, 2, 0, 1, 1, 3, 1, 2],
       [1, 0, 0, 0, 0, 3, 1, 1, 3, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [11]:
policy_sarsa = np.array(
        [
            np.argmax(agent.q_table[key]) 
            for key in np.arange(48)
        ]
).reshape(4, 12)

In [12]:
policy_sarsa

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

---

In [15]:
env_cliff = CliffEnv(render_mode="rgb_array", size_x=6, size_y=5)

In [16]:
agent2 = Sarsa(
        gamma=0.9,
        alpha=0.1,
        epsilon_0=1.0,
        epsilon_min=0.01,
        decay_rate=0.0005,
        action_space=env_cliff.action_space.n,
        observation_space=env_cliff.observation_space.n,
    )

In [18]:
history_2 = agent2.train(env_cliff, episodes=100_000, decay_epsilon=True, plot_every=1000)

Episode: 0, Score: -100.0
Episode: 1000, Score: -94.592
Episode: 2000, Score: -35.43
Episode: 3000, Score: -18.532
Episode: 4000, Score: -9.85
Episode: 5000, Score: -4.797
Episode: 6000, Score: -2.262
Episode: 7000, Score: -2.175
Episode: 8000, Score: -1.26
Episode: 9000, Score: -0.984
Episode: 10000, Score: -0.711
Episode: 11000, Score: -0.259
Episode: 12000, Score: -0.458
Episode: 13000, Score: -0.32
Episode: 14000, Score: -0.609
Episode: 15000, Score: -0.515
Episode: 16000, Score: -0.425
Episode: 17000, Score: -0.422
Episode: 18000, Score: -0.401
Episode: 19000, Score: -0.414
Episode: 20000, Score: -0.534
Episode: 21000, Score: -0.604
Episode: 22000, Score: -0.438
Episode: 23000, Score: -0.137
Episode: 24000, Score: -0.409
Episode: 25000, Score: -0.657
Episode: 26000, Score: -0.442
Episode: 27000, Score: -0.424
Episode: 28000, Score: -0.117
Episode: 29000, Score: -0.385
Episode: 30000, Score: -0.116
Episode: 31000, Score: -0.502
Episode: 32000, Score: -0.622
Episode: 33000, Score: -

In [20]:
agent2.q_table.shape

(30, 4)

In [23]:
policy_sarsa_2 = np.array(
        [
            np.argmax(agent2.q_table[key]) 
            for key in np.arange(30)
        ]
).reshape(6,5)
policy_sarsa_2[env_cliff._cliff] = -1

In [24]:
policy_sarsa_2

array([[ 1,  1,  0,  0,  0],
       [-1,  1,  1,  0,  0],
       [-1,  1,  1,  0,  0],
       [-1,  1,  0,  0,  0],
       [-1,  0,  0,  0,  0],
       [ 0,  3,  3,  3,  3]])