In [None]:
from tqdm import tqdm
import gymnasium as gym
import copy 
from src.Sarsa import SarsaAgent
from src.SarsaLambda import SarsaLambdaAgent
from src.QLearning import QLearningAgent
from src.QLearningLambda import QLearningLambdaAgent
from src.Visualizing import training_visualize

In [None]:
# --- Training budget -------------------------------------------------------
# Number of episodes each agent is trained for; also reused as the size of
# the episode-statistics buffer when the environment is wrapped.
n_episodes = 100_000

# --- Update-rule hyperparameters -------------------------------------------
learning_rate = 0.01
# Only handed to the two "lambda" agent variants.
lambda_factor = 0.5

# --- Exploration schedule --------------------------------------------------
start_epsilon = 1.0
final_epsilon = 0.0
# Per-episode decay step. Presumably subtracted once per decay_epsilon() call,
# which would take epsilon from start_epsilon down to 0 over the first half of
# training -- TODO confirm against the agent classes.
epsilon_decay: float = 2 * start_epsilon / n_episodes

In [None]:
# One CliffWalking environment, capped at 1_000 steps per episode, wrapped so
# that per-episode statistics are recorded. deque_size matches n_episodes so
# the statistics buffer can hold one complete training run.
env: gym.Env = gym.wrappers.RecordEpisodeStatistics(
    gym.make('CliffWalking-v0', max_episode_steps=1_000),
    deque_size=n_episodes,
)

# Hyperparameters that all four agents have in common.
epsilon_greedy_settings = {
    "learning_rate": learning_rate,
    "initial_epsilon": start_epsilon,
    "epsilon_decay": epsilon_decay,
    "final_epsilon": final_epsilon,
}

# Every agent receives its own deep copy of the action space, so no agent
# shares the space object with the environment or with another agent.
agent_ql = QLearningAgent(
    action_space=copy.deepcopy(env.action_space),
    **epsilon_greedy_settings,
)
agent_sarsa = SarsaAgent(
    action_space=copy.deepcopy(env.action_space),
    **epsilon_greedy_settings,
)
# The two lambda variants additionally take lambda_factor.
agent_ql_lambda = QLearningLambdaAgent(
    action_space=copy.deepcopy(env.action_space),
    lambda_factor=lambda_factor,
    **epsilon_greedy_settings,
)
agent_sarsa_lambda = SarsaLambdaAgent(
    action_space=copy.deepcopy(env.action_space),
    lambda_factor=lambda_factor,
    **epsilon_greedy_settings,
)

# Human-readable action labels; presumably indexed by the environment's
# discrete action id -- confirm against the CliffWalking documentation.
actions: list[str] = ["Up", "Right", "Down", "Left"]

In [None]:
# Train the Q-learning agent: each pass of the outer loop plays one episode.
for episode in tqdm(range(n_episodes)):
    curr_observation, info = env.reset()
    # The first action is chosen before stepping; afterwards every step's
    # `next_action` is carried over as the following step's `curr_action`.
    curr_action: int = agent_ql.get_action(curr_observation)
    episode_over: bool = False
    while not episode_over:
        # Apply the pending action to the environment.
        next_observation, reward, terminated, truncated, info = env.step(curr_action)
        # Choose the follow-up action first, since update() takes it as an argument.
        next_action: int = agent_ql.get_action(next_observation)
        # Only `terminated` is forwarded to the update; `truncated` merely ends the loop.
        agent_ql.update(
            curr_observation, curr_action, reward, terminated, next_observation, next_action
        )
        # Slide the (observation, action) pair one step forward.
        curr_observation, curr_action = next_observation, next_action
        episode_over = terminated or truncated
    # One exploration-decay step per finished episode.
    agent_ql.decay_epsilon()

In [None]:
# Visualise the Q-learning run using the wrapped env and the trained agent.
# Presumably plots the statistics recorded by the env wrapper -- confirm in
# src/Visualizing.
# NOTE(review): the other three training_visualize calls in this notebook pass
# 10_000 as the last argument, while this one passes 10 -- confirm that the
# smaller value is intentional.
training_visualize(env, agent_ql, 'green', 10)

In [None]:
# Train the SARSA agent: each pass of the outer loop plays one episode.
for episode in tqdm(range(n_episodes)):
    curr_observation, info = env.reset()
    # The first action is chosen before stepping; afterwards every step's
    # `next_action` is carried over as the following step's `curr_action`.
    curr_action: int = agent_sarsa.get_action(curr_observation)
    episode_over: bool = False
    while not episode_over:
        # Apply the pending action to the environment.
        next_observation, reward, terminated, truncated, info = env.step(curr_action)
        # Choose the follow-up action first, since update() takes it as an argument.
        next_action: int = agent_sarsa.get_action(next_observation)
        # Only `terminated` is forwarded to the update; `truncated` merely ends the loop.
        agent_sarsa.update(
            curr_observation, curr_action, reward, terminated, next_observation, next_action
        )
        # Slide the (observation, action) pair one step forward.
        curr_observation, curr_action = next_observation, next_action
        episode_over = terminated or truncated
    # One exploration-decay step per finished episode.
    agent_sarsa.decay_epsilon()

In [None]:
# Visualise the SARSA run using the wrapped env and the trained agent.
# Presumably plots the statistics recorded by the env wrapper -- confirm in
# src/Visualizing.
training_visualize(env, agent_sarsa, 'yellow',10_000)

In [None]:
# Train the Q(lambda) agent: each pass of the outer loop plays one episode.
for episode in tqdm(range(n_episodes)):
    curr_observation, info = env.reset()
    # The first action is chosen before stepping; afterwards every step's
    # `next_action` is carried over as the following step's `curr_action`.
    curr_action: int = agent_ql_lambda.get_action(curr_observation)
    episode_over: bool = False
    while not episode_over:
        # Apply the pending action to the environment.
        next_observation, reward, terminated, truncated, info = env.step(curr_action)
        # Choose the follow-up action first, since update() takes it as an argument.
        next_action: int = agent_ql_lambda.get_action(next_observation)
        # Only `terminated` is forwarded to the update; `truncated` merely ends the loop.
        agent_ql_lambda.update(
            curr_observation, curr_action, reward, terminated, next_observation, next_action
        )
        # Slide the (observation, action) pair one step forward.
        curr_observation, curr_action = next_observation, next_action
        episode_over = terminated or truncated
    # One exploration-decay step per finished episode.
    agent_ql_lambda.decay_epsilon()

In [None]:
# Visualise the Q(lambda) run using the wrapped env and the trained agent.
# Presumably plots the statistics recorded by the env wrapper -- confirm in
# src/Visualizing.
training_visualize(env, agent_ql_lambda, 'green', 10_000)

In [None]:
# Train the SARSA(lambda) agent: each pass of the outer loop plays one episode.
for episode in tqdm(range(n_episodes)):
    curr_observation, info = env.reset()
    # The first action is chosen before stepping; afterwards every step's
    # `next_action` is carried over as the following step's `curr_action`.
    curr_action: int = agent_sarsa_lambda.get_action(curr_observation)
    episode_over: bool = False
    while not episode_over:
        # Apply the pending action to the environment.
        next_observation, reward, terminated, truncated, info = env.step(curr_action)
        # Choose the follow-up action first, since update() takes it as an argument.
        next_action: int = agent_sarsa_lambda.get_action(next_observation)
        # Only `terminated` is forwarded to the update; `truncated` merely ends the loop.
        agent_sarsa_lambda.update(
            curr_observation, curr_action, reward, terminated, next_observation, next_action
        )
        # Slide the (observation, action) pair one step forward.
        curr_observation, curr_action = next_observation, next_action
        episode_over = terminated or truncated
    # One exploration-decay step per finished episode.
    agent_sarsa_lambda.decay_epsilon()

In [None]:
# Visualise the SARSA(lambda) run using the wrapped env and the trained agent.
# Presumably plots the statistics recorded by the env wrapper -- confirm in
# src/Visualizing.
training_visualize(env, agent_sarsa_lambda, 'yellow', 10_000)

In [None]:
# Roll out a single episode with the Q-learning agent. No update() or
# decay_epsilon() call is made here, so the agent is only queried for actions.
agent = agent_ql
curr_obs, info = env.reset()
rollout_finished: bool = False
while not rollout_finished:
    # Ask the agent for an action in the current state and apply it; the
    # resulting observation immediately becomes the current one.
    chosen_action: int = agent.get_action(curr_obs)
    curr_obs, reward, terminated, truncated, info = env.step(chosen_action)
    # Stop on either a true terminal state or the step-limit truncation.
    rollout_finished = terminated or truncated
    