# Zadanie 5

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Problem dostępny jest w pakiecie **gymnasium** (następcy pakietu **gym**).

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]


In [98]:
import numpy as np
import gymnasium as gym

In [99]:
class QLearningSolver:
    """Tabular Q-value store usable with discrete environments.

    The solver owns a ``(observation_space, action_space)`` table of Q-values
    initialised to zero and carries the hyperparameters (learning rate,
    discount factor, exploration rate) so that a training loop can read them
    back from a single object.
    """

    def __init__(
        self,
        observation_space: int,
        action_space: int,
        learning_rate: float = 0.1,
        gamma: float = 0.9,
        epsilon: float = 0.1,
    ) -> None:
        """Create a solver with an all-zero Q-table.

        Args:
            observation_space: Number of discrete states (rows of the table).
            action_space: Number of discrete actions (columns of the table).
            learning_rate: Step size stored for the training loop.
            gamma: Discount factor stored for the training loop.
            epsilon: Exploration probability stored for the training loop.
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = np.zeros(shape=(observation_space, action_space))

    def __call__(self, state: int, action: int) -> float:
        """Return the Q-value stored for ``(state, action)``."""
        return self.q_table[state][action]

    def update(self, state: int, action: int, reward: float) -> None:
        """Add ``reward`` to the Q-value stored for ``(state, action)``.

        Despite its name, ``reward`` is simply the amount added to the table
        entry; this method applies neither the learning rate nor the discount
        factor, so the caller must pass the already scaled increment.
        """
        self.q_table[state][action] += reward

    def get_best_action(self, state: int) -> int:
        """Return the action index with the highest Q-value in ``state``.

        Ties resolve to the lowest index, as ``np.argmax`` does.
        """
        return np.argmax(self.q_table[state])

    def get_best_move_evaluation(self, state: int) -> float:
        """Return the highest Q-value available in ``state``."""
        return np.max(self.q_table[state])

    def __repr__(self) -> str:
        """Return a constructor-like summary of the solver's configuration."""
        return (
            f"{type(self).__name__}("
            f"observation_space={self.observation_space}, "
            f"action_space={self.action_space}, "
            f"learning_rate={self.learning_rate}, "
            f"gamma={self.gamma}, "
            f"epsilon={self.epsilon})"
        )

    def __str__(self) -> str:
        """Return the same text as ``repr``."""
        return self.__repr__()

In [100]:
def run_episode(solver: QLearningSolver, environment, max_steps):
    """Play one epsilon-greedy episode and update the solver's Q-table in place.

    The episode stops once the environment reports termination or truncation,
    or after ``max_steps`` steps, whichever comes first. Nothing is returned;
    the learning result lives in ``solver``.
    """
    current = environment.reset()[0]
    steps_taken = 0
    done = False

    while not done and steps_taken < max_steps:
        # Explore with probability epsilon, otherwise act greedily.
        explore = np.random.random() < solver.epsilon
        if explore:
            action = environment.action_space.sample()
        else:
            action = solver.get_best_action(current)

        successor, reward, terminated, truncated, _ = environment.step(action)

        # Temporal-difference error: bootstrapped target minus current estimate.
        td_error = (
            reward
            + solver.gamma * solver.get_best_move_evaluation(successor)
            - solver(current, action)
        )
        solver.update(current, action, solver.learning_rate * td_error)

        current = successor
        steps_taken += 1
        done = terminated or truncated

def q_learning(
    environment,
    learning_rate: float = 0.1,
    epsilon: float = 0.1,
    max_steps: int = 200,
    gamma: float = 0.9,
    number_of_episodes: int = 200,
) -> "QLearningSolver":
    """Train a fresh ``QLearningSolver`` on ``environment`` and return it.

    Args:
        environment: Environment exposing ``observation_space.n`` and
            ``action_space.n``; it is passed unchanged to ``run_episode``.
        learning_rate: Step size handed to the solver.
        epsilon: Exploration probability handed to the solver.
        max_steps: Upper bound on steps per episode.
        gamma: Discount factor handed to the solver.
        number_of_episodes: How many episodes to run.

    Returns:
        The solver after ``number_of_episodes`` training episodes.
    """
    solver = QLearningSolver(
        environment.observation_space.n,
        environment.action_space.n,
        learning_rate,
        gamma,
        epsilon,
    )
    for i in range(number_of_episodes):
        run_episode(solver, environment, max_steps)
        if i % 100 == 0:
            # `clear_output` is not imported in this file; it is only present
            # when the session has already run
            # `from IPython.display import clear_output`. Without it, progress
            # lines are appended instead of crashing on the first episode.
            try:
                clear_output(wait=True)
            except NameError:
                pass
            print(f"Episode: {i}")
    return solver

In [101]:
# Train the agent on the Taxi environment.
env = gym.make("Taxi-v3")
solver = q_learning(environment=env, number_of_episodes=100_000, gamma=0.6)

Episode: 99900
CPU times: total: 39.1 s
Wall time: 37.4 s


In [None]:
# Persist the learned Q-table; np.save appends ".npy" when the name has no extension.
np.save(file="solver", arr=solver.q_table)

# Wnioski