# Zadanie 6

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Środowisko dostępne jest w pakiecie **gymnasium** (następcy pakietu **gym**).

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]

Polecane źródła - teoria + intuicja:
- https://distill.pub/2019/paths-perspective-on-value-learning/
- https://www.youtube.com/watch?v=0iqz4tcKN58&ab_channel=SteveBrunton

In [1]:
import numpy as np
import gymnasium as gym

In [None]:
class Environtment:
    """Entry point for running the Gymnasium ``Taxi-v3`` environment.

    NOTE(review): the class name looks like a misspelling of "Environment";
    it is kept unchanged so existing references keep working.
    """

    @staticmethod
    def run(episodes: int, training: bool, render: bool) -> None:
        """Create the ``Taxi-v3`` environment and release it again.

        Declared as a static method because the function takes no ``self``;
        without the decorator, calling it on an instance would bind the
        instance to ``episodes`` and fail with a ``TypeError``.

        Args:
            episodes: Presumably the number of episodes to run; currently
                unused.
            training: Presumably toggles learning; currently unused.
            render: When true the environment is created with
                ``render_mode='human'``, otherwise with no render mode.
        """
        env = gym.make('Taxi-v3', render_mode='human' if render else None)
        # NOTE(review): the episode loop is not implemented yet. The
        # environment is closed right away so it is not leaked.
        env.close()

In [4]:
class QLearningSolver:
    """Tabular Q-learning agent for environments with discrete states and actions.

    The agent keeps a ``(observation_space, action_space)`` table of Q-values
    initialised to zeros, picks actions with an epsilon-greedy policy and
    updates the table with the Q-learning rule.
    """

    def __init__(
        self,
        observation_space: int,     # number of discrete states (table rows)
        action_space: int,          # number of discrete actions (table columns)
        learning_rate: float = 0.1,
        gamma: float = 0.9,
        epsilon: float = 0.1,
    ):
        """Store the hyperparameters and allocate a zero-filled Q-table.

        Args:
            observation_space: Number of rows of the Q-table.
            action_space: Number of columns of the Q-table.
            learning_rate: Step size (alpha) applied to the TD error.
            gamma: Discount factor applied to the next state's best Q-value.
            epsilon: Probability of choosing a random action in
                ``get_best_action``.
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon

        self.Q_table = np.zeros((observation_space, action_space))

    def __call__(self, state: int, action: int) -> float:
        """Return the Q-value stored for the given state and action."""
        return self.Q_table[state, action]

    def _best_next_action(self, next_state: int) -> int:
        """Return the index of the highest Q-value in the row of ``next_state``.

        ``np.argmax`` returns the first maximum, so ties resolve to the
        lowest action index.
        """
        # Cast so the result is a plain ``int`` as the annotation promises.
        return int(np.argmax(self.Q_table[next_state]))

    def _calc_td_target(
            self,
            next_state: int,
            reward: float,
            terminated: bool = False,
        ) -> float:
        """Compute the temporal-difference target.

        TD_target = r + gamma * max_a' Q(s', a')

        Args:
            next_state: State reached after taking the action.
            reward: Reward received for the transition.
            terminated: When true, ``next_state`` is treated as terminal and
                the target is just ``reward`` (no bootstrapping).

        Returns:
            The TD target for the transition.
        """
        if terminated:
            return reward

        best_action: int = self._best_next_action(next_state)

        return reward + self.gamma * self.Q_table[next_state, best_action]

    def update(
            self,
            state: int,
            action: int,
            next_state: int,
            reward: float,
            terminated: bool = False,
        ) -> None:
        """Update the Q-value of the given state and action in place.

        Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]

        Args:
            state: State in which the action was taken.
            action: Action that was taken.
            next_state: State reached after the action.
            reward: Reward received for the transition.
            terminated: When true, the bootstrap term is dropped because
                ``next_state`` is terminal. Defaults to ``False``, which
                matches the behaviour of calls that omit it.
        """
        current_q: float = self.Q_table[state, action]
        td_target: float = self._calc_td_target(next_state, reward, terminated)

        self.Q_table[state, action] = current_q + self.learning_rate * (td_target - current_q)

    def _exploration(self) -> int:
        """Return an action index drawn by ``np.random.choice(action_space)``."""
        return int(np.random.choice(self.action_space))

    def _exploitation(self, state: int) -> int:
        """Return the greedy action for ``state`` according to the Q-table."""
        return self._best_next_action(state)

    def get_best_action(self, state: int) -> int:
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned; otherwise
        the action with the highest Q-value for ``state`` is returned. Set
        ``epsilon`` to 0 to always get the greedy action.
        """
        if np.random.rand() < self.epsilon:
            return self._exploration()

        return self._exploitation(state)

    def __repr__(self):
        """Return a short representation with the table dimensions."""
        return f"QLearningSolver(observation_space={self.observation_space}, action_space={self.action_space})"

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

# Eksperymenty

# Wnioski