# Zadanie 5

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Środowisko to jest dostępne w pakiecie **gymnasium** (następcy pakietu **gym**).

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]


In [28]:
import numpy as np
import gymnasium as gym
from IPython.display import clear_output

In [29]:
class QLearningSolver:
    """Tabular Q-value store usable with discrete environments.

    The solver keeps one Q-value per (state, action) pair in ``q_table`` and
    carries the hyperparameters (``learning_rate``, ``gamma``, ``epsilon``)
    that the training loop reads from it.

    Args:
        observation_space: Number of discrete states (rows of the Q-table).
        action_space: Number of discrete actions (columns of the Q-table).
        learning_rate: Step size stored for use by the training loop.
        gamma: Discount factor stored for use by the training loop.
        epsilon: Exploration probability stored for use by the training loop.
        q_table: Optional existing Q-table to reuse. It is stored by
            reference (not copied). When omitted, a zero-filled table of
            shape ``(observation_space, action_space)`` is created.
    """

    def __init__(
        self,
        observation_space: int,
        action_space: int,
        learning_rate: float = 0.1,
        gamma: float = 0.9,
        epsilon: float = 0.1,
        q_table: np.ndarray = None
    ):
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        if q_table is None:
            self.q_table = np.zeros(shape=(observation_space, action_space))
        else:
            self.q_table = q_table

    def __call__(self, state: int, action: int) -> float:
        """Return the Q-value stored for the given state and action."""
        return self.q_table[state][action]

    def update(self, state: int, action: int, reward: float) -> None:
        """Add ``reward`` to the Q-value of the given state and action.

        Despite its name, ``reward`` is simply the increment applied to the
        table entry; the caller is responsible for passing an already scaled
        temporal-difference error.
        """
        self.q_table[state][action] += reward

    def get_best_action(self, state: int) -> int:
        """Return the index of the action with the highest Q-value in ``state``.

        Ties are resolved in favour of the lowest action index (``np.argmax``).
        """
        # Convert the NumPy integer so the declared ``int`` return type holds.
        return int(np.argmax(self.q_table[state]))

    def get_best_move_evaluation(self, state: int) -> float:
        """Return the highest Q-value available in ``state``."""
        return np.max(self.q_table[state])

    def __repr__(self):
        """Return a constructor-like summary of the solver's configuration.

        The Q-table itself is left out to keep the representation short.
        """
        return (
            f"{type(self).__name__}("
            f"observation_space={self.observation_space}, "
            f"action_space={self.action_space}, "
            f"learning_rate={self.learning_rate}, "
            f"gamma={self.gamma}, "
            f"epsilon={self.epsilon})"
        )

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

In [46]:
def run_episode(solver: QLearningSolver, environment, max_steps):
    """Play one epsilon-greedy training episode, updating the Q-table in place.

    Starting from a fresh ``environment.reset()``, the agent keeps acting
    until the environment reports termination or truncation, or until
    ``max_steps`` actions have been taken. After every step the
    temporal-difference error, scaled by ``solver.learning_rate``, is added
    to the Q-value of the visited (state, action) pair.

    Args:
        solver: Q-table holder providing ``epsilon``, ``gamma`` and
            ``learning_rate``.
        environment: Object exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        max_steps: Upper bound on the number of actions taken in the episode.
    """
    state = environment.reset()[0]
    steps_taken = 0
    finished = False

    while not finished and steps_taken < max_steps:
        # Explore with probability epsilon, otherwise exploit the current table.
        explore = np.random.random() < solver.epsilon
        action = environment.action_space.sample() if explore else solver.get_best_action(state)

        next_state, reward, terminated, truncated, _ = environment.step(action)

        # One-step Q-learning target: reward plus discounted best next value.
        td_target = reward + solver.gamma * solver.get_best_move_evaluation(next_state)
        td_error = td_target - solver(state, action)
        solver.update(state, action, solver.learning_rate * td_error)

        state = next_state
        steps_taken += 1
        finished = terminated or truncated

def q_learning(environment, learning_rate=0.1, epsilon=0.1, max_steps=200, gamma=0.9, number_of_episodes=100000):
    """Train a fresh ``QLearningSolver`` on ``environment`` and return it.

    Args:
        environment: Environment whose ``observation_space.n`` and
            ``action_space.n`` give the Q-table dimensions.
        learning_rate: Step size handed to the solver.
        epsilon: Exploration probability handed to the solver.
        max_steps: Per-episode cap on the number of actions.
        gamma: Discount factor handed to the solver.
        number_of_episodes: How many training episodes to run.

    Returns:
        The solver after all episodes have been played.
    """
    solver = QLearningSolver(
        observation_space=environment.observation_space.n,
        action_space=environment.action_space.n,
        learning_rate=learning_rate,
        gamma=gamma,
        epsilon=epsilon,
    )
    report_every = 100
    for i in range(number_of_episodes):
        run_episode(solver, environment, max_steps)
        if (i + 1) % report_every:
            continue
        # Replace the previous progress line instead of appending a new one.
        clear_output(wait=True)
        print(f"Episode: {i+1}")
    return solver

In [30]:
"""Training the agent"""
env = gym.make("Taxi-v3")
solver = q_learning(environment=env, gamma=0.6, number_of_episodes=100_000)
# NumPy appends the ".npy" suffix, so the table is written to "solver.npy".
np.save("solver", solver.q_table)

Episode: 9900


In [11]:
# Rebuild a solver around the Q-table stored in "solver.npy".
solver = QLearningSolver(
    observation_space=env.observation_space.n,
    action_space=env.action_space.n,
    q_table=np.load("solver.npy"),
)

In [16]:
def test_solver(
    solver: QLearningSolver,
    environment,
    number_of_tests: int = 100,
    max_steps=50,
    randomness_seed: int = 0,
):
    """Evaluate the greedy policy of ``solver`` and report its performance.

    Every test episode starts from ``environment.reset()`` and always follows
    ``solver.get_best_action``. An episode counts as a success when the
    environment reports ``terminated`` within ``max_steps`` actions.

    Args:
        solver: Trained solver whose greedy policy is evaluated.
        environment: Environment to evaluate on; its ``reset`` is expected to
            accept a ``seed`` keyword as in the Gymnasium ``Env.reset`` API.
        number_of_tests: Number of test episodes to run (at least 1).
        max_steps: Per-episode cap on the number of actions.
        randomness_seed: Seed applied to NumPy's global RNG and passed to the
            first ``environment.reset`` call.

    Returns:
        Tuple ``(success_ratio, average_steps)``; the same two values are
        also printed.

    Raises:
        ValueError: If ``number_of_tests`` is smaller than 1.
    """
    if number_of_tests < 1:
        raise ValueError(f"number_of_tests must be at least 1, got {number_of_tests}")

    # Kept for backward compatibility: callers may rely on the global NumPy
    # RNG being reseeded here.
    np.random.seed(randomness_seed)

    successes = 0
    total_steps = 0
    for episode in range(number_of_tests):
        if episode == 0:
            # The start states are drawn by the environment itself, so the
            # environment's RNG has to be seeded (once) for the evaluation to
            # be reproducible.
            state = environment.reset(seed=randomness_seed)[0]
        else:
            state = environment.reset()[0]

        terminated, truncated = False, False
        steps = 0
        while not terminated and not truncated and steps < max_steps:
            action = solver.get_best_action(state)
            state, _, terminated, truncated = environment.step(action)[:4]
            steps += 1

        if terminated:
            successes += 1
        total_steps += steps

    success_ratio = successes / number_of_tests
    average_steps = total_steps / number_of_tests
    print(f"Success ratio: {success_ratio}\nAverage number of steps: {average_steps}")
    return success_ratio, average_steps
        
        

In [45]:
# Experiment: gamma = 0.9, 10 000 training episodes, 1 000 greedy test episodes.
solver = q_learning(environment=env, gamma=0.9, number_of_episodes=10_000)
test_solver(solver=solver, environment=env, number_of_tests=1_000)

Episode: 9999
Success ratio: 1.0
Average number of steps: 13.043


In [43]:
# Experiment: gamma = 0.8, 10 000 training episodes, 1 000 greedy test episodes.
solver = q_learning(environment=env, gamma=0.8, number_of_episodes=10_000)
test_solver(solver=solver, environment=env, number_of_tests=1_000)

Success ratio: 0.986
Average number of steps: 13.622


In [37]:
# Experiment: gamma = 0.99, 10 000 training episodes, 1 000 greedy test episodes.
solver = q_learning(environment=env, gamma=0.99, number_of_episodes=10_000)
test_solver(solver=solver, environment=env, number_of_tests=1_000)

Episode: 9900
Success ratio: 1.0
Average number of steps: 13.047


In [52]:
# Experiment: gamma = 0.99 with a shorter run of 4 000 training episodes,
# evaluated over 1 000 greedy test episodes.
solver = q_learning(environment=env, gamma=0.99, number_of_episodes=4_000)
test_solver(solver=solver, environment=env, number_of_tests=1_000)

Episode: 2000
Success ratio: 0.912
Average number of steps: 16.052


# Wnioski