# Zadanie 5

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Problem dostępny jest w pakiecie **gymnasium** (następcy pakietu **gym**).

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]


In [121]:
import numpy as np
import gymnasium as gym
from IPython.display import clear_output
import random

In [122]:
class QLearningSolver:
    """Tabular Q-learning agent for environments with discrete states and actions.

    Q-values are kept in ``q_table``, a 2-D table indexed as
    ``q_table[state][action]``. The hyperparameters are only stored on the
    instance; the training loop that reads them lives outside this class.

    Args:
        observation_space: Number of discrete states (rows of the Q-table).
        action_space: Number of discrete actions (columns of the Q-table).
        learning_rate: Step size stored for the training loop.
        gamma: Discount factor stored for the training loop.
        epsilon: Exploration probability stored for the training loop.
        q_table: Optional pre-filled Q-table. It is used as-is (not copied),
            so later updates mutate the caller's array. When omitted, a
            zero-filled table of shape ``(observation_space, action_space)``
            is created.
    """

    def __init__(
        self,
        observation_space: int,
        action_space: int,
        learning_rate: float = 0.9,
        gamma: float = 0.9,
        epsilon: float = 0.1,
        q_table: np.ndarray = None
    ):
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        if q_table is None:
            self.q_table = np.zeros(shape=(observation_space, action_space))
        else:
            self.q_table = q_table

    def __call__(self, state: int, action: int) -> float:
        """Return the stored Q-value for the given state and action."""
        return self.q_table[state][action]

    def update(self, state: int, action: int, reward: float) -> None:
        """Add ``reward`` to the stored Q-value of the given state and action.

        The value is an increment, not a replacement: the caller is expected
        to pass the already scaled correction (e.g. learning rate times the
        temporal-difference error).
        """
        self.q_table[state][action] += reward

    def get_best_action(self, state: int) -> int:
        """Return the action with the highest Q-value in ``state``.

        Ties are resolved in favour of the lowest action index (``np.argmax``).
        """
        # Convert NumPy's integer scalar to a plain int to honour the annotation.
        return int(np.argmax(self.q_table[state]))

    def get_best_move_evaluation(self, state: int) -> float:
        """Return the highest Q-value available in ``state``."""
        return np.max(self.q_table[state])

    def __repr__(self):
        """Return a one-line summary of the table dimensions and hyperparameters."""
        # Must return a string: returning None makes repr()/str()/print() raise TypeError.
        return (
            f"{type(self).__name__}("
            f"observation_space={self.observation_space}, "
            f"action_space={self.action_space}, "
            f"learning_rate={self.learning_rate}, "
            f"gamma={self.gamma}, "
            f"epsilon={self.epsilon})"
        )

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

In [123]:
def run_episode(solver: "QLearningSolver", environment):
    """Play one training episode and update the solver's Q-table in place.

    Actions are chosen epsilon-greedily: with probability ``solver.epsilon`` a
    random action is sampled from the environment, otherwise the solver's
    current best action is used. After every step the Q-value of the visited
    (state, action) pair is moved towards the one-step Q-learning target

        reward + gamma * max_a Q(next_state, a)

    by a fraction ``solver.learning_rate`` of the difference.

    Args:
        solver: Agent providing ``epsilon``, ``gamma``, ``learning_rate``,
            ``get_best_action``, ``get_best_move_evaluation``, ``update`` and
            Q-value lookup via ``solver(state, action)``.
        environment: Environment whose ``reset()`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
    """
    state = environment.reset()[0]
    terminated, truncated = False, False

    while not terminated and not truncated:
        if np.random.random() < solver.epsilon:
            action = environment.action_space.sample()
        else:
            action = solver.get_best_action(state)

        next_state, reward, terminated, truncated, _ = environment.step(action)
        # A terminal state has no future return, so do not bootstrap from it.
        # This is a no-op for a zero-initialised table whose terminal rows are
        # never updated, but it matters when a pre-filled q_table is supplied.
        # Truncation is only a cut-off, so the estimate of next_state is kept.
        if terminated:
            future_value = 0.0
        else:
            future_value = solver.get_best_move_evaluation(next_state)
        delta = reward + solver.gamma * future_value - solver(state, action)
        solver.update(state, action, solver.learning_rate * delta)
        state = next_state

def q_learning(environment, learning_rate = 0.1, epsilon = 0.1, gamma = 0.9, number_of_episodes = 10000):
    """Train a fresh QLearningSolver on ``environment`` and return it.

    The Q-table is sized from ``environment.observation_space.n`` and
    ``environment.action_space.n``. One call to ``run_episode`` is made per
    episode; after every 100th episode the cell output is cleared and the
    running episode count is printed.

    Args:
        environment: Environment with discrete observation and action spaces.
        learning_rate: Step size handed to the solver.
        epsilon: Exploration probability handed to the solver.
        gamma: Discount factor handed to the solver.
        number_of_episodes: Number of training episodes to run.

    Returns:
        The trained QLearningSolver.
    """
    n_states = environment.observation_space.n
    n_actions = environment.action_space.n
    solver = QLearningSolver(n_states, n_actions, learning_rate, gamma, epsilon)

    for episodes_done in range(1, number_of_episodes + 1):
        run_episode(solver, environment)
        if episodes_done % 100 != 0:
            continue
        # Progress report: replace the previous counter instead of appending a line.
        clear_output(wait=True)
        print(f"Episode: {episodes_done}")

    return solver

In [124]:
def test_solver(solver: QLearningSolver, environment, number_of_tests : int =100, max_steps = 50):
    successes = 0
    total_steps = 0
    for _ in range(number_of_tests):
        state = environment.reset()[0]
        terminated, truncated = False, False
        steps = 0
        while not terminated and not truncated and steps < max_steps:
            action = solver.get_best_action(state)
            next_state, _, terminated, truncated = environment.step(action)[:4]
            state = next_state
            steps += 1
        if terminated:
            successes +=1
        total_steps += steps
    print(f"Success ratio: {successes/number_of_tests}\nAverage number of steps: {total_steps/number_of_tests}")
        
        

# Testy

In [125]:
env = gym.make("Taxi-v3")
# Baseline run: learning_rate=0.9, epsilon=0.1, gamma=0.9, 10000 training episodes.
# learning_rate is set explicitly here; q_learning's own default is 0.1.
solver = q_learning(env, learning_rate = 0.9, epsilon = 0.1, gamma = 0.9, number_of_episodes = 10000)
# Persist the learned Q-table to disk (np.save appends ".npy" to the file name).
np.save("solver", solver.q_table)
# Evaluate the greedy policy on 1000 episodes (test_solver's default cap is 50 steps each).
test_solver(solver, env, number_of_tests=1000)

Episode: 10000
Success ratio: 1.0
Average number of steps: 13.123


In [126]:
# Discount-factor experiment: gamma=0.9, with q_learning's defaults learning_rate=0.1
# and epsilon=0.1; 4000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, gamma=0.9, number_of_episodes=4000)
test_solver(solver, env, number_of_tests=1000)

Episode: 4000
Success ratio: 0.998
Average number of steps: 13.155


In [127]:
# Discount-factor experiment: gamma=0.8, with q_learning's defaults learning_rate=0.1
# and epsilon=0.1; 4000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, gamma=0.8, number_of_episodes=4000)
test_solver(solver, env, number_of_tests=1000)

Episode: 4000
Success ratio: 0.934
Average number of steps: 15.366


In [128]:
# Discount-factor experiment: gamma=0.7, with q_learning's defaults learning_rate=0.1
# and epsilon=0.1; 4000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, gamma=0.7, number_of_episodes=4000)
test_solver(solver, env, number_of_tests=1000)

Episode: 4000
Success ratio: 0.766
Average number of steps: 21.186


In [129]:
# Discount-factor experiment: gamma=0.5, with q_learning's defaults learning_rate=0.1
# and epsilon=0.1; 4000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, gamma=0.5, number_of_episodes=4000)
test_solver(solver, env, number_of_tests=1000)

Episode: 4000
Success ratio: 0.543
Average number of steps: 29.307


In [130]:
# Discount-factor experiment: gamma=0.1, with q_learning's defaults learning_rate=0.1
# and epsilon=0.1; 4000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, gamma=0.1, number_of_episodes=4000)
test_solver(solver, env, number_of_tests=1000)

Episode: 4000
Success ratio: 0.314
Average number of steps: 37.937


In [131]:
# Exploration-rate experiment: epsilon=0.1, with q_learning's defaults learning_rate=0.1
# and gamma=0.9; 2000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.1, number_of_episodes=2000)
test_solver(solver, env, number_of_tests=1000)

Episode: 2000
Success ratio: 0.83
Average number of steps: 18.901


In [132]:
# Exploration-rate experiment: epsilon=0.2, with q_learning's defaults learning_rate=0.1
# and gamma=0.9; 2000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.2, number_of_episodes=2000)
test_solver(solver, env, number_of_tests=1000)

Episode: 2000
Success ratio: 0.846
Average number of steps: 18.302


In [133]:
# Exploration-rate experiment: epsilon=0.3, with q_learning's defaults learning_rate=0.1
# and gamma=0.9; 2000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.3, number_of_episodes=2000)
test_solver(solver, env, number_of_tests=1000)

Episode: 2000
Success ratio: 0.863
Average number of steps: 17.725


In [144]:
# Exploration-rate experiment: epsilon=0.7, with q_learning's defaults learning_rate=0.1
# and gamma=0.9; 2000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.7, number_of_episodes=2000)
test_solver(solver, env, number_of_tests=1000)

Episode: 2000
Success ratio: 0.92
Average number of steps: 15.724


In [135]:
# Exploration-rate experiment: epsilon=0.9, with q_learning's defaults learning_rate=0.1
# and gamma=0.9; 2000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.9, number_of_episodes=2000)
test_solver(solver, env, number_of_tests=1000)

Episode: 2000
Success ratio: 1.0
Average number of steps: 13.117


In [136]:
# Short-training comparison: epsilon=0.9 with only 800 training episodes
# (q_learning's defaults learning_rate=0.1 and gamma=0.9), then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.9, number_of_episodes=800)
test_solver(solver, env, number_of_tests=1000)

Episode: 800
Success ratio: 0.319
Average number of steps: 37.408


In [137]:
# Short-training comparison: epsilon=0.1 with only 800 training episodes
# (q_learning's defaults learning_rate=0.1 and gamma=0.9), then 1000 greedy evaluation episodes.
solver = q_learning(env, epsilon=0.1, number_of_episodes=800)
test_solver(solver, env, number_of_tests=1000)

Episode: 800
Success ratio: 0.253
Average number of steps: 40.055


In [138]:
# Learning-rate experiment: learning_rate=0.1, with q_learning's defaults epsilon=0.1
# and gamma=0.9; 1000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, learning_rate=0.1, number_of_episodes=1000)
test_solver(solver, env, number_of_tests=1000)

Episode: 1000
Success ratio: 0.373
Average number of steps: 35.487


In [139]:
# Learning-rate experiment: learning_rate=0.3, with q_learning's defaults epsilon=0.1
# and gamma=0.9; 1000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, learning_rate=0.3, number_of_episodes=1000)
test_solver(solver, env, number_of_tests=1000)

Episode: 1000
Success ratio: 0.896
Average number of steps: 16.64


In [140]:
# Learning-rate experiment: learning_rate=0.5, with q_learning's defaults epsilon=0.1
# and gamma=0.9; 1000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, learning_rate=0.5, number_of_episodes=1000)
test_solver(solver, env, number_of_tests=1000)

Episode: 1000
Success ratio: 0.95
Average number of steps: 14.858


In [141]:
# Learning-rate experiment: learning_rate=0.7, with q_learning's defaults epsilon=0.1
# and gamma=0.9; 1000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, learning_rate=0.7, number_of_episodes=1000)
test_solver(solver, env, number_of_tests=1000)

Episode: 1000
Success ratio: 0.973
Average number of steps: 13.992


In [143]:
# Learning-rate experiment: learning_rate=0.9, with q_learning's defaults epsilon=0.1
# and gamma=0.9; 1000 training episodes, then 1000 greedy evaluation episodes.
solver = q_learning(env, learning_rate=0.9, number_of_episodes=1000)
test_solver(solver, env, number_of_tests=1000)

Episode: 1000
Success ratio: 0.958
Average number of steps: 14.558


# Wnioski