In [14]:
import gymnasium as gym
import numpy as np
import tqdm

In [15]:
# Create the Taxi-v3 environment; no render mode is requested for this instance.
env = gym.make("Taxi-v3")

In [16]:
# Reset once to inspect what the environment hands back: the encoded state
# and the accompanying info dictionary.
observation, info = env.reset()
print(f'observation: {observation}', f'info: {info}', sep='\n')

observation: 404
info: {'prob': 1.0, 'action_mask': array([0, 1, 0, 0, 0, 0], dtype=int8)}


In [17]:
# Cardinalities of the discrete action and observation spaces; these are the
# two dimensions of the Q-table allocated from them.
n_actions = env.action_space.n
n_observations = env.observation_space.n

print(
    f'Available actions: {n_actions}',
    f'Available observations: {n_observations}',
    sep='\n',
)

Available actions: 6
Available observations: 500


In [18]:
# Q-table: one row per observation, one column per action, initialised to zero.
Q_matrix = np.zeros(shape=(n_observations, n_actions), dtype=float)

In [None]:
# Q-learning hyperparameters consumed by the training loop.
# Weight given to the new estimate in each Q-value update.
learning_rate = 0.4
# Multiplier applied to the best next-state value when forming the update target.
discount_factor = 0.6
# Probability of taking a random action. NOTE: the training loop overwrites this
# after every episode with max(min_epsilon, exp(-decay_rate * episode_index)),
# which equals 1.0 at index 0, so this starting value only affects the first episode.
epsilon = 0.7
# Lower bound applied to epsilon by the decay schedule.
min_epsilon = 0.2
# Per-episode rate used in the exponential epsilon decay.
decay_rate = 1e-4

In [20]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads: env, Q_matrix, learning_rate, discount_factor, epsilon, min_epsilon, decay_rate.
# Writes: Q_matrix (updated in place) and epsilon (re-assigned after every episode).
n_training_episodes = 10000

for episode_idx in tqdm.tqdm(range(n_training_episodes)):
    current_state, _ = env.reset()
    done = False

    while not done:
        # Explore with probability epsilon, otherwise act greedily on the table.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_matrix[current_state])

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode must stop on either flag; looking only at `terminated`
        # would keep stepping an environment whose step limit already expired.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Blend the previous estimate with the one-step bootstrapped target.
        best_next_value = np.max(Q_matrix[next_state])
        td_target = reward + discount_factor * best_next_value
        Q_matrix[current_state, action] = (
            (1.0 - learning_rate) * Q_matrix[current_state, action]
            + learning_rate * td_target
        )
        current_state = next_state

    # Exponential decay floored at min_epsilon. At episode_idx == 0 this is
    # exp(0) == 1.0, so the schedule effectively starts from 1.0 whatever the
    # value epsilon had before the loop.
    epsilon = max(min_epsilon, np.exp(-decay_rate * episode_idx))

print('Training ended with Q matrix:')
print(Q_matrix)

100%|██████████| 10000/10000 [00:10<00:00, 995.60it/s]

Training ended with Q matrix:
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.87195709  -2.67422442  -2.87195709  -2.67422442  -2.39174917
  -11.67422442]
 [ -1.411733    -0.58819     -1.411733    -0.58819      0.5883
   -9.58819   ]
 ...
 [  0.5883       2.269        0.5883      -0.58819     -8.4117
   -8.4117    ]
 [ -2.39174917  -1.9882131   -2.39174917  -1.9882131  -11.39174917
  -11.39174917]
 [  8.1          4.67         8.1         13.          -0.9
   -0.9       ]]





In [None]:
# Evaluate the learned table by acting greedily (no exploration) for a few
# rendered episodes, then report the mean episode length and mean return.
episodes = 10
total_reward = 0
total_step = 0
env = gym.make("Taxi-v3", render_mode="human")

try:
    for _ in tqdm.tqdm(range(episodes)):
        done = False
        episode_reward = 0
        episode_step = 0
        current_state, _ = env.reset()

        while not done:
            action = np.argmax(Q_matrix[current_state])
            # Stop on termination OR truncation: a greedy policy that cycles
            # never terminates on its own, so ignoring `truncated` can hang here.
            current_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            episode_step += 1

        total_step += episode_step
        total_reward += episode_reward
finally:
    # Always release the environment, even if evaluation is interrupted.
    env.close()

avg_steps = total_step / episodes
avg_rewards = total_reward / episodes

print(f"Nombre moyen d'étapes: {avg_steps:.2f}")
print(f"Récompense moyenne: {avg_rewards:.2f}")

100%|██████████| 10/10 [00:36<00:00,  3.70s/it]

Nombre moyen d'étapes: 13.70
Récompense moyenne: 7.30





: 