In [1]:
import gymnasium as gym
import numpy as np
import tqdm

In [2]:
# Create the Taxi-v3 environment shared by the cells below.
# No render_mode is passed here, so nothing is drawn while training.
env = gym.make("Taxi-v3")

In [3]:
# Start a first episode and show what reset() hands back:
# the initial observation and the accompanying info dict.
observation, info = env.reset()
print(f'observation: {observation}', f'info: {info}', sep='\n')

observation: 434
info: {'prob': 1.0, 'action_mask': array([0, 1, 1, 0, 0, 0], dtype=int8)}


In [4]:
# Sizes of the discrete observation and action spaces; they give the
# Q-table its number of rows and columns respectively.
n_observations, n_actions = env.observation_space.n, env.action_space.n

print(
    f'Available actions: {n_actions}',
    f'Available observations: {n_observations}',
    sep='\n',
)

Available actions: 6
Available observations: 500


In [5]:
# Q-table: one row per observation, one column per action, all estimates start at 0.
Q_matrix = np.zeros(shape=(n_observations, n_actions), dtype=np.float64)

In [6]:
# Weights used by the Q-value update in the training loop:
# learning_rate blends the old estimate with the new target,
# discount_factor scales the best next-state value inside that target.
learning_rate, discount_factor = 0.4, 0.6

# Epsilon-greedy exploration settings read by the training loop:
# probability of a random action, its lower bound, and the per-episode decay rate.
epsilon, min_epsilon = 0.7, 0.2
decay_rate = 1e-4

In [7]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads `env`, `Q_matrix` and the hyperparameters defined in earlier cells,
# updates `Q_matrix` in place and rebinds `epsilon` after every episode.
for e in tqdm.tqdm(range(10000)):
    current_state, _ = env.reset()
    done = False

    while not done:
        # Explore with probability epsilon, otherwise act greedily on the current estimates.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_matrix[current_state])

        # step() returns two separate end-of-episode flags. The episode must stop
        # on either of them; looking only at `terminated` ignores the time limit
        # and keeps stepping an environment that has already asked for a reset.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Blend the previous estimate with the one-step target
        # reward + discount_factor * (best value available from next_state).
        td_target = reward + discount_factor * np.max(Q_matrix[next_state])
        Q_matrix[current_state, action] = (
            (1.0 - learning_rate) * Q_matrix[current_state, action]
            + learning_rate * td_target
        )
        current_state = next_state

    # NOTE(review): this schedule does not depend on the previous value of `epsilon`,
    # so the configured starting value only applies to episode 0 (exp(0) = 1.0 is
    # used right after it). With decay_rate = 1e-4 and 10000 episodes the
    # exponential also stays above min_epsilon. Confirm this is the intended schedule.
    epsilon = max(min_epsilon, np.exp(-decay_rate * e))

print('Training ended with Q matrix:')
print(Q_matrix)

100%|██████████| 10000/10000 [00:10<00:00, 955.60it/s]

Training ended with Q matrix:
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.41837066  -2.3639511   -2.41837066  -2.3639511   -2.27325184
  -11.3639511 ]
 [ -1.870144    -1.45024     -1.870144    -1.45024     -0.7504
  -10.45024   ]
 ...
 [ -0.7504       0.416       -0.7504      -1.45024     -9.7504
   -9.7504    ]
 [ -2.27325184  -2.1220864   -2.27325184  -2.1220864  -11.27325184
  -11.27325184]
 [  5.6          2.36         5.6         11.          -3.4
   -3.4       ]]





In [8]:
# Greedy evaluation of the learned Q-table over a few rendered episodes.
# Reports the average episode length and the average cumulated reward.
episodes = 10
total_reward = 0
total_step = 0
env = gym.make("Taxi-v3", render_mode="human")

try:
    for _ in tqdm.tqdm(range(episodes)):
        done = False
        episode_reward = 0
        episode_step = 0
        current_state, _ = env.reset()

        while not done:
            # Always pick the action with the highest learned value (no exploration).
            action = np.argmax(Q_matrix[current_state])
            current_state, reward, terminated, truncated, _ = env.step(action)
            # A purely greedy policy can cycle between states without ever reaching
            # a terminal one. Stopping on `truncated` as well lets the environment's
            # time limit end such an episode instead of looping forever.
            done = terminated or truncated
            episode_reward += reward
            episode_step += 1

        total_step += episode_step
        total_reward += episode_reward
finally:
    # Release the rendering window even if the run is interrupted or fails.
    env.close()

avg_steps = total_step / episodes
avg_rewards = total_reward / episodes

print(f"Nombre moyen d'étapes: {avg_steps:.2f}")
print(f"Récompense moyenne: {avg_rewards:.2f}")

 40%|████      | 4/10 [00:13<00:19,  3.27s/it]

: 