In [1]:
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

In [5]:
# Build the Taxi-v3 environment. render_mode="ansi" makes env.render() return
# the frame as text, which a later cell prints.
env = gym.make("Taxi-v3", render_mode="ansi")
# reset() starts a new episode and returns an (observation, info) pair; as the
# last expression of the cell, that pair is what gets displayed below.
env.reset()

(346, {'prob': 1.0, 'action_mask': array([1, 1, 0, 1, 0, 0], dtype=int8)})

In [6]:
# Show the current frame. print() is used (instead of a bare expression) so the
# ANSI colour escapes in the returned string are interpreted rather than repr'd.
print(env.render())

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[35mY[0m| : |B: |
+---------+




In [7]:
# Sizes of the discrete action and observation spaces; they define the shape of
# the Q-table. NOTE(review): for Taxi-v3 these should be 6 actions (matching the
# 6-entry action_mask shown above) and 500 states - confirm against the docs.
action_space = env.action_space.n
state_space = env.observation_space.n

In [10]:
# Tabular action-value estimates: one row per state, one column per action,
# all starting at zero.
q_table = np.zeros(shape=(state_space, action_space))

In [9]:
# Q-learning hyperparameters, read by the training loop.
alpha, gamma, epsilon = (
    0.1,  # alpha: learning rate (step size of each Q-value update)
    0.6,  # gamma: discount factor applied to the next state's value
    0.1,  # epsilon: probability of taking a random exploratory action
)

In [11]:
# Train the tabular Q-learning agent with an epsilon-greedy behaviour policy.
# Reads env, alpha, gamma and epsilon; updates q_table in place.
for episode in tqdm(range(100_000)):
    state, _ = env.reset()
    done = False

    while not done:
        # Explore with probability epsilon, otherwise act greedily on the
        # current estimates.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Both end-of-episode flags must be honoured: `terminated` means the
        # task finished, `truncated` means the time limit cut the episode short.
        next_state, reward, terminated, truncated, info = env.step(action)

        # A terminal state has no future return, so only bootstrap from the
        # next state's best value when the episode has not terminated.
        # Truncation is not terminal, so bootstrapping still applies there.
        future_value = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + gamma * future_value
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        state = next_state
        done = terminated or truncated

100%|██████████| 100000/100000 [00:55<00:00, 1794.57it/s]


In [12]:
# Evaluate the learned greedy policy (no exploration) over a fixed number of
# episodes and report the average episode length and penalty count.
total_epoch, total_penalties = 0, 0
episodes = 100

for i in tqdm(range(episodes)):
    state, _ = env.reset()
    epochs, penalties = 0, 0
    done = False

    while not done:
        action = np.argmax(q_table[state])

        # step() returns (obs, reward, terminated, truncated, info). Stopping on
        # truncation as well as termination is essential here: a greedy policy
        # stuck in a cycle never terminates, and would otherwise loop forever.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = next_state

        # NOTE(review): -10 is taken to be Taxi-v3's reward for an illegal
        # pickup/drop-off - confirm against the environment documentation.
        if reward == -10:
            penalties += 1

        epochs += 1

    total_epoch += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epoch / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
        

100%|██████████| 100/100 [00:00<00:00, 1978.45it/s]

Results after 100 episodes:
Average timesteps per episode: 13.23
Average penalties per episode: 0.0



