In [10]:
import random
import gymnasium as gym
import numpy as np

In [11]:
# Environment used for training; no render_mode is requested here.
env = gym.make(id="Taxi-v3")

In [12]:
# --- Q-learning update weights ---
alpha = 0.9   # learning rate: weight given to the new one-step target
gamma = 0.95  # discount factor applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
epsilon = 1.0           # starting probability of taking a random action
epsilon_decay = 0.9995  # multiplied into epsilon once per training episode
min_epsilon = 0.01      # floor that epsilon is never decayed below

# --- Loop bounds ---
num_episodes = 10_000  # number of training episodes
max_steps = 100        # cap on steps taken within a single episode

In [13]:
# Action-value table, zero-initialised: one row per discrete state and one
# column per action.
# Taxi-v3 state count: 5x5 grid -> 25 taxi positions * 5 passenger locations
# * 4 destinations = 500 states.
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)

In [14]:
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A uniform random draw is compared against the module-level ``epsilon``.
    When the draw is below ``epsilon`` a random action is sampled from
    ``env.action_space``; otherwise the index of the largest entry in
    ``q_table[state]`` is returned (``np.argmax`` picks the first maximum
    on ties).
    """
    explore = random.random() < epsilon
    if not explore:
        return np.argmax(q_table[state])
    return env.action_space.sample()

In [15]:
# Tabular Q-learning: run `num_episodes` episodes of at most `max_steps` steps
# each, updating `q_table` in place after every transition.
for episode in range(num_episodes):
    state, _ = env.reset()

    done = False

    for step in range(max_steps):
        # Epsilon-greedy action for the current state.
        action = choose_action(state)

        next_state, reward, done, truncated, info = env.step(action)

        old_value = q_table[state, action]

        # Greedy estimate of the value of the state we just landed in.
        next_max = np.max(q_table[next_state, :])

        # Blend the old estimate with the one-step target, weighted by alpha.
        q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        state = next_state

        # Stop on truncation as well as termination: once either flag is set
        # the environment has to be reset before it is stepped again.
        if done or truncated:
            break

    # Decay exploration once per episode, never dropping below min_epsilon.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

In [16]:
# Close the training environment before rebinding `env`; otherwise that
# instance is dropped without ever being closed.
env.close()

# Same task, recreated with render_mode='human' for watching the learned policy.
env = gym.make('Taxi-v3', render_mode='human')

In [17]:
# Greedy rollout of the learned Q-table for five rendered episodes.
for episode in range(5):
    state, _ = env.reset()
    done = False

    print('Episode', episode)

    for step in range(max_steps):
        # No explicit env.render() call: with render_mode='human' Gymnasium
        # renders as part of reset() and step(), so extra calls only redraw
        # the same frame.
        action = np.argmax(q_table[state, :])
        next_state, reward, done, truncated, info = env.step(action)
        state = next_state

        if done or truncated:
            # `reward` is the reward of the final step only, not the episode total.
            print('Finished episode', episode, 'with reward', reward)
            break
    else:
        # The step budget ran out before the environment signalled an end.
        print('Episode', episode, 'did not finish within', max_steps, 'steps')

Episode 0
Finished episode 0 with reward 20
Episode 1
Finished episode 1 with reward 20
Episode 2
Finished episode 2 with reward 20
Episode 3
Finished episode 3 with reward 20
Episode 4
Finished episode 4 with reward 20


In [19]:
# Done with the environment currently bound to `env`; close it.
env.close()