In [None]:
import numpy as np
import gymnasium as gym

env = gym.make("Taxi-v3")
env.reset()

(43, {'prob': 1.0, 'action_mask': array([1, 0, 1, 0, 0, 0], dtype=int8)})

In [109]:
n_states  = env.observation_space.n   # 500
n_actions = env.action_space.n        # 6
print(env.observation_space.n)
print(env.action_space.n)
q_table   = np.zeros((n_states, n_actions))

500
6


In [110]:
alpha   = 0.1    # learning rate
gamma   = 0.6    # discount factor
epsilon = 0.1    # exploration rate
episodes = 10000

In [115]:
total_timesteps = []
total_penalties = []
total_rewards = []


In [116]:

for ep in range(episodes):
    state,_ = env.reset()
    done = False
    
    timesteps=0
    penalties=0
    total_reward=0

    while not done:
        # ε-greedy action selection
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() #explore action space
        else:
            action = np.argmax(q_table[state]) #use learned values

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated
        total_reward += reward

        if reward == -10:
            penalties += 1

        # Q-learning update
        old_value = q_table[state, action]
        next_max  = np.max(q_table[next_state])
        q_table[state, action] = old_value + alpha * (reward + gamma * next_max - old_value)

        state = next_state
        timesteps += 1

    # record
    total_timesteps.append(timesteps)
    total_penalties.append(penalties)
    total_rewards.append(total_reward)

In [117]:
avg_timesteps = sum(total_timesteps) / episodes
avg_penalties = sum(total_penalties) / episodes
avg_rewards   = sum(total_rewards) / episodes

# Display results
print("Results after", episodes, "episodes:")
print(f"Average timesteps per episode: {avg_timesteps:.2f}")
print(f"Average penalties per episode: {avg_penalties:.2f}")
print(f"Average reward per episode:    {avg_rewards:.2f}")

Results after 10000 episodes:
Average timesteps per episode: 14.80
Average penalties per episode: 0.44
Average reward per episode:    2.20
