In [7]:
"""
taxi_qlearning.py
Q-learning for Taxi-v3
Requirements: gym, numpy
Run: python taxi_qlearning.py
"""

import gymnasium as gym
import numpy as np
import random

def q_learning_taxi(env_name="Taxi-v3",render_mode="rgb_array", episodes=20000, max_steps=200,
                    alpha=0.7, gamma=0.95, epsilon=1.0, min_epsilon=0.01, decay=0.9995):
    env = gym.make(env_name)
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    Q = np.zeros((n_states, n_actions))
    rewards_all = []

    for ep in range(episodes):
        state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
        total_reward = 0
        for step in range(max_steps):
            if random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            out = env.step(action)
            if len(out) == 5:
                next_state, reward, terminated, truncated, _ = out
                done = terminated or truncated
            else:
                next_state, reward, done, _ = out

            Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            total_reward += reward
            if done:
                break

        rewards_all.append(total_reward)
        epsilon = max(min_epsilon, epsilon * decay)

        if (ep+1) % 2000 == 0:
            avg = np.mean(rewards_all[-2000:])
            print(f"Episode {ep+1}/{episodes} - Avg reward last 2000: {avg:.3f} - Epsilon: {epsilon:.4f}")

    return Q, rewards_all, env

def evaluate_policy(env, Q, episodes=100):
    total_rewards = 0
    successes = 0
    for _ in range(episodes):
        state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
        done = False
        ep_reward = 0
        steps = 0
        while not done and steps < 200:
            action = int(np.argmax(Q[state]))
            out = env.step(action)
            if len(out) == 5:
                state, reward, terminated, truncated, _ = out
                done = terminated or truncated
            else:
                state, reward, done, _ = out
            ep_reward += reward
            steps += 1
        total_rewards += ep_reward
        if ep_reward >= 20:  # heuristic: successful if it delivered taxi (positive cumulative)
            successes += 1

    print(f"Avg reward over {episodes} episodes: {total_rewards/episodes:.3f}, successes: {successes}/{episodes}")

if __name__ == "__main__":
    Q, rewards, env = q_learning_taxi(episodes=20000)
    print("\nEvaluating learned policy:")
    evaluate_policy(env, Q, episodes=200)
    print("\nSample rollout (rendered):")
    state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
    env.render()
    done = False
    steps = 0
    while not done and steps < 200:
        action = int(np.argmax(Q[state]))
        out = env.step(action)
        if len(out) == 5:
            state, reward, terminated, truncated, _ = out
            done = terminated or truncated
        else:
            state, reward, done, _ = out
        env.render()
        steps += 1
np.save("qtable_taxi.npy", Q)


Episode 2000/20000 - Avg reward last 2000: -191.070 - Epsilon: 0.3678
Episode 4000/20000 - Avg reward last 2000: -8.142 - Epsilon: 0.1353
Episode 6000/20000 - Avg reward last 2000: 3.137 - Epsilon: 0.0497
Episode 8000/20000 - Avg reward last 2000: 6.418 - Epsilon: 0.0183
Episode 10000/20000 - Avg reward last 2000: 7.255 - Epsilon: 0.0100
Episode 12000/20000 - Avg reward last 2000: 7.540 - Epsilon: 0.0100
Episode 14000/20000 - Avg reward last 2000: 7.276 - Epsilon: 0.0100
Episode 16000/20000 - Avg reward last 2000: 7.561 - Epsilon: 0.0100
Episode 18000/20000 - Avg reward last 2000: 7.529 - Epsilon: 0.0100
Episode 20000/20000 - Avg reward last 2000: 7.439 - Epsilon: 0.0100

Evaluating learned policy:
Avg reward over 200 episodes: 7.950, successes: 0/200

Sample rollout (rendered):


In [2]:
"""
load_taxi.py
Loads trained Q-table for Taxi-v3 and renders a rollout.
Requirements: gym, numpy
Run: python load_taxi.py
"""

import gymnasium as gym
import numpy as np

def run_taxi(qtable_file="qtable_taxi.npy", episodes=5, max_steps=200):
    # Load Q-table
    Q = np.load("qtable_taxi.npy")
    
    # Create environment
    env = gym.make("Taxi-v3", render_mode="human")

    for ep in range(episodes):
        print(f"\nEpisode {ep+1}")
        state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done and steps < max_steps:
            action = int(np.argmax(Q[state]))  # greedy action
            out = env.step(action)

            if len(out) == 5:
                state, reward, terminated, truncated, _ = out
                done = terminated or truncated
            else:
                state, reward, done, _ = out

            total_reward += reward
            steps += 1

        print(f"Total reward: {total_reward}")
    
    env.close()

if __name__ == "__main__":
    run_taxi()



Episode 1
Total reward: 9

Episode 2
Total reward: 8

Episode 3
Total reward: 5

Episode 4
Total reward: 7

Episode 5
Total reward: 5
