# In [None]:
import numpy as np
import gym

# Create the environment
env = gym.make('Taxi-v3')

# Q-table: one row per discrete state, one column per action, all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Set hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.6  # Discount factor
epsilon = 0.1  # Exploration rate

# Number of training episodes
episodes = 1000

# Q-Learning algorithm
for _episode in range(episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept either shape.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        # Epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            action = np.argmax(Q[state])  # Exploitation

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _info = step_result
            # The episode is over when either flag is set.
            done = terminated or truncated
        else:
            next_state, reward, done, _info = step_result

        # Q-value update: blend the old estimate with the one-step target
        # (reward plus the discounted best value of the next state)
        old_q_value = Q[state, action]
        next_max = np.max(Q[next_state])
        new_q_value = (1 - alpha) * old_q_value + alpha * (reward + gamma * next_max)
        Q[state, action] = new_q_value

        state = next_state

# Show the Q-table produced by training (header line, then the array)
print("Learned Q-table:", Q, sep="\n")

# Evaluate the learned policy by always taking the greedy action
total_rewards = 0
episodes = 100
for _episode in range(episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept either shape.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        action = np.argmax(Q[state])

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            state, reward, terminated, truncated, _info = step_result
            # Honour truncation too: a greedy policy that never reaches the
            # goal would otherwise keep this loop running.
            done = terminated or truncated
        else:
            state, reward, done, _info = step_result
        total_rewards += reward

# Average reward over episodes
average_reward = total_rewards / episodes
print("Average Reward:", average_reward)
