In [1]:
# Import libraries
import numpy as np
import gym

In [3]:
# Initialize Parameters

# Hyperparameters
epsilon = 1.0
min_epsilon = 0.01
epsilon_decay = 0.001
alpha = 0.1  # Learning rate
gamma = 1.0  # Discount factor
episodes = 2000
max_actions = 100  # Maximum steps per episode

# Initialize environment
env = gym.make("FrozenLake-v1", is_slippery=False)  # Deterministic for simplicity
num_states = env.observation_space.n
num_actions = env.action_space.n
q_table = np.zeros((num_states, num_actions))

In [4]:
# Policy and Update Function

# Epsilon-greedy policy
def epsilon_greedy(state):
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Explore
    return np.argmax(q_table[state])  # Exploit

# Q-learning update rule
def q_learning_update(state, action, reward, next_state):
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

In [5]:
# Training Loop

episode_returns = []

global epsilon  # Required to modify global epsilon

for episode in range(episodes):
    state, _ = env.reset()
    total_reward = 0
    for _ in range(max_actions):
        action = epsilon_greedy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        q_learning_update(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        if terminated:
            break
    episode_returns.append(total_reward)
    epsilon = max(min_epsilon, epsilon * (1 - epsilon_decay))

AttributeError: module 'numpy' has no attribute 'bool8'

In [6]:
# Policy Extraction

policy = {state: np.argmax(q_table[state]) for state in range(num_states)}
print("Learned Policy:")
print(policy)


Learned Policy:
{0: np.int64(0), 1: np.int64(0), 2: np.int64(0), 3: np.int64(0), 4: np.int64(0), 5: np.int64(0), 6: np.int64(0), 7: np.int64(0), 8: np.int64(0), 9: np.int64(0), 10: np.int64(0), 11: np.int64(0), 12: np.int64(0), 13: np.int64(0), 14: np.int64(0), 15: np.int64(0)}


In [7]:
# Test Policy

frames = []
state, _ = env.reset(seed=42)
frames.append(env.render())
episode_total_reward = 0

for _ in range(16):
    action = policy[state]
    state, reward, terminated, truncated, _ = env.step(action)
    episode_total_reward += reward
    frames.append(env.render())
    if terminated:
        break

env.close()
print(f"Total Reward: {episode_total_reward}")

Total Reward: 0.0


  logger.warn(
