In [3]:
"""
frozenlake_qlearning.py
Q-learning for FrozenLake-v1
Requirements: gym, numpy
Run: python frozenlake_qlearning.py
"""

import gymnasium as gym
import numpy as np
import random

def q_learning_frozenlake(env_name="FrozenLake-v1", render_mode="rgb_array",is_slippery=False,
                          episodes=5000, max_steps=100, alpha=0.8, gamma=0.95,
                          epsilon=1.0, min_epsilon=0.01, decay=0.999):
    # Create env with desired slipperiness
    env = gym.make(env_name, is_slippery=is_slippery)
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    Q = np.zeros((n_states, n_actions))
    rewards_all = []

    for ep in range(episodes):
        state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
        total_rewards = 0
        for step in range(max_steps):
            # epsilon-greedy
            if random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            out = env.step(action)
            if len(out) == 5:  # gym >=0.26 returns (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = out
                done = terminated or truncated
            else:
                next_state, reward, done, _ = out

            # Q-learning update
            Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            total_rewards += reward

            if done:
                break

        # decay epsilon
        epsilon = max(min_epsilon, epsilon * decay)
        rewards_all.append(total_rewards)

        # logging (simple)
        if (ep+1) % 500 == 0:
            avg = np.mean(rewards_all[-500:])
            print(f"Episode {ep+1}/{episodes} - Avg reward last 500: {avg:.3f} - Epsilon: {epsilon:.3f}")

    return Q, rewards_all, env

def extract_policy(Q):
    return np.argmax(Q, axis=1)

def render_policy(env, policy):
    # mapping for FrozenLake: 0=Left,1=Down,2=Right,3=Up
    action_map = {0:'<', 1:'v', 2:'>', 3:'^'}
    size = int(np.sqrt(env.observation_space.n))
    grid = []
    for s in range(env.observation_space.n):
        grid.append(action_map[policy[s]])
    for row in range(size):
        print(' '.join(grid[row*size:(row+1)*size]))

if __name__ == "__main__":
    Q, rewards, env = q_learning_frozenlake(is_slippery=True, episodes=5000)
    policy = extract_policy(Q)
    print("\nLearned policy (symbols: < left, v down, > right, ^ up):")
    render_policy(env, policy)

    print("\nExample run using the learned policy (rendered):")
    state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
    env.render()
    done = False
    total = 0
    steps = 0
    while not done and steps < 100:
        action = int(policy[state])
        out = env.step(action)
        if len(out) == 5:
            state, reward, terminated, truncated, _ = out
            done = terminated or truncated
        else:
            state, reward, done, _ = out
        total += reward
        steps += 1
        env.render()
    print("Total reward from single policy run:", total)
np.save("qtable_frozenlake.npy", Q)


Episode 500/5000 - Avg reward last 500: 0.036 - Epsilon: 0.606
Episode 1000/5000 - Avg reward last 500: 0.030 - Epsilon: 0.368
Episode 1500/5000 - Avg reward last 500: 0.066 - Epsilon: 0.223
Episode 2000/5000 - Avg reward last 500: 0.120 - Epsilon: 0.135
Episode 2500/5000 - Avg reward last 500: 0.142 - Epsilon: 0.082
Episode 3000/5000 - Avg reward last 500: 0.270 - Epsilon: 0.050
Episode 3500/5000 - Avg reward last 500: 0.324 - Epsilon: 0.030
Episode 4000/5000 - Avg reward last 500: 0.370 - Epsilon: 0.018
Episode 4500/5000 - Avg reward last 500: 0.558 - Epsilon: 0.011
Episode 5000/5000 - Avg reward last 500: 0.484 - Epsilon: 0.010

Learned policy (symbols: < left, v down, > right, ^ up):
< ^ < ^
< < < <
^ v < <
< > > <

Example run using the learned policy (rendered):
Total reward from single policy run: 1.0


In [1]:
"""
load_frozenlake.py
Loads trained Q-table for FrozenLake and renders a rollout.
Requirements: gym, numpy
Run: python load_frozenlake.py
"""

import gymnasium as gym
import numpy as np

def run_frozenlake(qtable_file="qtable_frozenlake.npy", episodes=5, max_steps=100):
    # Load Q-table
    Q = np.load(qtable_file)
    
    # Create environment
    env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="human")

    for ep in range(episodes):
        print(f"\nEpisode {ep+1}")
        state = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done and steps < max_steps:
            action = int(np.argmax(Q[state]))  # greedy action
            out = env.step(action)

            if len(out) == 5:
                state, reward, terminated, truncated, _ = out
                done = terminated or truncated
            else:
                state, reward, done, _ = out

            total_reward += reward
            steps += 1

        print(f"Total reward: {total_reward}")
    
    env.close()

if __name__ == "__main__":
    run_frozenlake()



Episode 1
Total reward: 0.0

Episode 2
Total reward: 1.0

Episode 3
Total reward: 1.0

Episode 4
Total reward: 1.0

Episode 5
Total reward: 0.0
