In [10]:
import gymnasium as gym
import numpy as np
import random

In [15]:
#setting up in the custom maze enviorment
desc=["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]

env = gym.make('FrozenLake-v1', desc=desc, map_name="5x4", is_slippery=False, render_mode="human") 
observation, info = env.reset()

In [16]:
# Define parameters
gamma = 0.5  # Discount factor
alpha = 0.9  # Learning rate
epsilon = 0.25  # Epsilon-greedy parameter
num_episodes = 1000

#this will show how many times the elf coompleted the maze and how many times it hit the wall and lake
completion_num = 0
into_wall = 0
into_lake = 0

In [17]:
# Initialize Q-values
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# SARSA algorithm
for episode in range(num_episodes):
    state = env.reset()[0]
    done = False
    steps = []
    
    # Choose action using epsilon-greedy policy
    if np.random.rand() < epsilon:
        action = env.action_space.sample()  # Random action
    else:
        action = np.argmax(Q[state])  # Greedy action

    while not done:
        # Take action and observe next state and reward
        steps.append(action)
        step = env.step(action)
        next_state = step[0]
        reward = step[1]
        done = step[2]
        
        if reward == 0 and done:
            print("Hit lake at episode:"+ str(episode + 1))
            reward = -1
            into_lake += 1
        elif next_state == state:
            print("Ran into wall")
            reward = -1
            into_wall += 1

        # Choose next action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            next_action = env.action_space.sample()  # Random action
        else:
            next_action = np.argmax(Q[next_state])  # Greedy action
        
        # Update Q-value using SARSA update rule
        Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])
        
        if reward == 1:
            print("Hit gift! At Episode: " + str(episode + 1))
            print(Q)
            print("Steps: ")
            print(steps)
            completion_num += 1
        
        # Move to next state and action
        state = next_state
        action = next_action


Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Hit lake :C1
Ran into wall
Ran into wall
Ran into wall
Hit lake :C2
Ran into wall
Ran into wall
Ran into wall
Hit lake :C3
Hit lake :C4
Hit lake :C5
Hit lake :C6
Ran into wall
Ran into wall
Ran into wall
Hit lake :C7
Hit lake :C8
Hit lake :C9
Hit lake :C10
Ran into wall
Ran into wall
Hit lake :C11
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Hit lake :C12
Ran into wall
Hit lake :C13
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Hit lake :C14
Ran into wall
Ran into wall
Ran into wall
Hit lake :C15
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Hit lake :C16
Hit lake :C17
Ran into wall
Ran into wall
Ran into wall
Hit lake :C18
Hit lake :C19
Ran into wall
Hit lake :C20
Ran into wall
Hit lake :C21
Ran into wall
Hit lake :C22
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
Ran into wall
H

KeyboardInterrupt: 

In [18]:
# Optimal policy
optimal_policy = np.argmax(Q, axis=1)

print("Optimal policy:")
print(optimal_policy)

#prints out how many time it hit the lake, wall, or has completed
print("# of episode completed:" + str(episode + 1))
print("# of times elf completed maze: " + str(completion_num))
print("# of times elf hit the wall: " + str(into_wall))
print("# of times elf went into the lake: " + str(into_lake))

Optimal policy:
[1 2 0 0 3 0 0 0 2 1 0 0 0 1 0 1 2 2 0 0]
# of episode completed:707
# of times elf completed maze: 222
# of times elf hit the wall: 789
# of times elf went into the lake: 484


In [19]:
env.close()