In [1]:
import gymnasium as gym
import numpy as np
import random

In [2]:
#setting up in the custom maze enviorment
desc=["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]

env = gym.make('FrozenLake-v1', desc=desc, map_name="5x4", is_slippery=False, render_mode="human") 
observation, info = env.reset()

In [3]:
# Define parameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Epsilon-greedy parameter
num_episodes = 10000

#this will show how many times the elf coompleted the maze and how many times it hit the wall and lake
completion_num = 0
into_wall = 0
into_lake = 0

In [4]:
# Initialize Q-values
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# Monte Carlo Control
for episode in range(num_episodes):
    state = env.reset()[0]
    done = False
    steps = []  # Renamed from episode_steps
    
    # Generate episode
    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Random action
        else:
            ind = 0
            maxIndices = []
            maxNum = -1
            for num in Q[state]:
                if num > maxNum:
                    maxIndices = [ind]
                    maxNum = num
                elif num == maxNum:
                    maxIndices.append(ind)
                ind += 1
                
            maxInd = -1
            if len(maxIndices) >= 1:
                maxInd = maxIndices[random.randint(0, len(maxIndices) - 1)]
                
            if maxInd == -1:
                maxInd = env.action_space.sample()
            action = maxInd  # Greedy action
        
        # Take action and observe next state and reward
        steps.append((state, action))  # Appending state-action pair to steps
        step = env.step(action)
        next_state = step[0]
        reward = step[1]
        done = step[2]
        
        if reward == 0 and done:
            print("Hit lake at episode: "+ str(episode + 1) +" :(")
            reward = -1
            into_wall += 1
        elif next_state == state:
            print("ran into wall")
            reward = -1
            into_lake += 1
        
        # Move to next state
        state = next_state
    
    # Calculate return for the episode
    returns = 0
    for t in reversed(range(len(steps))):
        state, action = steps[t]
        returns = gamma * returns + reward
    
    # Update Q-values at the end of the episode
    for t in reversed(range(len(steps))):
        state, action = steps[t]
        Q[state, action] += alpha * (returns - Q[state, action])

    if reward == 1:
        print("Hit gift! At Episode: "+str(episode +1))
        print(Q)
        print("Steps: ")
        print(steps)
        completion_num += 1

ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
Hit lake at episode: 1 :(
Hit lake at episode: 2 :(
ran into wall
ran into wall
ran into wall
Hit lake at episode: 3 :(
ran into wall
ran into wall
ran into wall
ran into wall
Hit lake at episode: 4 :(
Hit lake at episode: 5 :(
Hit lake at episode: 6 :(
ran into wall
Hit lake at episode: 7 :(
ran into wall
Hit lake at episode: 8 :(
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
Hit lake at episode: 9 :(
ran into wall
ran into wall
ran into wall
ran into wall
Hit lake at episode: 10 :(
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
ran into wall
Hit lake at episode: 11 :(
ran into wall
ran into wall
ran into wall
Hit lake at episode: 12 :(
ran into wall
Hit lake at episode: 13 :(
ran into wall


KeyboardInterrupt: 

In [13]:
# Optimal policy
optimal_policy = np.argmax(Q, axis=1)

print("Optimal policy:")
print(optimal_policy)

#prints out how many time it hit the lake, wall, or has completed
print("# of episode completed:" + str(episode + 1))
print("# of times elf completed maze: " + str(completion_num))
print("# of times elf hit the wall: " + str(into_lake)) #I accidental mixed up the two variables
print("# of times elf went into the lake: " + str(into_wall))
print("Completion rate: %" + str(completion_num/(episode + 1)*100))

Optimal policy:
[1 3 0 1 1 0 0 0 2 2 2 1 0 1 0 1 1 2 0 1]
# of episode completed:3580
# of times elf completed maze: 2063
# of times elf hit the wall: 10960
# of times elf went into the lake: 1516
Completion rate: %57.625698324022345


In [14]:
env.close()