In [15]:
import gymnasium as gym
import matplotlib.pyplot as plt 
import time
import numpy as np
from collections import defaultdict
import random

Q = None

In [3]:
# Observation and action space 
env = gym.make("FrozenLake-v1")
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {}".format(obs_space))
print("The action space: {}".format(action_space))

The observation space: Discrete(16)
The action space: Discrete(4)


In [4]:
def Q_train(env, alpha=0.45, gamma=0.8, decay_rate=0.001, episodes=1000):

    # Q Table
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(episodes-1):
        state, _ = env.reset()
        epsilon = np.exp(-decay_rate*episode)
        terminated, truncated = False, False

        while not terminated and not truncated:
            # Explore
            if(random.uniform(0,1) < epsilon): action = env.action_space.sample()
            
            # Exploit
            else: action = np.argmax(Q[state,:])

            next_state, reward, terminated, truncated, info = env.step(action)
            Q[state,action] += alpha*(reward + gamma+(1-int(terminated))*np.max(Q[next_state,:]) - Q[state,action])
            
            state = next_state

    return Q

In [5]:
# Calculate success rate out of 1000 games
def success_rate(env, q, games = 1000):
    wins = 0
    for i in range(games):
        state, _ = env.reset()
        done = False
        rewards = 0
        while(not done):
            action = np.argmax(q[state, :])
            state, reward, terminated, truncated, info = env.step(action) 
            rewards += reward
            done = terminated or truncated

        if(rewards): wins += 1
        

    return wins/games

In [13]:
def grid_search(game_name="FrozenLake-v1"):
    
    env = gym.make("FrozenLake-v1")
    
    sr = {}
    tables = {}

    for alpha in np.arange(0.01, 0.99, 0.05):
        for gamma in np.arange(0.01, 0.99, 0.05):
            for decay_rate in np.arange(0.001, 0.01, 0.005):
                table = Q_train(env, alpha, gamma, decay_rate, 5000)
                parameters = (alpha,gamma,decay_rate)
                tables[parameters] = table
                sr[(alpha,gamma,decay_rate)] = success_rate(env,table,100)

    alpha, gamma, decay = parameters = max(list(sr.keys()), key=lambda x: sr[x])
    
    print("-- Best parameters -- ")
    print("Alpha: ", str(alpha))
    print("Gamma: ", str(gamma))
    print("decay: ", str(decay))
    print("Success rate: ", str(sr[parameters]))
    return parameters, tables[parameters]
    
parameters, Q = grid_search()

-- Best parameters -- 
Alpha:  0.11
Gamma:  0.16000000000000003
decay:  0.006
Success rate:  0.74


In [14]:
with open("frozenQTable.npy","wb") as f:
    np.save(f, Q)

In [16]:
with open("frozenQTable1000.npy","rb") as f:
    Q = np.load(f)

In [18]:
# watch trained agent
env = gym.make("FrozenLake-v1", render_mode='human')

state, _ = env.reset()
state = state
done = False
rewards = 0

while(not done):
    action = np.argmax(Q[state, :])
    
    state, reward, terminated, truncated, info = env.step(action)
    
    rewards += reward
    
    env.render()
    
    time.sleep(0.01)
    done = terminated or truncated

if(rewards): print("Yay !! You got the present")
else: print("You failed -_-")

time.sleep(3)
env.close()

Yay !! You got the present
