In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [7]:
# Environment the agent is trained on; no render_mode is requested for this instance.
env = gym.make("FrozenLake-v1")

In [8]:
# The Q-table dimensions are read straight from the environment's spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# One row per state, one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# --- Training length ---
num_episodes = 10_000
max_steps_per_episode = 100

# --- Q-value update: step size and weight on the next state's best estimate ---
learning_rate = 0.1
discount_rate = 0.99

# --- Epsilon-greedy schedule ---
# exploration_rate is the current probability of taking a random action; the
# training loop decays it exponentially from max towards min, one step per episode.
exploration_rate = 1
min_exploration_rate = 0.01
max_exploration_rate = 1
exploration_decay_rate = 0.001

In [9]:
# Total reward collected in each training episode, in episode order
# (the training loop appends one value per episode).
all_rewards = []

## Q-Learning Algorithm

In [10]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# The reward log is (re)created here so that re-running this cell always leaves
# exactly one entry per episode. q_table is NOT reset here, so a re-run keeps
# refining the existing estimates.
all_rewards = []

for episode in range(num_episodes):
    # reset() returns (observation, info); only the observation is needed.
    state, _ = env.reset()

    curr_reward = 0

    for step in range(max_steps_per_episode):
        # Exploit the current estimates with probability (1 - exploration_rate),
        # otherwise explore with a randomly sampled action.
        if random.uniform(0, 1) > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, info = env.step(action)

        # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * td_target

        state = new_state
        curr_reward += reward

        # End the episode on either flag: stepping an environment after it has
        # reported truncation is not valid, even if our own step budget remains.
        if terminated or truncated:
            break

    # Exponentially decay the exploration probability towards its minimum.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    all_rewards.append(curr_reward)

In [11]:
# Show the learned Q-values (rows are states, columns are actions).
q_table

array([[0.57232096, 0.50748187, 0.54253345, 0.53010271],
       [0.2984488 , 0.2917733 , 0.31226968, 0.49767636],
       [0.36985337, 0.24482927, 0.22980683, 0.29546035],
       [0.08196227, 0.11344769, 0.04363203, 0.03158014],
       [0.5925146 , 0.41488256, 0.39575776, 0.30219424],
       [0.        , 0.        , 0.        , 0.        ],
       [0.31145995, 0.15869079, 0.20555795, 0.12560396],
       [0.        , 0.        , 0.        , 0.        ],
       [0.3837431 , 0.49790036, 0.46201833, 0.63821651],
       [0.37489187, 0.70789715, 0.40710794, 0.41304342],
       [0.63858444, 0.4390846 , 0.42650814, 0.30697554],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.41602301, 0.59526482, 0.76612086, 0.56963176],
       [0.71926259, 0.91505298, 0.75458546, 0.71618851],
       [0.        , 0.        , 0.        , 0.        ]])

In [12]:
# Report the mean episode reward for each consecutive batch of 1000 training
# episodes, as a quick view of how the policy improved over time.
episodes_per_batch = 1000
rewards = np.asarray(all_rewards)

print("Average reward per 1000 episodes")
# Slice by a fixed batch size instead of asking np.split for a (float) number of
# sections: the label is then always the true number of episodes covered so far,
# even if the log length is not an exact multiple of the batch size.
for start in range(0, len(rewards), episodes_per_batch):
    batch = rewards[start:start + episodes_per_batch]
    print(start + len(batch), ':', np.average(batch))

Average reward per 1000 episodes
1000 : 0.053
2000 : 0.214
3000 : 0.439
4000 : 0.544
5000 : 0.606
6000 : 0.616
7000 : 0.683
8000 : 0.67
9000 : 0.659
10000 : 0.682


## Gameplay

In [17]:
# Watch the greedy policy (argmax over the learned Q-values) play three episodes
# in a separate environment instance created with human rendering.
gameplay_env = gym.make("FrozenLake-v1", render_mode='human')
try:
    for episode in range(3):
        # reset() returns (observation, info); only the observation is needed.
        state, _ = gameplay_env.reset()

        print("Episode:", episode+1, "\n\n\n\n\n")
        time.sleep(1)

        for step in range(max_steps_per_episode):
            clear_output(wait=True)
            gameplay_env.render()
            time.sleep(0.3)

            # No exploration here: always take the best-known action.
            action = np.argmax(q_table[state, :])
            state, reward, terminated, truncated, info = gameplay_env.step(action)

            # Either flag ends the episode; a final reward of 1 is reported as
            # reaching the goal, anything else as a failed attempt.
            if terminated or truncated:
                clear_output(wait=True)
                gameplay_env.render()

                if reward == 1:
                    print("Goal Reached!!!")
                else:
                    print("Game Over!!! Try Again!!!")
                time.sleep(3)
                clear_output(wait=True)
                break
finally:
    # Always release the environment, even if the loop raises or is interrupted.
    gameplay_env.close()

Goal Reached!!!


In [9]:
# NOTE(review): `env` was created without a render_mode, and the saved output of
# this cell is only a logger warning — presumably nothing is drawn. Confirm, then
# either drop this cell or render through an env built with a render_mode.
env.render()

  logger.warn(
