In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# Create the FrozenLake-v1 environment that the training loop steps through,
# using the 'rgb_array' render mode.
env = gym.make("FrozenLake-v1", render_mode='rgb_array')

In [4]:
# Read the table dimensions off the environment: one row per state,
# one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state-action value estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# Training schedule.
num_episodes = 10000
max_steps_per_episode = 100

# Coefficients of the Q-value update.
learning_rate = 0.1
discount_rate = 0.99

# Epsilon-greedy exploration: start at the maximum rate and decay
# exponentially towards the minimum as episodes progress.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_rate = max_exploration_rate
exploration_decay_rate = 0.001

In [6]:
# Total reward collected in each training episode, appended in episode order.
all_rewards = []

## Q-Learning Algorithm

In [7]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads:   env, q_table, num_episodes, max_steps_per_episode, learning_rate,
#          discount_rate, exploration_rate, min/max_exploration_rate,
#          exploration_decay_rate
# Updates: q_table (in place), exploration_rate, all_rewards

# Start from an empty reward log so that re-running this cell does not append
# to a previous run's results (the summary cell expects one entry per episode).
all_rewards = []

for episode in range(num_episodes):
    # reset() returns (observation, info); the info dict is not used here.
    state, info = env.reset()

    done = False
    curr_reward = 0

    for step in range(max_steps_per_episode):

        # Exploit the current estimates when the draw exceeds the exploration
        # rate, otherwise take a uniformly sampled action.
        threshold = random.uniform(0, 1)
        if threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, info = env.step(action)

        # Blend the old estimate with the bootstrapped target
        # reward + discount_rate * max_a Q(new_state, a).
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        curr_reward += reward

        # The episode is over either when the environment reaches a terminal
        # state or when it reports a time-limit truncation; stepping past
        # either signal is not valid.
        done = terminated or truncated
        if done:
            break

    # Exponentially decay the exploration rate with the episode index.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    all_rewards.append(curr_reward)

  if not isinstance(terminated, (bool, np.bool8)):


In [8]:
# Show the learned Q-table (one row per state, one column per action).
q_table

array([[0.5283666 , 0.4792217 , 0.46683333, 0.47485813],
       [0.30251125, 0.34896486, 0.2099458 , 0.44675594],
       [0.37986138, 0.26628128, 0.27311175, 0.27303106],
       [0.20006334, 0.03512592, 0.00614511, 0.0341832 ],
       [0.54521122, 0.29365709, 0.33248363, 0.39763496],
       [0.        , 0.        , 0.        , 0.        ],
       [0.14050338, 0.08725987, 0.28428147, 0.1342554 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.50928395, 0.46810125, 0.4240631 , 0.57836859],
       [0.42457479, 0.63270431, 0.53588603, 0.49747384],
       [0.58394311, 0.36508134, 0.33216363, 0.32553876],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.44471299, 0.4040933 , 0.7328606 , 0.47356914],
       [0.71658756, 0.82083544, 0.75659567, 0.75133243],
       [0.        , 0.        , 0.        , 0.        ]])

In [9]:
# Report the mean episode reward for each consecutive block of 1000 episodes.
# Slicing by a fixed block size avoids passing a float section count to
# np.split and still works when the number of logged episodes is not an exact
# multiple of 1000 (the last, shorter block is labelled with the true count).
block_size = 1000
rewards = np.array(all_rewards)
rewards_per_1000_episodes = [
    rewards[start:start + block_size]
    for start in range(0, len(rewards), block_size)
]

print("Average reward per 1000 episodes")
episodes_seen = 0
for block in rewards_per_1000_episodes:
    episodes_seen += len(block)
    print(episodes_seen, ':', np.average(block))

Average reward per 1000 episodes
1000 : 0.027
2000 : 0.182
3000 : 0.43
4000 : 0.57
5000 : 0.632
6000 : 0.653
7000 : 0.66
8000 : 0.676
9000 : 0.705
10000 : 0.668


In [11]:
# Persist the learned Q-table to disk; np.save adds the ".npy" extension,
# which is the file name the gameplay section loads.
# NOTE(review): absolute, machine-specific path — consider a path relative to
# the notebook (and change the matching np.load together with it).
np.save(r"E:\Projects\Q-Learning-Algorithm\q_table", q_table)

## Gameplay

In [1]:
import gym
import numpy as np
import time
from gym.wrappers import RecordVideo
from IPython.display import clear_output

In [16]:
# Restore the Q-table written by the training section.
# NOTE(review): absolute, machine-specific path — must stay in sync with the np.save call.
q_table = np.load(r"E:\Projects\Q-Learning-Algorithm\q_table.npy")
# Upper bound on the number of steps played per gameplay episode.
max_steps_per_episode = 100
# False: play in a 'human' render window; True: wrap the env in RecordVideo.
record_mode = False

In [20]:
if record_mode:
    # Recording: build the env in 'rgb_array' mode and wrap it so episodes
    # are written to the "video" folder.
    gameplay_env = RecordVideo(
        gym.make("FrozenLake-v1", render_mode='rgb_array'),
        video_folder="video",
        name_prefix="q_learning_algorithm",
    )
else:
    # Interactive playback in the 'human' render mode.
    gameplay_env = gym.make("FrozenLake-v1", render_mode='human')

In [21]:
# Replay three episodes, always taking the highest-valued action from the
# loaded Q-table. The environment is closed in a `finally` block so the render
# window / video recorder is released even if the cell is interrupted.
try:
    if record_mode:
        gameplay_env.reset()
        gameplay_env.start_video_recorder()

    for episode in range(3):
        # reset() returns (observation, info); the info dict is not used here.
        state, info = gameplay_env.reset()
        done = False

        print("Episode:", episode+1, "\n\n\n\n\n")
        time.sleep(1)

        for step in range(max_steps_per_episode):
            clear_output(wait=True)
            if not record_mode:
                gameplay_env.render()
            time.sleep(0.3)

            action = np.argmax(q_table[state, :])
            new_state, reward, terminated, truncated, info = gameplay_env.step(action)

            # Stop on a terminal state or on a time-limit truncation.
            done = terminated or truncated
            if done:
                clear_output(wait=True)
                if not record_mode:
                    gameplay_env.render()

                # A reward of 1 on the final step is treated as reaching the goal.
                if reward == 1:
                    print("Goal Reached!!!")
                else:
                    print("Game Over!!! Try Again!!!")
                time.sleep(3)
                clear_output(wait=True)
                break

            state = new_state
finally:
    gameplay_env.close()

Goal Reached!!!
