In [None]:
import numpy as np
import gym
import random 
import time
from IPython.display import clear_output

In [None]:
# Create the FrozenLake-v1 environment; the cells below read its action and
# observation spaces, train a Q-table on it, and replay the learned policy.
env = gym.make("FrozenLake-v1")

In [None]:
# Number of discrete states and actions exposed by the environment.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Q-table: one row per state, one column per action, every entry starts at 0.
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Show the freshly initialised table together with its dimensions.
print(q_table)
print(f"State space size: {state_space_size}\nAction space size: {action_space_size}")
print(f"Shape of the table {q_table.shape}")

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
State space size: 16
Action space size: 4
Shape of the table (16, 4)


In [None]:
# Number of training episodes the agent plays.
num_episodes = 10000
# Upper bound on the steps taken in one episode; the inner loop stops after
# this many steps even if the episode has not finished.
max_steps_per_episode = 100

# Learning rate: weight given to the new estimate in the Q-update. The old
# Q-value is scaled by (1 - learning_rate), so a value close to 1 almost
# discards the old estimate and a value close to 0 barely changes it.
learning_rate = 0.1
# Discount rate: factor applied to the best Q-value of the next state inside
# the update target. Closer to 1 makes estimated future reward count almost as
# much as the immediate reward; closer to 0 makes the agent short-sighted.
discount_rate = 0.99

# Upper end of the exploration schedule. 1 means every action is random.
max_exploration_rate = 1
# Lower end of the exploration schedule: the rate decays towards this value
# but never drops below it.
min_exploration_rate = 0.01
# Exploration rate used for the first episode. It is taken from
# max_exploration_rate so the starting point always matches the schedule.
exploration_rate = max_exploration_rate
# Coefficient of the exponential decay applied after every episode:
#   rate = min + (max - min) * exp(-exploration_decay_rate * episode)
# It is NOT a fixed amount subtracted per episode.
exploration_decay_rate = 0.001

In [None]:
# Total reward collected in each episode, appended in episode order
rewards_all_episodes = []

# Q-Learning Algorithm
# Written against the gym API in which reset() returns the state directly and
# step() returns a (state, reward, done, info) 4-tuple.
for episode in range(num_episodes):
  state = env.reset()
  done = False
  rewards_current_episode = 0

  for step in range(max_steps_per_episode):
    # Epsilon-greedy choice: draw a number in [0, 1]; take a random action
    # unless the draw exceeds the current exploration rate, in which case the
    # best-known action for this state is used.
    exploration_rate_threshold = random.uniform(0, 1)
    if exploration_rate_threshold <= exploration_rate:
      action = env.action_space.sample()
    else:
      action = np.argmax(q_table[state])

    new_state, reward, done, info = env.step(action)

    # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + discount * max_a' Q(s', a'))
    old_estimate = q_table[state, action]
    target = reward + discount_rate * np.max(q_table[new_state])
    q_table[state, action] = old_estimate * (1 - learning_rate) + learning_rate * target

    rewards_current_episode += reward
    state = new_state

    if done:
      break

  rewards_all_episodes.append(rewards_current_episode)

  # Exponentially decay the exploration rate towards its minimum
  exploration_span = max_exploration_rate - min_exploration_rate
  exploration_rate = min_exploration_rate + \
      exploration_span * np.exp(-exploration_decay_rate * episode)


# Report the mean reward of each consecutive run of 1000 episodes.
# The rewards are sliced in fixed-size chunks instead of calling np.split with
# a float section count: that call truncates the count when the number of
# episodes is not a multiple of 1000 (wrong chunk sizes and labels) and fails
# when there are fewer than 1000 episodes. A shorter final chunk is averaged
# over its real length.
episodes_per_chunk = 1000
all_rewards = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    all_rewards[start:start + episodes_per_chunk]
    for start in range(0, len(all_rewards), episodes_per_chunk)
]
count = 0
print("*******Average reward per thousand episodes***********\n")
for r in rewards_per_thousand_episodes:
  # Label each row with the number of episodes completed so far.
  count += len(r)
  # mean() avoids the rounding noise of summing many r/1000 terms.
  print(count, ": ", str(r.mean()))

# Print updated Q-table
print("\n\n*****Q-table*****\n")
print(q_table)

*******Average reward per thousand episodes***********

1000 :  0.05300000000000004
2000 :  0.18700000000000014
3000 :  0.4130000000000003
4000 :  0.5470000000000004
5000 :  0.6450000000000005
6000 :  0.6410000000000005
7000 :  0.6720000000000005
8000 :  0.6810000000000005
9000 :  0.7060000000000005
10000 :  0.6670000000000005


*****Q-table*****

[[0.5302898  0.51087363 0.50506937 0.50467419]
 [0.33095317 0.29963077 0.35320914 0.49023414]
 [0.38732826 0.35298849 0.34336597 0.46670015]
 [0.2476995  0.19114293 0.25001845 0.45680451]
 [0.54952234 0.34711968 0.36323585 0.2975415 ]
 [0.         0.         0.         0.        ]
 [0.17410256 0.16013219 0.36341445 0.12992126]
 [0.         0.         0.         0.        ]
 [0.37161261 0.30561817 0.38343978 0.57755412]
 [0.41572093 0.61892804 0.45513621 0.48492452]
 [0.49605596 0.36532734 0.43473865 0.33323044]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.33259949 0.52992177 0.7260671  0.29

In [None]:
# Test the agent
# Replays three episodes using only the greedy action from the trained
# Q-table (no exploration), rendering the grid after every step.
# Written against the gym API in which reset() returns the state directly and
# step() returns a (state, reward, done, info) 4-tuple.

for episode in range(3):
  state = env.reset()
  done = False
  print("********EPISODE", episode + 1, "************\n\n\n\n")
  time.sleep(1)

  for step in range(max_steps_per_episode):
    # Redraw the grid in place so the output cell behaves like an animation.
    clear_output(wait=True)
    env.render()
    time.sleep(0.3)

    # Always exploit: pick the highest-valued action for the current state.
    action = np.argmax(q_table[state, :])
    new_state, reward, done, info = env.step(action)

    if done:
      clear_output(wait=True)
      env.render()
      if reward == 1:
        print("*****You reached the goal!*****")
      elif info.get("TimeLimit.truncated", False):
        # The episode was cut off by the environment's step limit rather than
        # by a hole, so it must not be reported as a fall.
        # NOTE(review): assumes gym's TimeLimit wrapper marks truncation with
        # info["TimeLimit.truncated"] -- confirm for the installed version.
        print("*****Ran out of steps before reaching the goal*****")
      else:
        print("*******You fell into a hole******")

      # Leave the final frame and message visible before the next episode.
      time.sleep(3)
      clear_output(wait=True)
      break

    state = new_state

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
*****You reached the goal!*****
