Following the tutorial here:
https://www.youtube.com/watch?v=QK_PP_2KgGE&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv&index=8

In [2]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# Create the FrozenLake environment that the rest of the notebook trains and plays on.
env = gym.make("FrozenLake-v0")

In [4]:
# Size the Q-table from the environment: one row per state, one column per
# action, with every entry starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Show the freshly initialised table.
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
# --- Training schedule ---
max_steps_per_episode = 100  # upper bound on steps taken within one episode
num_episodes = 10000         # number of episodes played during training

# --- Q-value update ---
discount_rate = 0.99  # weight applied to the best next-state Q-value
learning_rate = 0.1   # share of the new estimate blended into the old Q-value

# --- Epsilon-greedy exploration ---
min_exploration_rate = 0.01
max_exploration_rate = 1
exploration_rate = max_exploration_rate  # start out fully exploratory
exploration_decay_rate = 0.001           # per-episode exponential decay constant

In [10]:
rewards_all_episodes = []

# Q-Learning algorithm: play `num_episodes` episodes, updating `q_table` in
# place after every step and recording the total reward of each episode.
for episode in range(num_episodes):
    state = env.reset()

    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: draw a number in [0, 1]; above
        # the current exploration rate we exploit the best known action,
        # otherwise we explore with a randomly sampled action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a): weighted blend of the old value and the
        # learned value (reward + discounted best Q-value of the next state).
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay: exponential decay from the max rate towards the
    # min rate as the episode index grows.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per thousand episodes.
# Slice explicitly instead of np.split(..., num_episodes / 1000): that passed a
# float section count and could silently mis-size the chunks when num_episodes
# is not a multiple of 1000. A trailing partial chunk is averaged over its own
# length and labelled with the real episode count.
episodes_per_report = 1000
all_rewards = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    all_rewards[start:start + episodes_per_report]
    for start in range(0, len(all_rewards), episodes_per_report)
]
count = 0
print("********** Average  reward per thousand episodes **********\n")

for r in rewards_per_thousand_episodes:
    count += len(r)
    # ndarray.mean avoids the float noise of summing 1000 pre-divided terms.
    print(count, ": ", str(r.mean()))

# Print updated Q-table
print("\n\n********** Q-table **********\n")
print(q_table)
        

********** Average  reward per thousand episodes **********

1000 :  0.04000000000000003
2000 :  0.21200000000000016
3000 :  0.4070000000000003
4000 :  0.5750000000000004
5000 :  0.6110000000000004
6000 :  0.6410000000000005
7000 :  0.6470000000000005
8000 :  0.6960000000000005
9000 :  0.6930000000000005
10000 :  0.7090000000000005


********** Q-table **********

[[0.58024171 0.54229881 0.53260582 0.53139742]
 [0.3371445  0.26884291 0.27842931 0.52097642]
 [0.41808381 0.43173947 0.39995505 0.48437107]
 [0.36820324 0.30313074 0.25543035 0.46334875]
 [0.5999856  0.39049707 0.37160209 0.42678134]
 [0.         0.         0.         0.        ]
 [0.23352799 0.11040702 0.49197896 0.09404461]
 [0.         0.         0.         0.        ]
 [0.37630744 0.39149089 0.37812894 0.65489582]
 [0.46439661 0.72593727 0.46364658 0.38923149]
 [0.7203016  0.39882864 0.47009292 0.26180184]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.42688433 0.6611058

Now we let the trained agent play the game, always picking the action with the highest Q-value for its current state.

In [11]:
# Watch the agent play three episodes, always taking the action with the
# highest Q-value for the current state.
for episode in range(3):
    state = env.reset()
    done = False
    print("***** EPISODE ", episode + 1, " *****\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place, pausing so each move is visible.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Greedy policy: best-known action for this state.
        action = np.argmax(q_table[state])
        new_state, reward, done, info = env.step(action)

        # Episode still running: advance to the next state and keep going.
        if not done:
            state = new_state
            continue

        # Episode finished: show the final board and report the outcome.
        # A reward of 1 is reported as reaching the goal; anything else is
        # reported as falling through a hole.
        clear_output(wait=True)
        env.render()
        print(
            "*****You reached your goal!*****"
            if reward == 1
            else "*****You fall through a hole!*****"
        )
        time.sleep(3)
        clear_output(wait=True)
        break

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
*****You reached your goal!*****
