In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [2]:
env = gym.make("FrozenLake-v0")
# S--> Agent's starting point safe  -- 0 reward
# F--> Frozen surface safe -- 0 reward
# H--> Hole game over -- 0 reward
# G--> Goal game over -- 1 reward

In [3]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Training Phase

## Algorithm Parameters

In [4]:
num_episodes = 10000 ## Number of episodes we want our agent to play
## during training
max_steps_per_episode = 100 ## If the agent hasn't reached the frisbie
## or fell into the water by the 100th step then the episode will terminate
## with the agent recieving zero points


learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.1
exploration_decay_rate = 0.0001

In [5]:
rewards_all_episodes = [] ## To see how our game scores change over time

# Q-learning algorithm

for episode in range(num_episodes):
    state = env.reset() ## for each episode we will reset the state of the environment back to the starting state
    
    done = False ## Used to check if our episode is finished or not
    rewards_current_episode = 0 ## to record the reward for the current episode
    
    for step in range(max_steps_per_episode): ## for loop for each timestep within an episode
        #Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0,1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state,:]) ## agent chooses action with highest q-value for the current state
        else:
            action = env.action_space.sample() ## agent will explore the environment and sample an action randomly
        new_state, reward, done, info = env.step(action)
        #We take the action by calling step on the env-object and pass our action to it
        #step() returns a tuple containing the new_state, the reward, whether or not the action ended the episode(done)
        #and some diagnostic information regarding our environment
        
        # Update Q-table for Q(s,a)
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        
        state = new_state
        rewards_current_episode += reward
        
        if done ==True:
            break
            
    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
    rewards_all_episodes.append(rewards_current_episode)
    
#Calculate and print average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes/1000)
count = 1000
print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    print(count,": ",str(sum(r/1000)))
    count+=1000
#Print updates Q-table
print("\n\n********Q-table********\n")
print(q_table)

********Average reward per thousand episodes********

1000 :  0.015000000000000006
2000 :  0.01800000000000001
3000 :  0.023000000000000013
4000 :  0.027000000000000017
5000 :  0.03400000000000002
6000 :  0.037000000000000026
7000 :  0.05500000000000004
8000 :  0.047000000000000035
9000 :  0.07100000000000005
10000 :  0.08800000000000006


********Q-table********

[[0.59889362 0.56660652 0.57093472 0.56282814]
 [0.30907865 0.29539145 0.26325525 0.55094983]
 [0.47413043 0.46500506 0.45524475 0.50342305]
 [0.36019994 0.29257958 0.27772255 0.48093252]
 [0.62569061 0.49134817 0.42894338 0.33212295]
 [0.         0.         0.         0.        ]
 [0.31330604 0.25184039 0.46192399 0.09954726]
 [0.         0.         0.         0.        ]
 [0.48078771 0.45987566 0.45764778 0.65073592]
 [0.3722082  0.70783842 0.53331754 0.40806695]
 [0.69767085 0.40142369 0.44400852 0.26190211]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.44606721 0.5603362

# Testing Phase

In [6]:
for episode in range(3): ## We'll watch the agent play 3 episodes in total
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render() ## render the current state of the agent to visually see the game grid
        time.sleep(0.3)
        
        action= np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                time.sleep(3)
            clear_output(wait=True)
            break
            
        state = new_state
env.close()

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
****You fell through a hole!****
