In [23]:
import numpy as np 
import gym 
import random
import time
from IPython.display import clear_output

In [24]:
env = gym.make("FrozenLake-v0")     # Build the Frozen Lake environment from its registered gym id. This one object is what the rest of the notebook uses to reset the game, step the agent, sample random actions, render the board and finally close the environment.


In [25]:
# Dimensions of the Q-table: one row per state, one column per action.
state_space_size = env.observation_space.n      # number of rows (states)
action_space_size = env.action_space.n          # number of columns (actions)

# Every Q-value starts at zero; training fills the table in.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [26]:
# ---- Training schedule ----
num_episodes = 10_000               # Number of episodes ("rounds") the agent plays during training
max_steps_per_episode = 100         # Step budget per episode; the episode is cut off (with whatever reward it has) once this is used up

# ---- Q-learning update ----
learning_rate = 0.1                 # "alpha": weight given to the newly observed return versus the stored Q-value
discount_rate = 0.99                # Multiplier applied to the best next-state Q-value in the update

# ---- Epsilon-greedy exploration ----
max_exploration_rate = 1            # Upper bound of epsilon
min_exploration_rate = 0.01         # Floor that epsilon decays towards
exploration_rate = max_exploration_rate         # "epsilon": starts at the maximum, i.e. fully random actions
exploration_decay_rate = 0.0001     # Exponential decay constant applied per episode; test against 0.01.
                                    # NOTE: with 10000 episodes, exp(-0.0001 * 10000) ~= 0.37, so epsilon is still ~0.37 when training ends.



In [27]:
# Train the Q-table with epsilon-greedy Q-learning, then report how the
# average reward evolved and print the learned table.
# NOTE: q_table is updated in place, so re-running this cell continues
# training from the current table rather than starting from zeros.
rewards_all_episodes = []       # Total reward of every episode, in order, so we can see how the scores change over time

for episode in range(num_episodes):
    state = env.reset()     # classic gym API: reset() hands back the starting state

    done = False            # Resets "done" from the previous episode
    rewards_current_episode = 0         # Running total of the reward collected in this episode

    for step in range(max_steps_per_episode):           # iterate up to the per-episode step budget

        # Exploration vs exploitation (epsilon-greedy)
        exploration_rate_threshold = random.uniform(0, 1)           # Random draw deciding whether we explore or exploit
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])           # exploit: take the action with the largest stored Q-value for this state
        else:
            action = env.action_space.sample()          # explore: take a random action sampled from the action space

        new_state, reward, done, info = env.step(action)           # apply the chosen action to the environment

        # Update Q(s, a): blend the old estimate with the new target
        # reward + discount * max_a' Q(s', a'). The max only looks one step
        # ahead, which is what lets the rule be applied recursively.
        q_table[state, action] = (
            q_table[state, action] * (1 - learning_rate)
            + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        )

        state = new_state           # Move on to the new state
        rewards_current_episode += reward           # Accumulate this step's reward into the episode total

        if done:            # Episode finished: stop stepping and start the next episode
            break

    # Exploration rate decay: epsilon shrinks exponentially with the episode index, from max towards min
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Group the per-episode rewards into consecutive chunks of 1000 episodes.
# Slicing (instead of np.split with a float section count) also works when
# num_episodes is not a multiple of 1000: the last chunk is simply shorter.
episodes_per_report = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_report]
    for start in range(0, len(rewards_array), episodes_per_report)
]

count = 0           # Number of episodes covered so far; used as the row label
print("*******Average Reward per thousand episodes******** \n")
for r in rewards_per_thousand_episodes:
    count += len(r)
    print(count, ":", str(r.mean()))            # mean over the chunk's real length, without the float noise of summing r/1000

# Print updated Q-table
print("\n\n ******* Q-Table ********\n")
print(q_table)





*******Average Reward per thousand episodes******** 

1000 : 0.011000000000000003
2000 : 0.011000000000000003
3000 : 0.027000000000000017
4000 : 0.027000000000000017
5000 : 0.03800000000000003
6000 : 0.058000000000000045
7000 : 0.08400000000000006
8000 : 0.08100000000000006
9000 : 0.06700000000000005
10000 : 0.10600000000000008


 ******* Q-Table ********

[[0.58468075 0.54919178 0.54631778 0.5574945 ]
 [0.31269025 0.34736625 0.27196852 0.50609183]
 [0.44563606 0.43230446 0.44537003 0.47369003]
 [0.28026528 0.34774594 0.29096202 0.44334767]
 [0.60497032 0.31925645 0.41164321 0.29117521]
 [0.         0.         0.         0.        ]
 [0.33331677 0.30735129 0.28145161 0.15302066]
 [0.         0.         0.         0.        ]
 [0.33742983 0.48966997 0.37379552 0.63441513]
 [0.5226089  0.67380309 0.45911462 0.4692381 ]
 [0.64586059 0.52389945 0.31684041 0.32235661]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.48112697 0.65650214 0.8145

In [32]:
# Watch the trained agent play: it always takes the greedy (highest Q-value) action.

for episode in range(3):         # Play 3 episodes
    state = env.reset()
    done = False
    print("***** Episode", episode + 1, "*****\n\n\n\n")
    time.sleep(1)           # Pause so the episode title can be read

    for step in range(max_steps_per_episode):
        # clear_output(wait=True) defers clearing the cell output until new output arrives
        clear_output(wait=True)
        env.render()            # Draw the current board
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])           # Greedy action for the current state
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state           # Episode still running: advance and keep stepping
            continue

        # Episode over: show the final board and the outcome, then leave the step loop
        clear_output(wait=True)
        env.render()
        print("****You reached the goal!****" if reward == 1 else "****You fell through a hole!****")
        time.sleep(3 if reward == 1 else 0.3)           # linger longer on a win

        clear_output(wait=True)
        break

env.close()
        

(Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
