In [1]:
#https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0
#https://keon.io/deep-q-learning/

import gym
import numpy as np

More information about the Frozen Lake environment is available in its source file:
https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py

In [2]:
# Create the FrozenLake environment (gym id 'FrozenLake-v0') and render its
# current board; `env` is used by the training and evaluation cells below.
env = gym.make('FrozenLake-v0')
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [3]:
# Tabular Q-learning on FrozenLake.
#
# Initialize a table with one entry per state-action pair.
# For a 4x4 board, with 4 directions, we have a Q table with dimensions 16x4.
# The sizes are read from the gym env attributes observation_space and action_space.
qTable = np.zeros([env.observation_space.n, env.action_space.n])

# Seed both sources of randomness (the exploration noise drawn from numpy and
# the environment itself) so that a fresh "Restart & Run All" repeats the same run.
seed = 0
np.random.seed(seed)
env.seed(seed)

# Learning parameters used in the blog post
lr = .8
discount = .95
num_episodes = 100000
# Upper bound on the steps per episode, to avoid any infinite or long episodes
max_steps = 99
# Epsilon scales the Gaussian noise that is added to the Q values when an action is picked.
# We start with a big epsilon for exploration and shrink it after every step.
# NOTE(review): the decay is applied per environment step, not per episode. With a
# factor of 0.999 epsilon drops from 1 to epsilon_min after roughly 6,900 steps, so
# almost all of the 100000 episodes run with minimal exploration - confirm this is intended.
epsilon = 1
epsilon_min = 0.001
epsilon_decay = 0.999
# Reward-shaping experiment: value substituted for the reward when an episode ends
# with a reward of 0 (presumably the agent fell into a hole - verify against the env).
# 0.0 keeps the environment's own reward; the values tried so far were -0.05 and -0.01.
# Note that the substituted reward is also what gets accumulated into episode_reward.
hole_penalty = 0.0

# Per-episode total reward and step count, reviewed after training
reward_list = []
step_list = []

# Report progress about every tenth of the run; max() guards against a zero
# interval when num_episodes is smaller than 10.
progress_every = max(1, num_episodes // 10)

for episode in range(num_episodes):
    # Reset the env
    state = env.reset()  # State is the current board position (from 0 to env.observation_space.n)
    step = 0
    episode_reward = 0
    while step < max_steps:
        # Pick the action with the highest noisy Q value; the noise shrinks with epsilon
        noise = np.random.randn(1, env.action_space.n) * epsilon
        action = np.argmax(qTable[state, :] + noise)

        # Pass the action to the environment
        state_new, reward, done, _ = env.step(action)

        # Optional negative reward for episodes that end without reaching the goal
        if hole_penalty and done and reward == 0:
            reward = hole_penalty

        # Q-learning update: move Q(s, a) towards reward + discount * max_a' Q(s', a')
        td_target = reward + discount * np.amax(qTable[state_new, :])
        qTable[state, action] += lr * (td_target - qTable[state, action])

        # Decay epsilon until it reaches its floor
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        # Advance the bookkeeping and stop once the episode is done (terminal state)
        episode_reward += reward
        step += 1
        state = state_new
        if done:
            break

    # Keep the episode's reward and length so performance can be reviewed later on
    reward_list.append(episode_reward)
    step_list.append(step)

    if episode % progress_every == 0:
        print("Episode {}/{}".format(episode, num_episodes))
        
        

Episode 0/100000
Episode 10000/100000
Episode 20000/100000
Episode 30000/100000
Episode 40000/100000
Episode 50000/100000
Episode 60000/100000
Episode 70000/100000
Episode 80000/100000
Episode 90000/100000


In [4]:
# Training summary: mean episode reward and mean episode length over all episodes
average_score = sum(reward_list) / num_episodes
print("Average score over time: {}".format(str(average_score)))
average_steps = sum(step_list) / num_episodes
print("Average steps over time: {}".format(str(average_steps)))

Average score over time: 0.71105
Average steps over time: 41.22192


In [53]:
# Show the current Q table values: one row per state, one column per action
qTable

array([[1.81041622e-01, 4.30624456e-03, 4.10854747e-03, 3.51862573e-03],
       [5.64516558e-04, 8.34065051e-04, 4.17687711e-04, 1.61594004e-01],
       [2.76030073e-03, 1.88322089e-01, 2.51539734e-03, 2.83352594e-03],
       [1.92825442e-03, 2.01972636e-03, 3.38970586e-05, 8.80076790e-02],
       [2.74918513e-01, 1.63705118e-05, 1.14413177e-03, 4.32030413e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.34711082e-05, 5.38488410e-04, 1.33203177e-01, 6.57135265e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.11809731e-03, 2.27595966e-04, 6.79438019e-04, 4.48511616e-01],
       [6.49834644e-04, 6.18881370e-01, 2.08159573e-03, 1.72082868e-04],
       [8.63355540e-01, 3.40536824e-04, 9.17668819e-04, 6.83657207e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.70921249e-02, 2.12926464e-02, 4.19775170e

In [51]:
# Now let's test the learned Q table (for each reward system) on 100 episodes.
# No noise/randomness is added to the action choice when doing actual testing.
# NOTE: this cell overwrites num_episodes, reward_list and step_list from the
# training cell; the summary cells below read these names.

num_episodes = 100
# Upper bound on the steps per episode, to avoid any infinite or long episodes
max_steps = 99

# Per-episode total reward and step count
reward_list = []
step_list = []

for episode in range(num_episodes):
    # Reset the env
    state = env.reset()  # State is the current board position (from 0 to env.observation_space.n)
    step = 0
    episode_reward = 0
    while step < max_steps:
        # Greedy action: the best Q value for this state, without any noise
        action = np.argmax(qTable[state, :])

        # Pass the action to the environment
        state_new, reward, done, _ = env.step(action)

        # Advance the bookkeeping and stop once the episode is done (terminal state)
        episode_reward += reward
        step += 1
        state = state_new
        if done:
            break

    reward_list.append(episode_reward)
    step_list.append(step)
        

In [52]:
# Evaluation summary for training with no negative reward.
# Recorded result: average score 0.74, average steps 38.04
average_score = sum(reward_list) / num_episodes
print("Average score over time: {}".format(str(average_score)))
average_steps = sum(step_list) / num_episodes
print("Average steps over time: {}".format(str(average_steps)))

Average score over time: 0.74
Average steps over time: 38.04


In [48]:
# Evaluation summary for training with a negative reward of -0.05.
# Recorded result: average score 0.0, average steps 99.0
# NOTE(review): this code only reports whatever reward_list / step_list currently
# hold; the recorded numbers presumably come from a separate run in which the
# training cell applied the -0.05 reward - confirm before relying on them.
average_score = sum(reward_list) / num_episodes
print("Average score over time: {}".format(str(average_score)))
average_steps = sum(step_list) / num_episodes
print("Average steps over time: {}".format(str(average_steps)))

Average score over time: 0.0
Average steps over time: 99.0


In [44]:
# Evaluation summary for training with a negative reward of -0.01.
# Recorded result: average score 0.54, average steps 62.68
# NOTE(review): this code only reports whatever reward_list / step_list currently
# hold; the recorded numbers presumably come from a separate run in which the
# training cell applied the -0.01 reward - confirm before relying on them.
average_score = sum(reward_list) / num_episodes
print("Average score over time: {}".format(str(average_score)))
average_steps = sum(step_list) / num_episodes
print("Average steps over time: {}".format(str(average_steps)))

Average score over time: 0.54
Average steps over time: 62.68
