In [1]:
import tensorflow as tf
import gym
import numpy as np

#### Load the environment

In [2]:
env = gym.make("FrozenLake-v0")

[2017-05-06 11:52:00,643] Making new env: FrozenLake-v0


#### Implement Q-Learning algorithm w/ Table

In [55]:
# Initialize table with all zeros
Q = np.zeros([env.observation_space.n,env.action_space.n])
# basically a table of dimensions, no of positions x no of actions at pos


# set learning parameters
lr = .86 # learning rate
y = 0.99 # discount rate
num_episodes =5000

# create lists to contain total rewards and steps per episode
rList = []

# Each episode is a runthrough the environment till death, goal or timeout
for i in range(num_episodes):
    # reset env and get new obvs
    s = env.reset()
    rAll = 0
    d = False # checks for done, a bool returned by gym indicating end
    j = 0 # timeout, in case ai cant end in 99 turns
    # The Q-Table learning algo
    while j<99:
        j+=1
        
        # choose an action by greedily (with noise) picking from Q table
        # returns the index of the max of each action from eachs state plus 
        # a random number to promote exploration
        # 1./i+1 is to reduce the weightage of the randomness as time progresses
        a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        
        #get new state and reward from env
        s1,r,d,_ = env.step(a)
        
        #Update Q-Table with new knowledge
        Q[s,a] = Q[s,a] + lr*(r+y*np.max(Q[s1,:]) - Q[s,a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

In [56]:
print("Score over time:" + str(sum(rList)/num_episodes))

Score over time:0.5272


In [57]:
print("Final Q-Table Values")
print(Q)

Final Q-Table Values
[[  2.75558732e-02   6.87263522e-03   5.97344581e-01   6.76031738e-03]
 [  2.31641809e-03   9.67764962e-04   8.72365485e-03   3.78000136e-01]
 [  8.38085275e-04   1.14664531e-02   0.00000000e+00   3.01163994e-01]
 [  1.21351643e-03   3.53888333e-04   1.58065464e-03   2.65490416e-01]
 [  6.39110323e-01   1.60215916e-03   1.10552049e-03   3.00215741e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  6.18937393e-05   6.82994412e-06   5.88951514e-02   6.88327984e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.20705721e-03   2.05773514e-04   1.23202889e-02   5.68448018e-01]
 [  0.00000000e+00   8.74693449e-01   1.81169702e-03   0.00000000e+00]
 [  8.91778278e-01   3.09690880e-04   6.96894514e-04   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   3.25721444e-04   9.76878664e-01   6

#### Q-Network approach, ie using neural networks

In [19]:
sratio=[]

In [39]:
tf.reset_default_graph() # no clue what this does

In [40]:
# These lines establish the feed-forward part of the network used to choose actions

inputs1 = tf.placeholder(shape=[1,16], dtype=tf.float32)
# the weights
W = tf.Variable(tf.random_uniform([16,4],0,0.01)) #16 and 4 coming from this specific environment
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)

#Below we obtain the loss by taking the sum of squares difference
# between the target and prediction Q values
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32) # the target value to be compared with Qout
loss = tf.reduce_sum(tf.square(nextQ-Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel =trainer.minimize(loss)

In [41]:
init = tf.global_variables_initializer()

# Set learning parameters
y = .99
e = 0.5
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        #The Q-Network
        while j < 99:
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.identity(16)[s:s+1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
            #Get new state and reward from environment
            s1,r,d,_ = env.step(a[0])
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:np.identity(16)[s1:s1+1]})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = r + y*maxQ1
            #Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel,W],feed_dict={inputs1:np.identity(16)[s:s+1],nextQ:targetQ})
            rAll += r
            s = s1
            if d == True:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)

In [42]:
print ("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")
sratio.append(sum(rList)/num_episodes)

Percent of succesful episodes: 0.155%


In [43]:
print(sratio)

[0.399, 0.3535, 0.3475, 0.2535, 0.155]
