## Q-Table learning in FrozenLake

In [1]:
import gym
import numpy as np

from pprint import pprint

In [2]:
env = gym.make('FrozenLake-v0')

In [10]:
Q = np.zeros([env.observation_space.n, env.action_space.n])
# pprint(Q)

In [13]:
observation = env.reset()
env.render()
print("\n----------------------------------\n")

for i in range(env.action_space.n) :
    action = i
    obsevation, reward, done, info = env.step(action)
    env.render()
    print('\naction: %d' %action)
    print('state: %d \ninfo: %.3f' %(observation, info['prob']))
    print("\n==================================================")
    
    observation = env.reset()


[41mS[0mFFF
FHFH
FFFH
HFFG

----------------------------------

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG

action: 0
state: 0 
info: 0.333

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

action: 1
state: 0 
info: 0.333

  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG

action: 2
state: 0 
info: 0.333

  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG

action: 3
state: 0 
info: 0.333



In [14]:
# lr --> learning rate
# y --> discounting factor

lr = 0.8
y = 0.95
num_episode = 2000

rewardList = []

In [15]:
for i in range(num_episode) :
    s = env.reset()
    rAll = 0
    d = False
    step = 0
    
    while step < 100 :
        step += 1
        a = np.argmax(Q[s,:]+np.random.randn(1, env.action_space.n)*(1./(i+1)))
        s1, r, d, info = env.step(a)
        
        Q[s,a] = Q[s,a]+lr*(r +y*np.max(Q[s1,:])-Q[s,a])
        
        rAll += r
        s = s1
        
        if d :
            break
            
    rewardList.append(rAll)

In [16]:
print("Score: ", str(sum(rewardList)/num_episode))

Score:  0.6185


In [17]:
pprint(np.round(Q,3))

array([[0.108, 0.007, 0.006, 0.012],
       [0.006, 0.005, 0.006, 0.121],
       [0.002, 0.   , 0.   , 0.168],
       [0.006, 0.002, 0.002, 0.103],
       [0.113, 0.002, 0.001, 0.001],
       [0.   , 0.   , 0.   , 0.   ],
       [0.001, 0.111, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   ],
       [0.001, 0.002, 0.002, 0.49 ],
       [0.004, 0.452, 0.003, 0.   ],
       [0.766, 0.001, 0.001, 0.   ],
       [0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.483, 0.008],
       [0.   , 0.979, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   ]])


In [18]:
action_at_state = []
action_set = ['Left', 'Down', 'Right', 'Up']

for i in range(len(Q)) :
    if np.sum(Q[i]) == 0 :
        action_at_state.append('hole or goal state')
    else :
        idx = np.argmax(Q[i])
        action_at_state.append(action_set[idx])
        
pprint(action_at_state)        

['Left',
 'Up',
 'Up',
 'Up',
 'Left',
 'hole or goal state',
 'Down',
 'hole or goal state',
 'Up',
 'Down',
 'Left',
 'hole or goal state',
 'hole or goal state',
 'Right',
 'Down',
 'hole or goal state']


## Q-learning with NEURAL NETWORK

In [19]:
import tensorflow as tf

In [20]:
env = gym.make('FrozenLake-v0')

In [21]:
tf.reset_default_graph()

In [22]:
inputs1 = tf.placeholder(shape=[1,16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16,4], 0, 0.01))
Q_out = tf.matmul(inputs1, W)
predict = tf.argmax(Q_out, 1)

In [23]:
next_Q = tf.placeholder(shape=[1,4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(next_Q-Q_out))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

In [29]:
init = tf.global_variables_initializer()

y = 0.99
e = 0.1
num_episodes = 2000

jList = []
rList = []

with tf.Session() as sess :
    sess.run(init)
    
    for i in range(num_episodes) :
        s = env.reset()
        rAll = 0
        d = False
        step = 0
        
        while step < 500 :
            step += 1
            a, all_Q = sess.run([predict, Q_out], feed_dict={inputs1: np.identity(16)[s:s+1]})
            
            if np.random.rand(1) < e :
                a[0] = env.action_space.sample()
                
            s1, r, d, _ = env.step(a[0])
            
            Q1 = sess.run(Q_out, feed_dict={inputs1: np.identity(16)[s:s+1]})
            
            max_Q1 = np.max(Q1)
            target_Q = all_Q
            target_Q[0, a[0]] = r+y*max_Q1
            
            _, W1 = sess.run([updateModel, W], feed_dict={inputs1: np.identity(16)[s:s+1], next_Q: target_Q})
            
            rAll += r
            s = s1
            
            if d :
                e = 1./((i+50)+10)
                break
                
        jList.append(step)
        rList.append(rAll)

In [30]:
print("Score: ", str(sum(rList)/num_episodes))

Score:  0.0105
