# 01. Tabular Q Learning

Tabular Q Learning을 실습해본다. 모든 state의 value function을 table에 저장하고 테이블의 각 요소를 Q Learning으로 업데이트 하는 것으로 학습한다.

## Collab 용 package 설치 코드

In [None]:
!pip install gym

In [1]:
import tensorflow as tf
import numpy as np
import random
import gym
# from gym.wrappers import Monitor

np.random.seed(28)
tf.set_random_seed(28)

print("tensorflow version: ", tf.__version__)
print("gym version: ", gym.__version__)

  from ._conv import register_converters as _register_converters


tensorflow version:  1.7.0
gym version:  0.10.5


## Frozen Lake

**[state]**

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located
    
**[action]**

    LEFT = 0
    DOWN = 1
    RIGHT = 2
    UP = 3

In [2]:
from IPython.display import clear_output

# Load Environment
env = gym.make("FrozenLake-v0")
# init envrionmnet
env.reset()
# only 'Right' action agent
for _ in range(5):
    env.render()
    next_state, reward, done, _ = env.step(2)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG


### Frozen Lake (not Slippery)

In [3]:
def register_frozen_lake_not_slippery(name):
    from gym.envs.registration import register
    register(
        id=name,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )

register_frozen_lake_not_slippery('FrozenLakeNotSlippery-v0')

In [4]:
env = gym.make("FrozenLakeNotSlippery-v0")
env.reset()
env.render()
'''
env.step()을 이용해서 Goal까지 직접 이동해보세요.
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
'''
env.step(0); env.render()
# env.step(); env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG


## Q-Learning
**Pseudo code**  
<img src="./img/qlearning_pseudo.png" width="60%" align="left">  

### Epsilon greedy

In [31]:
# epsilon greedy policy

def epsilon_greedy_action(epsilon, n_action, state, q_table):
    
        # 구현해보세요.
        
        return action

In [35]:
# epsilon greedy test

epsilon = 0
q_table = np.array([[1,0,0,0],
                            [0,0,0,1],
                            [0,1,0,0]])
for state in range(3):
    action = epsilon_greedy_action(epsilon, 4, state, q_table)
    print("state: {}    action: {}".format(state, action))

0
state: 0    action: 0
0
state: 1    action: 3
1
state: 2    action: 3


### Q-value update

In [24]:
def q_update(q_table, state, next_state, action, reward, alpha, gamma):
    
    # 구현해보세요.
    # update 수식은 pseudo code 참조
    
    return q_table

In [33]:
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

q_table = np.array([[0,0,0,0],
                             [0,1,0,0]], dtype=np.float)
print("start\n", q_table)

reward = 1.0
alpha = 0.1
gamma = 0.9

for i in range(10):
    print("update {}".format(i))
    q_table = q_update(q_table, 0, 1, 2, reward, alpha, gamma)
    print(q_table)

start
 [[ 0.000  0.000  0.000  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 0
[[ 0.000  0.000  0.190  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 1
[[ 0.000  0.000  0.361  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 2
[[ 0.000  0.000  0.515  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 3
[[ 0.000  0.000  0.653  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 4
[[ 0.000  0.000  0.778  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 5
[[ 0.000  0.000  0.890  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 6
[[ 0.000  0.000  0.991  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 7
[[ 0.000  0.000  1.082  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 8
[[ 0.000  0.000  1.164  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 9
[[ 0.000  0.000  1.238  0.000]
 [ 0.000  1.000  0.000  0.000]]


### Agent class

## Goal에 도착하기 위해 생각해야 하는것
1. Goal에 한번이라도 도착해야만 reward가 나와서 update 된다 $\rightarrow$ goal에 어떻게 가게 할까?
2. np.argmax로 Q값이 가장 큰 것을 고르는데 같은 Q값일 경우 무조껀 작은 index를 고른다. $\rightarrow$ 같은 Q값일 경우 랜덤하게 고르게 해야 exploration 한다.
3. hole에 빠졌을 때 episode가 끝나긴 하지만 reward에 차이는 없다. $\rightarrow$ hole에 빠져서 끝나면 negative reward를 주도록 한다.
4. 학습이 잘 되어도 epsilon 만큼의 확률로 random action을 한다. $\rightarrow$ 학습이 진행될수록 epsilon을 줄인다.

In [None]:
class Tabular_Q_agent:
    def __init__(self, q_table, n_action, epsilon, alpha, gamma):
        self.q_table = q_table
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.n_action = n_action
    
    def get_action(self, state):
    
        # 구현해보세요. (e-greedy policy)
        
        return action
    
    def q_update(self, state, next_state, action, reward):
    
        # 구현해보세요.
        # update 수식은 pseudo code 참조
    
    return q_table

### Training agent

In [82]:
env = gym.make("FrozenLakeNotSlippery-v0")

EPISODE = 500
epsilon = 0.9
alpha = 0.8 # learning rate
gamma = 0.9 # discount factor
n_action = env.action_space.n

is_render = False

# initialize Q-Table 
q_table = np.zeros([env.observation_space.n, env.action_space.n])
print("Q table size: ", q_table.shape)

# agent 생성
agent = Tabular_Q_agent(q_table, n_action, epsilon, alpha, gamma)

# Epiode 수만큼 반복
for e in range(EPISODE):
    state = env.reset()
    print("[Episode {}]".format(e))
    if is_render:
        env.render()
    
    total_reward = 0
    done = False
    limit = 0
    
    # 게임이 끝날때까지 반복 또는 100번 step할 때까지 반복
    while not done and limit < 100:
        # 1. select action by e-greedy policy
        # e-greedy로 action을 선택.
            
        # 2. do action and go to next state
        # env.step()을 사용해 1 step 이동 후 next state와 reward, done 값을 받아옴.
        
        # 3. Q update
        # Q table에서 현재 state의 Q값을 update 한다.
        
        state = next_state
        total_reward += reward
        limit += 1
        
    print("total reward: ", total_reward)


Q table size:  (16, 4)
[Episode 0]
total reward:  -1.0
[Episode 1]
total reward:  -1.0
[Episode 2]
total reward:  -1.0
[Episode 3]
total reward:  -1.0
[Episode 4]
total reward:  -1.0
[Episode 5]
total reward:  -1.0
[Episode 6]
total reward:  -1.0
[Episode 7]
total reward:  -1.0
[Episode 8]
total reward:  -1.0
[Episode 9]
total reward:  -1.0
[Episode 10]
total reward:  -1.0
[Episode 11]
total reward:  -1.0
[Episode 12]
total reward:  -1.0
[Episode 13]
total reward:  -1.0
[Episode 14]
total reward:  -1.0
[Episode 15]
total reward:  -1.0
[Episode 16]
total reward:  -1.0
[Episode 17]
total reward:  -1.0
[Episode 18]
total reward:  -1.0
[Episode 19]
total reward:  -1.0
[Episode 20]
total reward:  -1.0
[Episode 21]
total reward:  -1.0
[Episode 22]
total reward:  -1.0
[Episode 23]
total reward:  -1.0
[Episode 24]
total reward:  -1.0
[Episode 25]
total reward:  -1.0
[Episode 26]
total reward:  -1.0
[Episode 27]
GOAL
total reward:  1.0
[Episode 28]
total reward:  -1.0
[Episode 29]
total reward:

GOAL
total reward:  1.0
[Episode 263]
GOAL
total reward:  1.0
[Episode 264]
GOAL
total reward:  1.0
[Episode 265]
GOAL
total reward:  1.0
[Episode 266]
GOAL
total reward:  1.0
[Episode 267]
GOAL
total reward:  1.0
[Episode 268]
GOAL
total reward:  1.0
[Episode 269]
GOAL
total reward:  1.0
[Episode 270]
GOAL
total reward:  1.0
[Episode 271]
GOAL
total reward:  1.0
[Episode 272]
GOAL
total reward:  1.0
[Episode 273]
GOAL
total reward:  1.0
[Episode 274]
GOAL
total reward:  1.0
[Episode 275]
GOAL
total reward:  1.0
[Episode 276]
GOAL
total reward:  1.0
[Episode 277]
GOAL
total reward:  1.0
[Episode 278]
GOAL
total reward:  1.0
[Episode 279]
GOAL
total reward:  1.0
[Episode 280]
GOAL
total reward:  1.0
[Episode 281]
GOAL
total reward:  1.0
[Episode 282]
GOAL
total reward:  1.0
[Episode 283]
GOAL
total reward:  1.0
[Episode 284]
GOAL
total reward:  1.0
[Episode 285]
GOAL
total reward:  1.0
[Episode 286]
GOAL
total reward:  1.0
[Episode 287]
GOAL
total reward:  1.0
[Episode 288]
GOAL
total r

In [83]:
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
print(agent.q_table)

[[ 0.230  0.590  0.000  0.516]
 [ 0.425 -1.000  0.000  0.000]
 [ 0.000  0.000  0.000  0.000]
 [ 0.000 -0.960  0.000  0.000]
 [ 0.589  0.656 -1.000  0.508]
 [ 0.000  0.000  0.000  0.000]
 [-0.800  0.000 -0.960  0.000]
 [ 0.000  0.000  0.000  0.000]
 [ 0.000 -1.000  0.729  0.560]
 [ 0.525  0.810  0.776 -0.800]
 [ 0.000  0.897  0.000  0.000]
 [ 0.000  0.000  0.000  0.000]
 [ 0.000  0.000  0.000  0.000]
 [-0.998  0.804  0.900  0.358]
 [ 0.810  0.893  1.000  0.000]
 [ 0.000  0.000  0.000  0.000]]


### Test agent

In [84]:
state = env.reset()
done = False
limit = 0

agent.epsilon = 0.0
while not done and limit < 30:
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    env.render()
    state = next_state
    limit += 1

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


### 추가 slippery == True 환경에서 Goal로 가는 agent 학습시키기

In [109]:
env = gym.make("FrozenLake-v0")

# 위의 agent 코드를 복사해서 해봅시다.


Q table size:  (16, 4)
[Episode 0]
total reward:  -0.03
[Episode 1]
total reward:  -0.02
[Episode 2]
total reward:  -0.03
[Episode 3]
total reward:  -0.04
[Episode 4]
total reward:  -0.05
[Episode 5]
total reward:  -0.060000000000000005
[Episode 6]
total reward:  -0.17
[Episode 7]
total reward:  -0.02
[Episode 8]
total reward:  -0.060000000000000005
[Episode 9]
total reward:  -0.07
[Episode 10]
total reward:  -0.02
[Episode 11]
total reward:  -0.05
[Episode 12]
total reward:  -0.03
[Episode 13]
total reward:  -0.09999999999999999
[Episode 14]
total reward:  -0.07
[Episode 15]
total reward:  -0.060000000000000005
[Episode 16]
total reward:  -0.07
[Episode 17]
total reward:  -0.07
[Episode 18]
total reward:  -0.02
[Episode 19]
total reward:  -0.02
[Episode 20]
total reward:  -0.05
[Episode 21]
total reward:  -0.04
[Episode 22]
total reward:  -0.08
[Episode 23]
total reward:  -0.05
[Episode 24]
total reward:  -0.09999999999999999
[Episode 25]
total reward:  -0.10999999999999999
[Episode 2

total reward:  -0.2700000000000001
[Episode 372]
total reward:  -0.09999999999999999
[Episode 373]
total reward:  -0.04
[Episode 374]
total reward:  -0.04
[Episode 375]
total reward:  -0.04
[Episode 376]
total reward:  -0.13999999999999999
[Episode 377]
total reward:  -0.04
[Episode 378]
total reward:  -0.03
[Episode 379]
total reward:  -0.10999999999999999
[Episode 380]
total reward:  -0.10999999999999999
[Episode 381]
total reward:  -0.02
[Episode 382]
total reward:  -0.07
[Episode 383]
total reward:  -0.04
[Episode 384]
total reward:  -0.22000000000000006
[Episode 385]
total reward:  -0.08
[Episode 386]
total reward:  -0.04
[Episode 387]
total reward:  -0.13999999999999999
[Episode 388]
total reward:  -0.07
[Episode 389]
total reward:  -0.060000000000000005
[Episode 390]
total reward:  -0.07
[Episode 391]
total reward:  -0.12999999999999998
[Episode 392]
total reward:  -0.07
[Episode 393]
total reward:  -0.060000000000000005
[Episode 394]
total reward:  -0.05
[Episode 395]
total rew

total reward:  -0.09
[Episode 630]
total reward:  -0.3200000000000001
[Episode 631]
total reward:  -0.04
[Episode 632]
total reward:  -0.10999999999999999
[Episode 633]
total reward:  -0.05
[Episode 634]
total reward:  -0.03
[Episode 635]
total reward:  -0.09999999999999999
[Episode 636]
total reward:  -0.13999999999999999
[Episode 637]
total reward:  -0.04
[Episode 638]
total reward:  -0.16
[Episode 639]
total reward:  -0.18000000000000002
[Episode 640]
total reward:  -0.02
[Episode 641]
total reward:  -0.09
[Episode 642]
total reward:  -0.060000000000000005
[Episode 643]
total reward:  -0.10999999999999999
[Episode 644]
total reward:  -0.04
[Episode 645]
total reward:  -0.03
[Episode 646]
total reward:  -0.04
[Episode 647]
total reward:  -0.08
[Episode 648]
total reward:  -0.07
[Episode 649]
total reward:  -0.17
[Episode 650]
total reward:  -0.09
[Episode 651]
total reward:  -0.07
[Episode 652]
total reward:  -0.060000000000000005
[Episode 653]
total reward:  -0.07
[Episode 654]
GOAL

total reward:  -0.10999999999999999
[Episode 846]
GOAL
total reward:  0.9
[Episode 847]
total reward:  -0.05
[Episode 848]
total reward:  -0.11999999999999998
[Episode 849]
total reward:  -0.03
[Episode 850]
total reward:  -0.09
[Episode 851]
total reward:  -0.15
[Episode 852]
total reward:  -0.09999999999999999
[Episode 853]
total reward:  -0.10999999999999999
[Episode 854]
total reward:  -0.11999999999999998
[Episode 855]
total reward:  -0.03
[Episode 856]
total reward:  -0.04
[Episode 857]
total reward:  -0.060000000000000005
[Episode 858]
total reward:  -0.13999999999999999
[Episode 859]
total reward:  -0.08
[Episode 860]
GOAL
total reward:  0.94
[Episode 861]
GOAL
total reward:  0.82
[Episode 862]
total reward:  -0.09
[Episode 863]
total reward:  -0.05
[Episode 864]
total reward:  -0.05
[Episode 865]
total reward:  -0.20000000000000004
[Episode 866]
total reward:  -0.18000000000000002
[Episode 867]
total reward:  -0.21000000000000005
[Episode 868]
total reward:  -0.09
[Episode 869

GOAL
total reward:  0.9299999999999999
[Episode 1108]
total reward:  -0.03
[Episode 1109]
total reward:  -0.02
[Episode 1110]
total reward:  -0.09
[Episode 1111]
total reward:  -0.10999999999999999
[Episode 1112]
total reward:  -0.03
[Episode 1113]
total reward:  -0.04
[Episode 1114]
total reward:  -0.10999999999999999
[Episode 1115]
total reward:  -0.19000000000000003
[Episode 1116]
total reward:  -0.02
[Episode 1117]
total reward:  -0.04
[Episode 1118]
total reward:  -0.02
[Episode 1119]
total reward:  -0.060000000000000005
[Episode 1120]
total reward:  -0.05
[Episode 1121]
total reward:  -0.18000000000000002
[Episode 1122]
GOAL
total reward:  0.9
[Episode 1123]
total reward:  -0.07
[Episode 1124]
total reward:  -0.03
[Episode 1125]
total reward:  -0.02
[Episode 1126]
total reward:  -0.16
[Episode 1127]
total reward:  -0.02
[Episode 1128]
total reward:  -0.16
[Episode 1129]
total reward:  -0.02
[Episode 1130]
total reward:  -0.02
[Episode 1131]
total reward:  -0.19000000000000003
[Ep

total reward:  -0.07
[Episode 1332]
total reward:  -0.03
[Episode 1333]
total reward:  -0.09
[Episode 1334]
total reward:  -0.02
[Episode 1335]
total reward:  -0.03
[Episode 1336]
total reward:  -0.08
[Episode 1337]
total reward:  -0.09999999999999999
[Episode 1338]
total reward:  -0.05
[Episode 1339]
total reward:  -0.03
[Episode 1340]
total reward:  -0.060000000000000005
[Episode 1341]
total reward:  -0.07
[Episode 1342]
GOAL
total reward:  0.95
[Episode 1343]
total reward:  -0.04
[Episode 1344]
total reward:  -0.11999999999999998
[Episode 1345]
total reward:  -0.060000000000000005
[Episode 1346]
total reward:  -0.04
[Episode 1347]
total reward:  -0.03
[Episode 1348]
total reward:  -0.11999999999999998
[Episode 1349]
total reward:  -0.02
[Episode 1350]
total reward:  -0.05
[Episode 1351]
total reward:  -0.07
[Episode 1352]
total reward:  -0.04
[Episode 1353]
total reward:  -0.02
[Episode 1354]
total reward:  -0.02
[Episode 1355]
total reward:  -0.04
[Episode 1356]
total reward:  -0.0

GOAL
total reward:  0.9
[Episode 1708]
total reward:  -0.08
[Episode 1709]
total reward:  -0.03
[Episode 1710]
total reward:  -0.05
[Episode 1711]
total reward:  -0.03
[Episode 1712]
total reward:  -0.02
[Episode 1713]
total reward:  -0.05
[Episode 1714]
total reward:  -0.02
[Episode 1715]
total reward:  -0.03
[Episode 1716]
total reward:  -0.060000000000000005
[Episode 1717]
total reward:  -0.02
[Episode 1718]
total reward:  -0.02
[Episode 1719]
total reward:  -0.09999999999999999
[Episode 1720]
GOAL
total reward:  0.7799999999999999
[Episode 1721]
GOAL
total reward:  0.7799999999999999
[Episode 1722]
total reward:  -0.03
[Episode 1723]
total reward:  -0.10999999999999999
[Episode 1724]
total reward:  -0.05
[Episode 1725]
total reward:  -0.07
[Episode 1726]
total reward:  -0.07
[Episode 1727]
total reward:  -0.08
[Episode 1728]
total reward:  -0.02
[Episode 1729]
total reward:  -0.060000000000000005
[Episode 1730]
total reward:  -0.02
[Episode 1731]
total reward:  -0.03
[Episode 1732]

In [110]:
print(agent.q_table)

[[-0.061 -0.060 -0.020 -0.062]
 [-0.028 -0.061 -0.061 -0.059]
 [-0.054 -0.021 -0.054 -0.052]
 [-0.053 -0.046 -0.010 -0.046]
 [-0.050 -0.010 -0.056 -0.055]
 [ 0.000  0.000  0.000  0.000]
 [-0.035 -0.010 -0.034 -0.028]
 [ 0.000  0.000  0.000  0.000]
 [-0.047  0.027 -0.046 -0.046]
 [-0.026 -0.028  0.099 -0.027]
 [-0.027  0.019 -0.026 -0.024]
 [ 0.000  0.000  0.000  0.000]
 [ 0.000  0.000  0.000  0.000]
 [ 0.001  0.016  0.297  0.019]
 [ 0.096  0.917  0.126  0.090]
 [ 0.000  0.000  0.000  0.000]]


In [115]:
state = env.reset()
done = False
limit = 0
agent.epsilon = 0.0

while not done and limit < 30:
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    env.render()
    state = next_state
    limit += 1

  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
