# 01. Tabular Q Learning example

Tabular Q Learning을 실습해봅니다.
- 모든 state의 value function을 table에 저장하고 테이블의 각 요소를 Q Learning으로 업데이트 하는 것으로 학습합니다.

## Collab 용 package 설치 코드

In [None]:
!pip install gym

### package import

In [1]:
import tensorflow as tf
import numpy as np
import random
import gym
# from gym.wrappers import Monitor

np.random.seed(285)
tf.set_random_seed(285)

print("tensorflow version: ", tf.__version__)
print("gym version: ", gym.__version__)

  from ._conv import register_converters as _register_converters


tensorflow version:  1.7.0
gym version:  0.10.5


## Frozen Lake

**[state]**

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located
    
**[action]**

    LEFT = 0
    DOWN = 1
    RIGHT = 2
    UP = 3

In [2]:
from IPython.display import clear_output

# Load Environment
env = gym.make("FrozenLake-v0")
# init envrionmnet
env.reset()
# only 'Right' action agent
for _ in range(5):
    env.render()
    env.step(2)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


### Frozen Lake (not Slippery)

In [3]:
def register_frozen_lake_not_slippery(name):
    from gym.envs.registration import register
    register(
        id=name,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )

register_frozen_lake_not_slippery('FrozenLakeNotSlippery-v0')

In [4]:
env = gym.make("FrozenLakeNotSlippery-v0")
env.reset()
env.render()
env.step(2)
env.step(2)
env.step(1)
env.step(1)
env.step(1)
_,r,_,_=env.step(2)
env.render()
print(r)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
1.0


## Q-Learning
**Pseudo code**  
<img src="./img/qlearning_pseudo.png" width="60%" align="left">  

### Epsilon greedy

In [5]:
# epsilon greedy policy

def epsilon_greedy_action(epsilon, n_action, state, q_table):
    
        if epsilon > np.random.random():
            action = env.action_space.sample()
    
        else:
            # 가장 큰 Q값을 갖는 action을 고른다. 같은 action이 있으면 랜덤으로.
            action = np.argmax(q_table[state, :])
        
        return action

In [6]:
# epsilon greedy test

epsilon = 0
q_table = np.array([[1,0,0,0],
                            [0,0,0,1],
                            [0,1,0,0]])
for state in range(3):
    action = epsilon_greedy_action(epsilon, 4, state, q_table)
    print("state: {}    action: {}".format(state, action))

state: 0    action: 0
state: 1    action: 3
state: 2    action: 1


### Q-value update

In [17]:
def q_update(q_table, state, next_state, action, reward, alpha, gamma):
    
    # 구현해보세요.
    # update 수식은 pseudo code 참조
    q_table[state, action] = q_table[state, action] + \
                            alpha * ((reward + gamma * np.max(q_table[next_state, :])) - q_table[state, action])
    return q_table

In [19]:
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

q_table = np.array([[0,0,0,0],
                             [0,1,0,0]], dtype=np.float)
print("start\n", q_table)

reward = 1.0
alpha = 0.1
gamma = 0.9

for i in range(10):
    print("update {}".format(i))
    q_table = q_update(q_table, 0, 1, 2, reward, alpha, gamma)
    print(q_table)

start
 [[ 0.000  0.000  0.000  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 0
[[ 0.000  0.000  0.190  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 1
[[ 0.000  0.000  0.361  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 2
[[ 0.000  0.000  0.515  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 3
[[ 0.000  0.000  0.653  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 4
[[ 0.000  0.000  0.778  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 5
[[ 0.000  0.000  0.890  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 6
[[ 0.000  0.000  0.991  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 7
[[ 0.000  0.000  1.082  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 8
[[ 0.000  0.000  1.164  0.000]
 [ 0.000  1.000  0.000  0.000]]
update 9
[[ 0.000  0.000  1.238  0.000]
 [ 0.000  1.000  0.000  0.000]]


### Agent class

In [20]:
class Tabular_Q_agent:
    def __init__(self, q_table, n_action, epsilon, alpha, gamma):
        self.q_table = q_table
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.n_action = n_action
    
    def get_action(self, state):
    
        if self.epsilon > np.random.random():
            action = env.action_space.sample()
    
        else:
            # 가장 큰 Q값을 갖는 action을 고른다. 같은 action이 있으면 랜덤으로.
            action = np.argmax(self.q_table[state, :])
            # action = np.random.choice(np.flatnonzero(self.q_table[state, :] == self.q_table[state, :].max()))
            
        if self.epsilon > 0.01:
            self.epsilon -= 0.001
        
        return action
    
    def q_update(self, state, next_state, action, reward):
    
        self.q_table[state, action] = self.q_table[state, action] + \
                                               self.alpha * ((reward + self.gamma * np.max(self.q_table[next_state, :])) - self.q_table[state, action])
        

### Training agent

In [21]:
env = gym.make("FrozenLakeNotSlippery-v0")

EPISODE = 2000
epsilon = 0.9
alpha = 0.8 # learning rate
gamma = 0.9 # discount factor
n_action = env.action_space.n

rlist = []
slist = []

is_render = False

# initialize Q-Table 
q_table = np.random.rand(env.observation_space.n, env.action_space.n)
print("Q table size: ", q_table.shape)

# agent 생성
agent = Tabular_Q_agent(q_table, n_action, epsilon, alpha, gamma)

for e in range(EPISODE):
    state = env.reset()
    print("[Episode {}]".format(e))
    if is_render:
        env.render()
    
    total_reward = 0
    goal = 0
    done = False
    limit = 0
    while not done and limit < 99:
        # select action by e-greedy policy
        action = agent.get_action(state)
            
        # do action and go to next state
        next_state, reward, done, _ = env.step(action)
        if is_render:
            env.render()
            
        if reward == 1.0:
            print("GOAL")
            goal = 1
        elif done:
            reward = reward - 1
        
        # Q update
        agent.q_update(state, next_state, action, reward)
        
        slist.append(state)
        state = next_state
        
        total_reward += reward
        limit += 1
        
    print(slist)
    slist = []
    print("total reward: ", total_reward)
    rlist.append(goal)
    
print("성공한 확률" + str(sum(rlist) / EPISODE) + "%")

Q table size:  (16, 4)
[Episode 0]
[0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 4, 0, 4]
total reward:  -1.0
[Episode 1]
[0, 0, 0, 0, 1, 0, 0, 0, 1]
total reward:  -1.0
[Episode 2]
[0, 4]
total reward:  -1.0
[Episode 3]
[0, 0, 0, 1, 0, 4, 8]
total reward:  -1.0
[Episode 4]
[0, 4, 4, 8, 8, 4, 4, 0, 4, 0, 4]
total reward:  -1.0
[Episode 5]
[0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 4, 0, 0, 1, 1, 0, 0, 4, 8]
total reward:  -1.0
[Episode 6]
[0, 0, 0, 0, 4]
total reward:  -1.0
[Episode 7]
[0, 0, 0, 4, 0, 1, 1, 2, 2, 1, 2, 1, 2, 1, 0, 4, 4, 8]
total reward:  -1.0
[Episode 8]
[0, 1, 0, 0, 4, 0, 0, 4]
total reward:  -1.0
[Episode 9]
[0, 1, 1, 0, 1]
total reward:  -1.0
[Episode 10]
[0, 4, 0, 0, 1]
total reward:  -1.0
[Episode 11]
[0, 4, 0, 1, 1, 1, 2, 3, 3]
total reward:  -1.0
[Episode 12]
[0, 0, 0, 1, 1]
total reward:  -1.0
[Episode 13]
[0, 0, 4]
total reward:  -1.0
[Episode 14]
GOAL
[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 6, 10, 14]
total reward:  1.0
[Episode 15]
[0, 0, 4, 8, 4

GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 282]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 283]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 284]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 285]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 286]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 287]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 288]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 289]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 290]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 291]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 292]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 293]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 294]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 295]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 296]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 297]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 29

GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 495]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 496]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 497]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 498]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 499]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 500]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 501]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 502]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 503]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 504]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 505]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 506]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 507]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 508]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 509]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 510]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 51

GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 776]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 777]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 778]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 779]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 780]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 781]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 782]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 783]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 784]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 785]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 786]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 787]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 788]
GOAL
[0, 1, 2, 6, 10, 14, 14]
total reward:  1.0
[Episode 789]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 790]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 791]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episod

GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 995]
GOAL
[0, 1, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 996]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 997]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 998]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 999]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1000]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1001]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1002]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1003]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1004]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1005]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1006]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1007]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1008]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1009]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1010]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1

GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1372]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1373]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1374]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1375]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1376]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1377]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1378]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1379]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1380]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1381]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1382]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1383]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1384]
GOAL
[0, 0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1385]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1386]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1387]
GOAL
[0, 1, 2, 6, 10, 14]
total rewar

total reward:  1.0
[Episode 1677]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1678]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1679]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1680]
GOAL
[0, 1, 2, 6, 10, 9, 10, 14]
total reward:  1.0
[Episode 1681]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1682]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1683]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1684]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1685]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1686]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1687]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1688]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1689]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1690]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1691]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1692]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1693]

[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1950]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1951]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1952]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1953]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1954]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1955]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1956]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1957]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1958]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1959]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1960]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1961]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1962]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1963]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1964]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0
[Episode 1965]
GOAL
[0, 1, 2, 6, 10, 14]
total reward:  1.0


In [22]:
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
print(agent.q_table)

[[ 0.938  0.837  1.042  0.938]
 [ 0.938 -0.474  1.158  1.042]
 [ 1.042  1.287  1.021  1.158]
 [ 1.156 -0.140  0.270  0.467]
 [ 0.417  0.619 -0.474  0.938]
 [ 0.456  0.275  0.585  0.538]
 [-0.474  1.430 -0.142  1.158]
 [ 0.954  0.951  0.807  0.835]
 [ 0.504 -0.725  0.501  0.444]
 [ 0.505  0.539  1.430 -0.422]
 [ 1.281  1.589 -0.639  1.286]
 [ 0.401  0.085  0.031  0.049]
 [ 0.306  0.171  0.034  0.173]
 [-0.527  0.465  1.589  0.489]
 [ 1.428  1.589  1.765  1.430]
 [ 0.850  0.651  0.010  0.638]]


### Test agent

In [23]:
state = env.reset()
done = False
limit = 0

agent.epsilon = 0.0
while not done and limit < 30:
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    env.render()
    state = next_state
    limit += 1

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


## Goal에 도착하기 위해 생각해야 하는것
1. Goal에 한번이라도 도착해야만 reward가 나와서 update 된다 $\rightarrow$ goal에 어떻게 가게 할까?
2. np.argmax로 Q값이 가장 큰 것을 고르는데 같은 Q값일 경우 무조껀 작은 index를 고른다. $\rightarrow$ 같은 Q값일 경우 랜덤하게 고르게 해야 exploration 한다.
3. hole에 빠졌을 때 episode가 끝나긴 하지만 reward에 차이는 없다. $\rightarrow$ hole에 빠져서 끝나면 negative reward를 주도록 한다.
4. 학습이 잘 되어도 epsilon 만큼의 확률로 random action을 한다. $\rightarrow$ 학습이 진행될수록 epsilon을 줄인다.

### 추가 slippery == True 환경에서 Goal로 가는 agent 학습시키기

In [24]:
class Tabular_Q_agent:
    def __init__(self, q_table, n_action, epsilon, alpha, gamma):
        self.q_table = q_table
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.n_action = n_action
    
    def get_action(self, state):
    
        if self.epsilon > np.random.random():
            action = env.action_space.sample()
    
        else:
            # 가장 큰 Q값을 갖는 action을 고른다. 같은 action이 있으면 랜덤으로.
            action = np.argmax(self.q_table[state, :])
            
        if self.epsilon > 0.01:
            self.epsilon -= 0.00001
        
        return action
    
    def q_update(self, state, next_state, action, reward):
    
        self.q_table[state, action] = self.q_table[state, action] + \
                                               self.alpha * ((reward + self.gamma * np.max(self.q_table[next_state, :])) - self.q_table[state, action])

In [25]:
env = gym.make("FrozenLake-v0")
# env = Monitor(env, './video')

EPISODE = 2000
epsilon = 0.9
alpha = 0.8 # learning rate
gamma = 0.9 # discount factor
n_action = env.action_space.n

rlist = []
slist = []

is_render = False

# initialize Q-Table 
q_table = np.random.rand(env.observation_space.n, env.action_space.n)
print("Q table size: ", q_table.shape)

# agent 생성
agent = Tabular_Q_agent(q_table, n_action, epsilon, alpha, gamma)

for e in range(EPISODE):
    state = env.reset()
    print("[Episode {}]".format(e))
    if is_render:
        env.render()
    
    total_reward = 0
    goal = 0
    done = False
    limit = 0
    while not done and limit < 99:
        # select action by e-greedy policy
        action = agent.get_action(state)
            
        # do action and go to next state
        next_state, reward, done, _ = env.step(action)
        if is_render:
            env.render()
            
        if reward == 1.0:
            print("GOAL")
            goal = 1
        elif reward == 0.0:
            reward = reward - 0.01
        elif done:
            reward = reward - 0.5
        
        # Q update
        agent.q_update(state, next_state, action, reward)
        
        slist.append(state)
        state = next_state
        total_reward += reward
        limit += 1
        
    print(slist)
    slist = []
    print("total reward: ", total_reward)
    rlist.append(goal)
    
print("성공한 확률" + str(sum(rlist) / EPISODE) + "%")

Q table size:  (16, 4)
[Episode 0]
[0, 0, 4]
total reward:  -0.03
[Episode 1]
[0, 0, 0, 4, 0, 0, 1, 0, 4, 8, 4, 8]
total reward:  -0.11999999999999998
[Episode 2]
[0, 4, 4, 8]
total reward:  -0.04
[Episode 3]
[0, 4, 4]
total reward:  -0.03
[Episode 4]
[0, 1, 1]
total reward:  -0.03
[Episode 5]
[0, 0, 4, 8, 9, 13]
total reward:  -0.060000000000000005
[Episode 6]
[0, 4]
total reward:  -0.02
[Episode 7]
[0, 0, 0, 0, 0, 0, 4, 8, 4, 4, 4]
total reward:  -0.10999999999999999
[Episode 8]
[0, 4, 8, 8, 4, 4, 8, 4, 8]
total reward:  -0.09
[Episode 9]
[0, 1]
total reward:  -0.02
[Episode 10]
[0, 0, 0, 0, 4, 0, 4, 4]
total reward:  -0.08
[Episode 11]
[0, 1, 0, 0, 0, 4]
total reward:  -0.060000000000000005
[Episode 12]
[0, 0, 1, 1, 2, 6]
total reward:  -0.060000000000000005
[Episode 13]
[0, 1, 0, 1, 1, 1, 1, 0, 4]
total reward:  -0.09
[Episode 14]
[0, 4, 8]
total reward:  -0.03
[Episode 15]
[0, 4, 4]
total reward:  -0.03
[Episode 16]
[0, 0, 0, 0, 4, 0, 4, 4, 0, 4, 4, 0, 1]
total reward:  -0.1299999

[0, 0, 1, 2, 1, 2, 2, 3, 2, 6, 10, 6, 10, 14, 13, 9, 10, 6]
total reward:  -0.18000000000000002
[Episode 240]
[0, 4, 8, 4, 8, 4]
total reward:  -0.060000000000000005
[Episode 241]
[0, 1]
total reward:  -0.02
[Episode 242]
[0, 4, 4, 8, 9, 10]
total reward:  -0.060000000000000005
[Episode 243]
[0, 1, 1, 2, 3, 3, 3, 2, 6, 2, 2, 2, 2, 6, 2, 1]
total reward:  -0.16
[Episode 244]
[0, 4, 0, 0, 1, 2, 6, 2, 1, 2, 1, 0, 0, 0, 4]
total reward:  -0.15
[Episode 245]
[0, 0, 1]
total reward:  -0.03
[Episode 246]
[0, 1, 0, 0, 1]
total reward:  -0.05
[Episode 247]
[0, 1, 2, 6]
total reward:  -0.04
[Episode 248]
[0, 4, 4, 0, 0, 1, 2, 6]
total reward:  -0.08
[Episode 249]
[0, 1, 2, 3]
total reward:  -0.04
[Episode 250]
[0, 4, 4, 4, 8, 4]
total reward:  -0.060000000000000005
[Episode 251]
[0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4, 4, 4, 8, 9]
total reward:  -0.15
[Episode 252]
[0, 4, 8]
total reward:  -0.03
[Episode 253]
[0, 4, 8, 8, 8, 4, 0, 1, 2, 2, 6]
total reward:  -0.10999999999999999
[Episode 254]
[0, 0, 0, 

[Episode 460]
[0, 1, 2, 3]
total reward:  -0.04
[Episode 461]
[0, 1]
total reward:  -0.02
[Episode 462]
[0, 4, 8]
total reward:  -0.03
[Episode 463]
[0, 1, 2, 1, 2, 3, 3, 3, 2, 3, 3, 3, 3]
total reward:  -0.12999999999999998
[Episode 464]
[0, 0, 0, 1, 1, 0, 0, 4, 4, 8, 8, 4, 8, 4, 8, 9, 8, 8, 9]
total reward:  -0.19000000000000003
[Episode 465]
[0, 0, 1, 2, 6]
total reward:  -0.05
[Episode 466]
[0, 4, 0, 1]
total reward:  -0.04
[Episode 467]
[0, 0, 4]
total reward:  -0.03
[Episode 468]
[0, 0, 0, 0, 4, 4, 8, 4]
total reward:  -0.08
[Episode 469]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 470]
[0, 4]
total reward:  -0.02
[Episode 471]
[0, 0, 0, 0, 0, 1]
total reward:  -0.060000000000000005
[Episode 472]
[0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 4]
total reward:  -0.13999999999999999
[Episode 473]
[0, 0, 0, 4]
total reward:  -0.04
[Episode 474]
[0, 1, 2, 6, 2, 1, 2, 6, 2, 6, 10, 9, 10, 9, 13]
total reward:  -0.15
[Episode 475]
[0, 0, 4]
total reward:  -0.03
[Episode 476]
[0, 0, 0, 1, 1, 1, 

[0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 4, 0, 1]
total reward:  -0.21000000000000005
[Episode 740]
[0, 0, 0, 0, 1, 0, 1, 1]
total reward:  -0.08
[Episode 741]
[0, 4, 8]
total reward:  -0.03
[Episode 742]
[0, 1, 2, 3, 3]
total reward:  -0.05
[Episode 743]
[0, 0, 1, 2, 2, 6, 2, 6]
total reward:  -0.08
[Episode 744]
[0, 0, 1]
total reward:  -0.03
[Episode 745]
[0, 0, 1, 1]
total reward:  -0.04
[Episode 746]
[0, 4, 0, 1]
total reward:  -0.04
[Episode 747]
[0, 0, 1, 0, 4, 8, 4]
total reward:  -0.07
[Episode 748]
[0, 4, 0, 0, 4, 4, 0, 1]
total reward:  -0.08
[Episode 749]
[0, 1, 0, 0, 0, 0, 1, 2, 2, 3]
total reward:  -0.09999999999999999
[Episode 750]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 751]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 752]
[0, 0, 0, 1, 2, 6]
total reward:  -0.060000000000000005
[Episode 753]
[0, 0, 0, 4]
total reward:  -0.04
[Episode 754]
[0, 0, 1, 1, 1]
total reward:  -0.05
[Episode 755]
[0, 0, 4, 0, 0, 1]
total reward:  -0.060000000000000005
[Episode 7

[0, 1, 0, 0, 4, 4, 4, 4]
total reward:  -0.08
[Episode 995]
[0, 4, 4]
total reward:  -0.03
[Episode 996]
[0, 4]
total reward:  -0.02
[Episode 997]
[0, 0, 0, 4, 0, 1, 0, 4, 0, 0, 0, 1, 0, 1, 0, 4]
total reward:  -0.16
[Episode 998]
[0, 1, 0, 1, 2, 1, 2, 3]
total reward:  -0.08
[Episode 999]
[0, 0, 1, 2, 1, 2, 3, 3]
total reward:  -0.08
[Episode 1000]
[0, 4, 4, 8, 4, 0, 1, 1]
total reward:  -0.08
[Episode 1001]
[0, 0, 1, 0, 0, 0, 0, 1, 1, 2, 3]
total reward:  -0.10999999999999999
[Episode 1002]
[0, 4, 8, 9, 8, 9, 10, 14, 13]
total reward:  -0.09
[Episode 1003]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 1004]
[0, 1, 2, 3, 3]
total reward:  -0.05
[Episode 1005]
[0, 4, 8, 4]
total reward:  -0.04
[Episode 1006]
[0, 0, 0, 0, 0, 4]
total reward:  -0.060000000000000005
[Episode 1007]
[0, 4, 8, 8, 9, 13]
total reward:  -0.060000000000000005
[Episode 1008]
[0, 0, 1, 2, 1, 2, 2, 1, 2, 3, 3, 3]
total reward:  -0.11999999999999998
[Episode 1009]
[0, 1, 1, 0, 0, 0, 0, 0, 1]
total reward:  -0.09
[Epis

[0, 0, 4, 0, 4]
total reward:  -0.05
[Episode 1278]
[0, 0, 4, 8, 8, 4, 0, 0, 1]
total reward:  -0.09
[Episode 1279]
[0, 0, 4, 4, 4]
total reward:  -0.05
[Episode 1280]
[0, 1, 2, 1]
total reward:  -0.04
[Episode 1281]
[0, 0, 0, 4, 4, 4]
total reward:  -0.060000000000000005
[Episode 1282]
[0, 1, 1, 0, 1, 1]
total reward:  -0.060000000000000005
[Episode 1283]
[0, 1, 2, 2, 1, 1, 1]
total reward:  -0.07
[Episode 1284]
[0, 1, 1, 1]
total reward:  -0.04
[Episode 1285]
[0, 4, 4, 0, 4, 0, 1, 0, 0, 4, 4, 4, 4, 4, 8, 9, 13, 13, 14, 13, 9]
total reward:  -0.21000000000000005
[Episode 1286]
[0, 0, 4, 4, 0, 4, 8, 9, 13, 14, 13]
total reward:  -0.10999999999999999
[Episode 1287]
[0, 4, 4, 8]
total reward:  -0.04
[Episode 1288]
[0, 1, 2, 2, 3, 3]
total reward:  -0.060000000000000005
[Episode 1289]
[0, 0, 4]
total reward:  -0.03
[Episode 1290]
[0, 1, 2, 6, 10, 14, 14, 13, 9, 10, 9, 8]
total reward:  -0.11999999999999998
[Episode 1291]
[0, 1, 1, 2, 3, 3, 2, 2, 6]
total reward:  -0.09
[Episode 1292]
[0, 

[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 4, 8, 8]
total reward:  -0.16
[Episode 1579]
[0, 0, 0, 4]
total reward:  -0.04
[Episode 1580]
[0, 0, 0, 0, 4, 0, 4, 8]
total reward:  -0.08
[Episode 1581]
[0, 1]
total reward:  -0.02
[Episode 1582]
[0, 1]
total reward:  -0.02
[Episode 1583]
[0, 0, 4, 4, 4, 0, 0, 4, 8, 9, 10, 14, 14, 10, 9, 8, 4, 0, 0, 0, 1, 2, 2, 2, 6]
total reward:  -0.25000000000000006
[Episode 1584]
[0, 0, 0, 1, 1, 0, 4, 4, 4]
total reward:  -0.09
[Episode 1585]
[0, 0, 4, 8, 9, 10, 6]
total reward:  -0.07
[Episode 1586]
[0, 0, 4]
total reward:  -0.03
[Episode 1587]
[0, 0, 0, 4, 8]
total reward:  -0.05
[Episode 1588]
GOAL
[0, 0, 4, 4, 4, 8, 9, 13, 13, 13, 14, 14]
total reward:  0.89
[Episode 1589]
[0, 1, 1, 1, 2, 6, 10]
total reward:  -0.07
[Episode 1590]
[0, 1, 0, 1]
total reward:  -0.04
[Episode 1591]
[0, 0, 4]
total reward:  -0.03
[Episode 1592]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 1593]
GOAL
[0, 4, 8, 9, 10, 14, 14]
total reward:  0.94
[Episode 1594]
[0, 0, 0, 0, 0, 0

total reward:  -0.07
[Episode 1830]
[0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 2, 6]
total reward:  -0.11999999999999998
[Episode 1831]
[0, 1, 0, 4, 0, 0, 4]
total reward:  -0.07
[Episode 1832]
[0, 4]
total reward:  -0.02
[Episode 1833]
[0, 0, 1, 2, 2, 3, 2, 1, 1]
total reward:  -0.09
[Episode 1834]
[0, 0, 4]
total reward:  -0.03
[Episode 1835]
[0, 0, 4, 0, 0, 0, 1]
total reward:  -0.07
[Episode 1836]
[0, 4, 8, 4]
total reward:  -0.04
[Episode 1837]
[0, 4, 0, 1]
total reward:  -0.04
[Episode 1838]
[0, 0, 0, 1]
total reward:  -0.04
[Episode 1839]
[0, 0, 4, 8]
total reward:  -0.04
[Episode 1840]
[0, 1, 2, 1, 0, 4]
total reward:  -0.060000000000000005
[Episode 1841]
[0, 0, 0, 1, 0, 4, 8, 9, 8, 4, 8, 8, 8]
total reward:  -0.12999999999999998
[Episode 1842]
[0, 4, 8]
total reward:  -0.03
[Episode 1843]
[0, 1, 0, 0, 1, 0, 0, 4, 4, 4, 0, 4, 8, 9]
total reward:  -0.13999999999999999
[Episode 1844]
[0, 0, 4, 0, 0, 1, 0, 0, 0, 1, 2, 3]
total reward:  -0.11999999999999998
[Episode 1845]
[0, 1, 2, 2, 1, 2, 2,

In [26]:
print(agent.q_table)

[[ 0.604  0.670  0.543  0.570]
 [ 0.628  0.563  0.732  0.585]
 [ 0.606  0.611  0.556  0.483]
 [ 0.475  0.507  0.390  0.427]
 [ 0.699  0.658  0.628  0.726]
 [ 0.377  0.843  0.856  0.030]
 [ 0.629  0.711  0.516  0.573]
 [ 0.441  0.428  0.319  0.064]
 [ 0.616  0.618  0.686  0.930]
 [ 1.092  0.900  1.124  0.793]
 [ 0.851  1.004  0.866  0.638]
 [ 0.305  0.081  0.050  0.632]
 [ 0.693  0.674  0.106  0.192]
 [ 0.687  0.945  1.280  0.759]
 [ 1.293  1.600  1.077  1.090]
 [ 0.425  0.192  0.671  0.410]]


In [27]:
state = env.reset()
done = False
limit = 0
agent.epsilon = 0.0

while not done and limit < 30:
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    env.render()
    state = next_state
    limit += 1

  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
