# DP_Grid-world

David Silver의 3강 Dynamic Programming에 등장하는 Small Grid-world 예제를 구현한 코드입니다.

<img src="imgs/grid-world.jpg">

In [1]:
class Action:
    """The four moves available in the grid world.

    Each move is stored as a ``(row_delta, col_delta)`` offset, with a
    human-readable label at the same index in a parallel list.
    """

    def __init__(self, policy=None):
        """Create the fixed action set.

        Args:
            policy: Accepted so existing ``Action(policy)`` calls keep working;
                the value is not used or stored.
        """
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.actions_str = ["up", "down", "left", "right"]

    def get_actions(self):
        """Return the list of ``(row_delta, col_delta)`` move offsets."""
        return self.actions

    def get_actions_str(self):
        """Return the move labels, index-aligned with ``get_actions()``."""
        return self.actions_str

1. Policy를 Env에서 제외하는 방향으로 수정
2. Action 또한 Env에서 제외하는 방향으로 수정
3. State를 분리하는 방향으로 Coding

In [2]:
class Env:
    """Square grid world whose top-left and bottom-right cells are terminal.

    Attributes set by ``__init__``:
        grid_size: side length of the grid.
        terminal_state: list of the two terminal ``(row, col)`` tuples.
        grid: integer array, 0 on terminal cells and 1 everywhere else.
        policy: array of shape ``(grid_size, grid_size, N_ACTIONS)`` holding
            the action probabilities of every cell.
        action: the ``Action`` object describing the available moves.
    """

    # Number of moves per state; matches the four probabilities the original
    # code wrote into every non-terminal policy entry.
    N_ACTIONS = 4

    def __init__(self, grid_size=4):
        """Build the grid, the initial uniform policy and the action set.

        Args:
            grid_size: side length of the square grid (at least 1).

        Raises:
            ValueError: if ``grid_size`` is smaller than 1.
        """
        if grid_size < 1:
            raise ValueError(
                "grid_size must be at least 1, got {!r}".format(grid_size)
            )
        self.grid_size = grid_size
        self.terminal_state = [(0, 0), (grid_size - 1, grid_size - 1)]
        # Start from all ones and zero out the terminal cells.
        self.grid = np.ones((grid_size, grid_size), dtype=int)
        for terminal in self.terminal_state:
            self.grid[terminal] = 0
        self.policy = self.init_policy(self.terminal_state)
        self.action = Action(self.policy)

    def init_policy(self, terminal_state):
        """Return a uniform random policy with all-zero terminal entries.

        Args:
            terminal_state: collection of ``(row, col)`` tuples to leave at 0.

        Returns:
            Array of shape ``(grid_size, grid_size, N_ACTIONS)``.
        """
        # The last axis indexes actions, not grid cells, so it must be sized
        # by the action count; sizing it by grid_size only works for 4x4.
        policy = np.zeros((self.grid_size, self.grid_size, self.N_ACTIONS))
        uniform = 1.0 / self.N_ACTIONS
        for row in range(self.grid_size):
            for col in range(self.grid_size):
                if (row, col) not in terminal_state:
                    policy[row, col] = uniform
        return policy

    def set_policy(self, policy):
        """Replace the stored policy table."""
        self.policy = policy

    def get_policy(self):
        """Return the stored policy table (the same object, not a copy)."""
        return self.policy

    def get_actions(self):
        """Return the move offsets provided by the ``Action`` object."""
        return self.action.get_actions()

    def step(self, state, action):
        """Apply ``action`` in ``state`` and report the transition.

        Args:
            state: current ``(row, col)`` tuple.
            action: one of the offsets returned by ``get_actions()``.

        Returns:
            ``(next_state, action_label, reward, done)``. From a terminal
            state the result is ``(state, None, 0, True)``; otherwise the
            reward is -1 and moves that would leave the grid are clamped to
            the border.

        Raises:
            IndexError: if ``action`` is not one of the known offsets.
        """
        if state in self.terminal_state:
            return state, None, 0, True

        row, col = state

        matches = [
            idx for idx, act in enumerate(self.action.get_actions())
            if act == action
        ]
        if not matches:
            raise IndexError("unknown action: {!r}".format(action))
        action_str = self.action.get_actions_str()[matches[0]]

        act_row, act_col = action
        last = self.grid_size - 1
        # Clamp to [0, last] so a move off the grid leaves that coordinate
        # on the border.
        next_row = min(max(row + act_row, 0), last)
        next_col = min(max(col + act_col, 0), last)
        next_state = (next_row, next_col)

        return next_state, action_str, -1, next_state in self.terminal_state

In [53]:
import numpy as np

# Side length of the square grid handed to Env.
grid_size = 4
# Number of sweeps the policy-evaluation loop below performs.
episodes = 2
# Discount factor multiplied with the successor state's value.
gamma = 1
env = Env(grid_size)
# Move offsets exposed by the environment.
actions = env.get_actions()
# Same array object the environment holds, so in-place edits are shared.
policy = env.get_policy()

## Policy Iteration

In [54]:
## Policy Evaluation

# Each sweep builds a fresh value table from the previous one: a state's new
# value is the policy-weighted sum of (reward + gamma * value of next state).
post_value_table = np.zeros((grid_size, grid_size))

for episode in range(episodes):
    next_value_table = np.zeros((grid_size, grid_size))
    for row, col in np.ndindex(grid_size, grid_size):
        state = (row, col)
        expected_return = 0
        for idx_action, action in enumerate(actions):
            next_state, action_str, reward, done = env.step(state, action)
            backup = reward + gamma * post_value_table[next_state]
            expected_return += policy[row, col, idx_action] * backup
        next_value_table[state] = expected_return
    # The table is swapped only after the full sweep, so every state in a
    # sweep is computed from the previous sweep's values.
    post_value_table = next_value_table
    print(post_value_table)

[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]
[[ 0.   -1.75 -2.   -2.  ]
 [-1.75 -2.   -2.   -2.  ]
 [-2.   -2.   -2.   -1.75]
 [-2.   -2.   -1.75  0.  ]]


In [55]:
## Policy Improvement

# Make the policy greedy with respect to the current value table: in every
# non-terminal state, spread the probability evenly over the actions whose
# one-step look-ahead value is maximal.
for row in range(grid_size):
    for col in range(grid_size):
        state = (row, col)
        # Ask the environment which states are terminal instead of guessing
        # from "all successor values are 0", which also matches an all-zero
        # value table. Terminal policy entries are left untouched.
        if state in env.terminal_state:
            continue

        action_list = list()
        for action in actions:
            next_state, action_str, reward, done = env.step(state, action)
            # Same backup as the value-iteration cell: reward plus the
            # discounted value of the successor state.
            action_list.append(reward + gamma * post_value_table[next_state])
        action_list = np.array(action_list)

        # Exact equality keeps ties between equally good actions.
        greedy_idx = np.flatnonzero(action_list == action_list.max())
        greedy_policy_value = 1.0 / len(greedy_idx)
        updated_policy = np.zeros(len(actions))
        updated_policy[greedy_idx] = greedy_policy_value
        # `policy` is the array held by env, so this updates it in place.
        policy[state] = updated_policy

## Value Iteration

In [60]:
import numpy as np

# Side length of the square grid handed to Env.
grid_size = 4
# Number of sweeps the value-iteration loop below performs.
episodes = 5
# Discount factor multiplied with the successor state's value.
gamma = 1
# A fresh environment, so the policy starts from its initial values again.
env = Env(grid_size)
# Move offsets exposed by the environment.
actions = env.get_actions()
# Fetched here but not read by the value-iteration loop that follows.
policy = env.get_policy()

In [61]:
## Value Iteration

# Each sweep replaces a state's value with the best one-step look-ahead
# (reward + gamma * value of next state) over all actions.
post_value_table = np.zeros((grid_size, grid_size))

for episode in range(episodes):
    next_value_table = np.zeros((grid_size, grid_size))
    for row, col in np.ndindex(grid_size, grid_size):
        state = (row, col)
        action_list = []
        for action in actions:
            next_state, action_str, reward, done = env.step(state, action)
            value = reward + gamma * post_value_table[next_state]
            action_list.append(value)
        next_value_table[state] = max(action_list)
    # Swap only after the full sweep so all updates use the previous table.
    post_value_table = next_value_table
    print(post_value_table)

[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]
[[ 0. -1. -2. -2.]
 [-1. -2. -2. -2.]
 [-2. -2. -2. -1.]
 [-2. -2. -1.  0.]]
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
