In [1]:
import numpy as np

In [2]:
class Environment:
    """5x5 grid world evaluated under a uniformly random policy.

    Actions are encoded as 0 = up, 1 = down, 2 = left, 3 = right.
    Two cells in the top row are special: every action taken in (0, 1)
    jumps to (4, 1) with reward 10, and every action taken in (0, 3)
    jumps to (2, 3) with reward 5.  Trying to leave the grid keeps the
    agent in place and yields reward -1; every other move yields 0.
    """

    def __init__(self):
        # State-value estimate, one entry per grid cell, starting at zero.
        self.value_table = np.zeros((5, 5))

    def update(self):
        """Run one synchronous evaluation sweep over all 25 states.

        Each state's new value is the average over the four actions
        (weight 0.25 each) of ``reward + 0.9 * V(next_state)``, where
        ``V`` is the table from *before* the sweep.  The result is stored
        in ``self.value_table``; the intermediate table is also kept as
        ``self.next_value_table``.
        """
        self.next_value_table = self.value_table.copy()
        for row in range(5):
            for col in range(5):
                self.next_value_table[row, col] = 0
                for action in range(4):
                    next_row, next_col, reward = self.step((row, col), action)
                    self.next_value_table[row, col] += 0.25 * (reward + 0.9 * self.value_table[next_row, next_col])
        self.value_table = self.next_value_table.copy()

    def step(self, state:tuple[int, int], action)->tuple[int, int, int]:
        """Return the outcome of taking ``action`` in ``state``.

        Args:
            state: ``(row, col)`` of the current cell.
            action: 0 = up, 1 = down, 2 = left; any other value is
                treated as 3 = right.

        Returns:
            ``(next_row, next_col, reward)``.
        """

        if state[0] == 0 and state[1] == 1:
            next_row, next_col = 4, 1
            reward = 10

        elif state[0] == 0 and state[1] == 3:
            next_row, next_col = 2, 3
            reward = 5

        else:  # ordinary cell
            if action == 0:
                next_row, next_col = state[0] - 1, state[1]
            elif action == 1:
                next_row, next_col = state[0] + 1, state[1]
            elif action == 2:
                next_row, next_col = state[0], state[1] - 1
            else: # elif action == 3:
                next_row, next_col = state[0], state[1] + 1

            # Moving off the grid: stay in place and take the penalty.
            if next_row < 0 or next_row >= 5 or next_col < 0 or next_col >= 5:
                reward = -1
                next_row, next_col = state[0], state[1]
            else:
                reward = 0

        return next_row, next_col, reward

    def get_poilcy(self):
        """Return a 5x5 list of arrows pointing at the best neighbouring cell.

        For each ordinary cell the four neighbours' values in
        ``self.value_table`` are compared and the arrow of the largest one
        is chosen (the first action wins ties).  Neighbours outside the
        grid are never chosen.  The two special cells (0, 1) and (0, 3)
        are marked with ``"*"``.

        The method name is misspelled but is kept as-is because callers
        use it.
        """
        dir_table = {0: "↑", 1: "↓", 2: "←", 3: "→"}
        # (row offset, col offset) for actions 0..3, in the same order as step().
        offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))
        direction_list = []
        for row in range(5):
            direction_list_row = []
            for col in range(5):
                if row == 0 and col in (1, 3):
                    direction_list_row.append("*")
                    continue
                action_value = []
                for d_row, d_col in offsets:
                    n_row, n_col = row + d_row, col + d_col
                    # Explicit bounds check: a negative index would not raise
                    # IndexError in NumPy, it would silently wrap around to
                    # the opposite edge of the grid.
                    if 0 <= n_row < 5 and 0 <= n_col < 5:
                        action_value.append(self.value_table[n_row, n_col])
                    else:
                        action_value.append(-np.inf)

                direction_list_row.append(dir_table[int(np.argmax(action_value))])
            direction_list.append(direction_list_row)
        return direction_list

In [3]:
# Evaluate the random policy: show the initial table, run 100 sweeps,
# then show the resulting values and the policy derived from them.
env = Environment()
print("초기 상태 가치 함수", env.value_table, sep="\n")

for i in range(100):
    env.update()

print("\n무작위 행동으로 수렴된 상태 가치 함수 (100번 반복)", env.value_table, sep="\n")

policy_grid = np.array(env.get_poilcy())
print("\n상태 가치 함수로 계산한 정책", policy_grid, sep="\n")

초기 상태 가치 함수
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

무작위 행동으로 수렴된 상태 가치 함수 (100번 반복)
[[ 3.30899932  8.78929484  4.42762216  5.32237057  1.49218174]
 [ 1.52159105  2.99232084  2.25014293  1.90757468  0.54740569]
 [ 0.05082547  0.73817357  0.67311624  0.35818919 -0.40313816]
 [-0.97358932 -0.43549245 -0.35487929 -0.58560211 -1.1830721 ]
 [-1.85769757 -1.34522828 -1.22926428 -1.42291517 -1.97517607]]

상태 가치 함수로 계산한 정책
[['→' '*' '←' '*' '←']
 ['↑' '↑' '↑' '↑' '←']
 ['↑' '↑' '↑' '↑' '↑']
 ['↑' '↑' '↑' '↑' '↑']
 ['↑' '↑' '↑' '↑' '↑']]


In [4]:
class Environment2:
    """5x5 grid world whose action values are improved by greedy backups.

    Actions are encoded as 0 = up, 1 = down, 2 = left, 3 = right.
    Every action in cell (0, 1) jumps to (4, 1) with reward 10 and every
    action in cell (0, 3) jumps to (2, 3) with reward 5.  Trying to leave
    the grid keeps the agent in place with reward -1; other moves give 0.
    """

    def __init__(self):
        # Action-value table indexed as [row, col, action], starting at zero.
        self.q_table = np.zeros((5, 5, 4))

    def update(self):
        """Run one synchronous sweep over every (row, col, action) entry.

        Each entry becomes ``reward + 0.9 * max_a Q(next_state, a)``,
        where ``Q`` is the table from before the sweep.  The intermediate
        table is kept as ``self.next_q_table``.
        """
        self.next_q_table = self.q_table.copy()
        for row, col, action in np.ndindex(5, 5, 4):
            (next_row, next_col), reward = self.get_next_state(row, col, action)
            best_next = np.max(self.q_table[next_row, next_col, :])
            self.next_q_table[row, col, action] = (reward + 0.9 * best_next)
        self.q_table = self.next_q_table.copy()

    def get_next_state(self, row, col, action):
        """Return ``((next_row, next_col), reward)`` for one move.

        Any ``action`` other than 0, 1 or 2 is treated as 3 (right).
        """
        # The two special cells ignore the action entirely.
        if row == 0 and col == 1:
            return (4, 1), 10
        if row == 0 and col == 3:
            return (2, 3), 5

        # Ordinary cell: translate the action into a one-cell displacement.
        d_row, d_col = {0: (-1, 0), 1: (1, 0), 2: (0, -1)}.get(action, (0, 1))
        next_row, next_col = row + d_row, col + d_col

        if 0 <= next_row < 5 and 0 <= next_col < 5:
            return (next_row, next_col), 0

        # Moving off the grid: stay in place and take the penalty.
        return (row, col), -1

    def get_poilcy(self):
        """Return a 5x5 list of strings showing every greedy action per cell.

        Each ordinary cell lists the arrows of all actions whose value
        equals the cell's maximum, padded with spaces to at least two
        characters.  The special cells (0, 1) and (0, 3) are shown as
        ``"* "``.
        """
        arrows = {0: "↑", 1: "↓", 2: "←", 3: "→"}
        jump_cells = ((0, 1), (0, 3))
        policy_rows = []
        for row in range(5):
            cells = []
            for col in range(5):
                if (row, col) in jump_cells:
                    cells.append("* ")
                    continue

                # Indices of every action that attains the maximum value.
                q_values = self.q_table[row, col, :]
                best_actions = np.flatnonzero(q_values == np.max(q_values)).tolist()

                label = "".join(arrows[a] for a in best_actions)
                cells.append(label.ljust(2))

            policy_rows.append(cells)
        return policy_rows

In [5]:
# Solve for action values: show the initial greedy values, run 100 sweeps,
# then show the resulting greedy values and the policy derived from them.
env = Environment2()
print("초기 상태 가치 함수", np.max(env.q_table, axis=2), sep="\n")

for i in range(100):
    env.update()

greedy_values = np.max(env.q_table, axis=2)
print("\n최적 행동 가치 함수로 계산한 최적 상태 가치 함수  (100번 반복)", greedy_values, sep="\n")

policy_grid = np.array(env.get_poilcy())
print("\n최적 행동 가치 함수로 계산한 정책", policy_grid, sep="\n")

초기 상태 가치 함수
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

최적 행동 가치 함수로 계산한 최적 상태 가치 함수  (100번 반복)
[[21.97690153 24.41877948 21.97690153 19.41877948 17.47690153]
 [19.77921138 21.97690153 19.77921138 17.80129024 16.02116122]
 [17.80129024 19.77921138 17.80129024 16.02116122 14.41877948]
 [16.02116122 17.80129024 16.02116122 14.41877948 12.97690153]
 [14.41877948 16.02116122 14.41877948 12.97690153 11.67921138]]

최적 행동 가치 함수로 계산한 정책
[['→ ' '* ' '← ' '* ' '← ']
 ['↑→' '↑ ' '↑←' '← ' '← ']
 ['↑→' '↑ ' '↑←' '↑←' '↑←']
 ['↑→' '↑ ' '↑←' '↑←' '↑←']
 ['↑→' '↑ ' '↑←' '↑←' '↑←']]
