## GridWorldEnvironment
```py
start_point = (0,0)
end_point = (4,4)
gridworld_size = (5,5)
env = GridWorldEnvironment(start_point, end_point, grid_world_size)
```

## Original Code

In [57]:
import numpy as np
from typing import Tuple

class GridWorldEnvironment:
    def __init__(self, start_point:Tuple, end_point:Tuple, grid_world_size:Tuple):
        # 시작점과 끝점을 받는다.
        self.start_point = start_point
        self.end_point = end_point if end_point != (-1,-1) else (grid_world_size[0] + end_point[0],
                                                                 grid_world_size[1] + end_point[1])

        # 그리드 월드의 규격을 받는다.
        self.width, self.height = grid_world_size
        self.grid_world_size = grid_world_size

        # action dictionary
        self.action_space = ['up', 'down', 'left', 'right']
        self.num_actions = len(self.action_space)
        self.actions = {'up':(-1,0),
                        'down':(1,0),
                        'left':(0,-1),
                        'right':(0,1) }

        # 상태 : 좌표로 나타남
        self.traces = []

        # total states
        self.total_states = []
        for x in range(self.width):
            for y in range(self.height):
                self.total_states.append((x,y))

        # reward
        self.reward = np.zeros(shape=(self.height, self.width)).tolist()
        self.reward[end_point[0]][end_point[1]] = 1

    def render(self):
        # 그리드 월드의 상태를 출력한다.
        self.grid_world = np.full(shape=(self.height, self.width), fill_value=".").tolist()

        last_point = self.traces[-1] # 에이전트가 가장 마지막에 있었던 위치
        traces = list(set(self.traces)) # 중복된 값을 삭제하기 위함
        for trace in traces:
            self.grid_world[trace[0]][trace[1]] = "X"

        self.grid_world[self.start_point[0]][self.start_point[1]] = "S" # start point
        self.grid_world[self.end_point[0]][self.end_point[1]] = "G" # end point
        self.grid_world[last_point[0]][last_point[1]] = "A" # 현재 에이전트의 위치

        # string으로 출력한다.
        grid = ""

        for i in range(self.height):
            for j in range(self.width):
                grid += self.grid_world[i][j]+" "
            grid += "\n"

        print(grid)

    def get_reward(self, state, action_idx):
        next_state = self.state_after_action(state, action_idx)
        return self.reward[next_state[0]][next_state[1]]

    def state_after_action(self, state, action_idx:int):
        action = self.action_space[action_idx]
        row_movement, col_movement = self.actions[action]

        # action에 따라 에이전트 이동
        next_state = (state[0]+row_movement, state[1]+col_movement)
        next_state = self.check_boundary(next_state)

        return next_state

    def check_boundary(self, state):
        state = list(state)
        state[0] = (0 if state[0] < 0 else self.height - 1 if state[0] > self.height - 1 else state[0])
        state[1] = (0 if state[1] < 0 else self.width - 1 if state[1] > self.width - 1 else state[1])
        return tuple(state)

# Deep SARSA Class

In [38]:
env = GridWorldEnvironment(start_point=(0,0), #위 환경 클래스를 상속 받아 `env.render` 코드를 구현
                           end_point=(4,4),
                           grid_world_size=(5,5))

In [56]:
import numpy as np
import random
import copy
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

class MyDeepSARSAEnv(GridWorldEnvironment):
  def __init__(self, env, alpha=0.001, gamma=0.9, epsilon=0.1):
    super().__init__(env.start_point, env.end_point, env.grid_world_size)
    self.env = self
    self.alpha = alpha # learning rate
    self.gamma = gamma # discount (낮을수록 현재 보상을, 1에 가까울수록 미래 보상을 중요하게 여김)
    self.epsilon = epsilon # exploration rate (최적 기대값 외 다른 행동도 일정 확률로 선택, 탐험)
    self.state_size = 2 # 상태 크기, 2는 좌표크기
    self.action_size = len(self.action_space) # 행동 공간의 크기

    # Q-NW 구축
    self.model = Sequential() # 신경망 모델 순차적으로 쌓을 준비
    self.model.add(Dense(24, input_dim=self.state_size, activation='relu')) # FCL, input layer + hl 1
    self.model.add(Dense(24, activation='relu')) # hl 2
    self.model.add(Dense(self.action_size, activation='linear')) # output layer
    self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha)) # 평균 제곱 오차로 손실함수 계산

  def choose_action(self, state): # 행동 선택 함수
    if np.random.rand() < self.epsilon: # 무작위(0~1) 수가 epsilon보다 작으면 탐험, 크면 활용
      return np.random.choice(range(len(self.action_space))) # 행동 랜덤하게 선택
    else:
      state = np.array(state).reshape(1, self.state_size)
      q_values = self.model.predict(state)[0]
      return np.argmax(q_values) # 최대 기대값의 행동 반환

  def learn(self, state, action, reward, next_state, next_action):
    # 상태-행동을 신경망의 입력 형태로 변환
    state = np.array(state).reshape(1, self.state_size)
    next_state = np.array(next_state).reshape(1, self.state_size)

    # 현재 상태, 다음 상태 Q_value 예측
    q_values = self.model.predict(state)[0]
    next_q_values = self.model.predict(next_state)[0]

    # Q_value 계산
    target = reward + self.gamma * next_q_values[next_action]

    # Q-value 업데이트
    q_values[action] = target

    # 신경망 학습
    self.model.fit(state, np.array([q_values]), epochs=1, verbose=0)

  def run_sarsa(self, episode = 15):
    for ep in range(episode):
      state = self.start_point
      self.env.traces = [state]
      action = self.choose_action(state)
      move_count = 0

      while True:
        reward = self.get_reward(state, action)
        next_state = self.state_after_action(state, action)
        next_action = self.choose_action(next_state)
        self.env.traces.append(next_state)

        self.learn(state, action, reward, next_state, next_action)
        state = next_state
        action = next_action

        move_count += 1

        if next_state == self.end_point:
          break

      print(f"Episode: {ep+1} complete, Moves: {move_count}")
      self.render()

## Main

- **Deep SARSA**를 이용해 그리드 월드 학습시키기  
- 학습 지표 시각화 (에피소드마다 에이전트의 이동 횟수 시각화)

In [60]:
env = GridWorldEnvironment(start_point=(0,0), #위 환경 클래스를 상속 받아 `env.render` 코드를 구현
                           end_point=(4,4),
                           grid_world_size=(5,5))

In [61]:
agent1 = MyDeepSARSAEnv(env)
agent1.run_sarsa()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

---

# Q-learning Class

In [62]:
env = GridWorldEnvironment(start_point=(0,0), #위 환경 클래스를 상속 받아 `env.render` 코드를 구현
                           end_point=(4,4),
                           grid_world_size=(5,5))

In [67]:
import numpy as np
import random
import copy

class MyQLearningEnv(GridWorldEnvironment):
  def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
    super().__init__(env.start_point, env.end_point, env.grid_world_size)
    self.env = self
    self.alpha = alpha # learning rate
    self.gamma = gamma # discount (낮을수록 현재 보상을, 1에 가까울수록 미래 보상을 중요하게 여김)
    self.epsilon = epsilon # exploration rate (최적 기대값 외 다른 행동도 일정 확률로 선택, 탐험)
    # Q 테이블 초기화 (상태-행동)
    self.Qtable = {}
    for state in self.total_states:
      for a in range(len(self.action_space)):
        self.Qtable[(state,a)] = 0.0

  def choose_action(self, state): # 행동 선택 함수
    if np.random.rand() < self.epsilon: # 무작위(0~1) 수가 epsilon보다 작으면 탐험, 크면 활용
      return np.random.choice(range(len(self.action_space))) # 행동 랜덤하게 선택
    else:
      qs = [self.Qtable[(state,a)] for a in range(len(self.action_space))]
      return np.argmax(qs) # 최대 기대값의 행동 반환

  def learn(self, state, action, reward, next_state): # next_action 제거
    # Q값 업데이트 (Q-Learning)
    current_q = self.Qtable[(state, action)]
    # SARSA와 달리 next_action에 상관없이 next_state에서 최대 Q값을 가지는 action을 선택
    next_max_q = np.max([self.Qtable[(next_state, a)] for a in range(len(self.action_space))])  # next_action 제거
    self.Qtable[(state, action)] += self.alpha * (reward + self.gamma * next_max_q - current_q)

  def run_qlearning(self, episode = 15):
    for ep in range(episode):
      state = self.start_point
      self.env.traces = [state]
      move_count = 0

      while True:
        action = self.choose_action(state) # 현재 상태에서 행동 선택
        reward = self.get_reward(state, action)
        next_state = self.state_after_action(state, action)
        self.env.traces.append(next_state)

        self.learn(state, action, reward, next_state)  # next_action 제거

        state = next_state

        move_count += 1

        if next_state == self.end_point:
          break

      print(f"Episode: {ep+1} complete, Moves: {move_count}")
      self.render()

## Main


- **Q-learning**를 이용해 그리드 월드 학습시키기  
- 학습 지표 시각화 (에피소드마다 에이전트의 이동 횟수 시각화)

In [68]:
env2 = GridWorldEnvironment(start_point=(0,0), #위 환경 클래스를 상속 받아 `env.render` 코드를 구현
                           end_point=(4,4),
                           grid_world_size=(5,5))

In [69]:
agent1 = MyQLearningEnv(env2)
agent1.run_qlearning()

Episode: 1 complete, Moves: 1546546
S X X X X 
X X X X X 
X X X X X 
X X X X X 
. . . X A 

Episode: 2 complete, Moves: 84805
S X X X X 
X X X X X 
X X X X X 
. . . X . 
. . . X A 

Episode: 3 complete, Moves: 21463
S X X X X 
X X X X X 
. X X X X 
. . . X . 
. . . X A 

Episode: 4 complete, Moves: 5834
S X X X X 
X X X X X 
. . X X . 
. . . X . 
. . . X A 

Episode: 5 complete, Moves: 398
S X X X X 
X X X X . 
. . X X . 
. . . X . 
. . . X A 

Episode: 6 complete, Moves: 294
S X X X . 
X X . X . 
. . . X . 
. . . X . 
. . . X A 

Episode: 7 complete, Moves: 59
S X X X . 
X . . X . 
. . . X . 
. . . X . 
. . . X A 

Episode: 8 complete, Moves: 71
S X X X X 
X . . X X 
. . . X . 
. . . X . 
. . . X A 

Episode: 9 complete, Moves: 12
S X X X . 
. . X X . 
. . . X . 
. . . X . 
. . . X A 

Episode: 10 complete, Moves: 8
S X X X . 
. . . X . 
. . . X . 
. . . X . 
. . . X A 

Episode: 11 complete, Moves: 9
S X X X . 
. . . X . 
. . . X . 
. . . X . 
. . . X A 

Episode: 12 complete, Moves: