In [88]:
import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython

In [105]:
class GridWorld :
    """4x4 grid environment that starts at (0, 0) and has its goal at (3, 3).

    Attributes:
        a2d: maps an action id (0-3) to its (row, column) displacement.
        states_space: ``MultiDiscrete([4, 4])`` space used to decide whether
            a position is still on the board.
        state: current position as a length-2 array.
        reward: reward of the latest ``step`` (``None`` until the first one).
        terminated: whether the latest ``step`` ended the episode.
    """

    def __init__(self) :
        # Displacements expressed as (row, column) deltas.
        right, left = np.array([0,1]), np.array([0,-1])
        down, up = np.array([1,0]), np.array([-1,0])
        self.a2d = {0: right, 1: left, 2: down, 3: up}

        self.states_space = gym.spaces.MultiDiscrete([4, 4])
        self.state = np.array([0, 0])
        self.reward = None
        self.terminated = False

    def step(self, action) :
        """Apply ``action`` and return ``(state, reward, terminated)``.

        Rewards: 100 for reaching (3, 3), -10 for leaving the board (both end
        the episode), and -1 for any other move.
        """
        # The addition allocates a new array, so arrays returned by earlier
        # calls are never modified (the original note suspected a copy issue here).
        self.state = self.state + self.a2d[action]

        if np.array_equal(self.state, [3, 3]) :
            self.reward, self.terminated = 100, True
        elif self.state not in self.states_space :
            self.reward, self.terminated = -10, True
        else :
            self.reward = -1

        return self.state, self.reward, self.terminated

    def reset(self) :
        """Put the agent back on (0, 0), clear the terminated flag and return the state."""
        self.terminated = False
        self.state = np.array([0, 0])

        return self.state


class RandomAgent :
    """Agent that samples actions from its action space and logs every transition.

    The latest transition lives in ``state``, ``action``, ``reward``,
    ``next_state`` and ``terminated``; ``save_experience`` copies those five
    values into bounded history buffers.
    """

    def __init__(self) :
        # Most recent transition.
        self.state = np.array([0, 0])
        self.action = None
        self.reward = None
        self.next_state = None
        self.terminated = None

        # History buffers; each keeps at most 500000 entries.
        (
            self.states,
            self.actions,
            self.rewards,
            self.next_states,
            self.terminations,
        ) = (collections.deque(maxlen = 500000) for _ in range(5))

        self.action_space = gym.spaces.Discrete(4)
        self.n_experience = 0

    def act(self) :
        """
        Before learning: draw the next action from the action space.
        """
        self.action = self.action_space.sample()

    def save_experience(self) :
        """Append the current transition to the buffers and bump the counter."""
        pairs = (
            (self.states, self.state),
            (self.actions, self.action),
            (self.rewards, self.reward),
            (self.next_states, self.next_state),
            (self.terminations, self.terminated),
        )
        for buffer, value in pairs :
            buffer.append(value)
        self.n_experience += 1

    def learn(self) :
        """Placeholder: the random agent does not learn."""
        pass

In [106]:
# Collect experience by letting a random agent play many episodes.
player = RandomAgent()
env = GridWorld()

scores = []  # total reward of every finished episode
score = 0    # reward accumulated in the episode currently being played


for e in range(1, 100000) :
    #---episode start---#
    while True :
        ## step 1 : Choose an action
        player.act()
        ## step 2 : Environment
        player.next_state, player.reward, player.terminated = env.step(player.action)
        ## step 3 : Save & Learn
        player.save_experience()
        player.learn()
        ## step 4 : Exit
        score += player.reward

        if player.terminated :
            # Record the episode return BEFORE clearing it; resetting first
            # would store 0 for every episode.
            scores.append(score)
            score = 0
            player.state = env.reset()
            break
        else :
            player.state = player.next_state

In [107]:
# Number of transitions logged so far (incremented once per save_experience() call).
player.n_experience

327434

In [108]:
# Accumulators indexed by (row, column, action): summed rewards and visit counts.
q_table = np.zeros((4, 4, 4))
count = np.zeros_like(q_table)

In [109]:
# Sum the immediate rewards and count the visits of every (row, column, action).
for state, a, r in zip(player.states, player.actions, player.rewards) :
    s1, s2 = state
    key = (s1, s2, a)
    q_table[key] += r
    count[key] += 1

# Never-visited entries get a tiny count so the later division is well defined.
np.putmask(count, count == 0, 1e-6)

In [110]:
# Turn reward sums into per-(state, action) means.
# NOTE: this rebinds q_table, so running the cell twice divides twice.
q_table = np.true_divide(q_table, count)

In [111]:
# Table values of action 3 (up) for every grid cell.
q_table[..., 3]

array([[-10., -10., -10., -10.],
       [ -1.,  -1.,  -1.,  -1.],
       [ -1.,  -1.,  -1.,  -1.],
       [ -1.,  -1.,  -1.,   0.]])

In [112]:
# Re-estimate the immediate reward of every (state, action) with an
# incremental update instead of an exact mean.
q_table = np.zeros((4, 4, 4))

for state, a, r in zip(player.states, player.actions, player.rewards) :
    s1, s2 = state
    key = (s1, s2, a)
    # Move the stored estimate 1% of the way toward the observed reward.
    q_table[key] += 0.01*(r - q_table[key])

In [113]:
# Table values of action 0 (right), rounded to two decimals.
np.round(q_table[..., 0], 2)

array([[ -1.  ,  -1.  ,  -1.  , -10.  ],
       [ -1.  ,  -1.  ,  -1.  , -10.  ],
       [ -1.  ,  -1.  ,  -1.  , -10.  ],
       [ -1.  ,  -1.  ,  99.99,   0.  ]])

In [114]:
def act(s1, s2) :
    """Return the greedy action for grid cell ``(s1, s2)``.

    Looks the cell up in the module-level ``q_table`` and returns the index
    of its largest value (``argmax`` picks the first index on ties).
    """
    return np.argmax(q_table[s1, s2])

In [115]:
# Play one episode (at most 49 steps) following the greedy policy from q_table.
score = 0

for t in range(1, 50) :
    ## step1 : Action
    s1, s2 = player.state
    action = act(s1, s2)
    # save_experience() logs player.action, so store the greedy choice on the
    # agent; otherwise the buffer would keep the stale action from an earlier step.
    player.action = action
    ## step2 : Environment
    player.next_state, player.reward, player.terminated = env.step(action)
    ## step3 : Save & Learn
    player.save_experience()
    player.learn()
    ## step4 : Terminate
    score += player.reward
    
    if player.terminated :
        player.state = env.reset()
        break
    else :
        player.state = player.next_state

In [116]:
# Positions visited by the last rollout: its t recorded states followed by the
# position actually reached by the final step (not an assumed goal cell).
# np.concatenate is used because the np.concat alias only exists in NumPy >= 2.0.
states = np.concatenate([np.array(player.states)[-t:], np.array(player.next_states)[-1:]], axis = 0)

In [117]:
# Positions reached by each of the last t recorded steps.
np.asarray(player.next_states)[-t:]

array([[0, 1],
       [0, 2],
       [0, 3],
       [1, 3],
       [2, 3],
       [3, 3]])

In [118]:
# Fresh agent and environment for another round of random data collection.
player = RandomAgent()
env = GridWorld()

score = 0    # reward accumulated in the episode in progress
scores = []  # running episode reward, appended after every single step


for e in range(1, 100000) :
    #---episode start---#
    while True :
        ## step 1 : choose an action
        player.act()
        ## step 2 : environment transition
        player.next_state, player.reward, player.terminated = env.step(player.action)
        ## step 3 : save & learn
        player.save_experience()
        player.learn()
        ## step 4 : bookkeeping
        score += player.reward
        scores.append(score)

        if not player.terminated :
            # Episode continues from the position just reached.
            player.state = player.next_state
            continue

        # Episode over: clear the running reward and restart from the reset state.
        score = 0
        player.state = env.reset()
        break

In [119]:
# Tabular Q-learning pass over the recorded transitions.
q_table = np.zeros((4, 4, 4))
memory = zip(player.states, player.actions, player.rewards, player.next_states, player.terminations)

for (s1, s2), a, r, (ss1, ss2), tmd in memory :
    # Terminal transition: the episode is over, so only the reward received now counts.
    # Otherwise the agent can keep moving, so add the discounted best value of the next state.
    q = r if tmd else r + 0.95*q_table[ss1, ss2, :].max()

    # Move the stored estimate 1% of the way toward the target.
    q_table[s1, s2, a] += 0.01*(q - q_table[s1, s2, a])

In [122]:
# Learned values of action 0 (right) for every grid cell.
q_table[:, :, 0]

array([[72.84046113, 77.72535187, 82.81528114, -9.99998994],
       [77.72722748, 82.87191663, 88.27943489, -9.9999946 ],
       [82.86988359, 88.28812443, 93.99093109, -9.99927194],
       [88.24662348, 93.98300562, 99.99348144,  0.        ]])

In [127]:
player.q_table = q_table ## apparently an attribute can be added even if it was not declared in __init__

def act(player, s1, s2) :
    """Return the greedy action of ``player`` for grid cell ``(s1, s2)``.

    Reads ``player.q_table`` and returns the index of the largest value
    stored for that cell (``argmax`` picks the first index on ties).
    """
    cell_values = player.q_table[s1, s2, :]
    return np.argmax(cell_values)

In [131]:
# Play one episode (at most 49 steps) following the greedy policy stored on the agent.
score = 0

for t in range(1, 50) :
    ## step1 : Action
    s1, s2 = player.state
    action = act(player, s1, s2)
    # save_experience() logs player.action, so store the greedy choice on the
    # agent; otherwise the buffer would keep the stale action from an earlier step.
    player.action = action
    ## step2 : Environment
    player.next_state, player.reward, player.terminated = env.step(action)
    ## step3 : Save & Learn
    player.save_experience()
    player.learn()
    ## step4 : Terminate
    score += player.reward
    
    if player.terminated :
        player.state = env.reset()
        break
    else :
        player.state = player.next_state

In [132]:
def _plot_offsets(states):
    """Convert (row, column) states into (x, y) scatter offsets for the 4x4 board."""
    # Work on a float copy: an integer array would truncate the half-cell
    # shift below, and the caller's array must not be modified.
    pos = np.array(states, dtype=float).reshape(-1, 2)
    # Coordinates outside 0..3 are pulled half a cell back toward the board,
    # so an off-grid move is drawn straddling the border.
    pos = np.where(pos < 0, pos + 0.5, np.where(pos > 3, pos - 0.5, pos))
    # Scatter offsets are (x, y) = (column, row).
    return pos[:, ::-1]


def show(states):
    """Render an animation of a marker moving through ``states`` on the 4x4 grid.

    Args:
        states: sequence of (row, column) positions, one per animation frame.
            Positions outside the board are drawn half a cell beyond the edge.
            The input is not modified.

    The animation is displayed inline as an HTML/JS player.
    """
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr',alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)  
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    # Precompute every frame once so that redrawing a frame is idempotent.
    offsets = _plot_offsets(states)
    def update(t):
        sc.set_offsets(offsets[t])
    ani = FuncAnimation(fig,update,frames=len(offsets))
    # Explicit IPython.display.display instead of the notebook-only builtin.
    IPython.display.display(IPython.display.HTML(ani.to_jshtml()))

In [134]:
# Trajectory of the last rollout: its recorded start state followed by every
# position reached afterwards. The start is read from the buffer instead of
# assuming (0, 0), and np.concatenate is used because the np.concat alias
# only exists in NumPy >= 2.0.
rollout_states = np.array(player.states)[-t:]
states = np.concatenate([rollout_states[:1], np.array(player.next_states)[-t:]], axis = 0)
show(states)