In [1]:
import sys; sys.path.append('..') # add project root to the python path

In [2]:
import gym
import torch

from src.part3.MLP import MultiLayerPerceptron as MLP
from src.part4.ActorCritic import TDActorCritic
from src.common.train_utils import EMAMeter, to_tensor

In [3]:
env = gym.make('CartPole-v1')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n



# TD Actor-critic

이번 실습에는 Vanilla version 의 TD Actor-critic을 만들어볼까요? 
> TD actor-critic은 Advantage function $A(s,a)$을 $V_{\psi}(s)$ 활용해서 추산하고 그 값을 리턴대신 활용해서 Policy gradient 를 계산하는 기법인거 잊지 않으셨죠?
> $$A(s,a) \approx \delta_\psi(s,a) = r+\gamma V_\psi(s')-V(s)$$

`TD Actor-critic`의 의사 코드는 다음과 같습니다.

<img src="./images/TDAC.png" width="60%" height="40%" title="TDAC" alt="TDAC"></img>

파이썬으로 구현한 `TD Actor-critic` 은 어떻게 생겼을까요?

```python
class TDActorCritic(nn.Module):

    def __init__(self,
                 policy_net,
                 value_net,
                 gamma: float = 1.0,
                 lr: float = 0.0002):
        super(TDActorCritic, self).__init__()
        self.policy_net = policy_net
        self.value_net = value_net
        self.gamma = gamma
        self.lr = lr

        # use shared optimizer
        total_param = list(policy_net.parameters()) + list(value_net.parameters())
        self.optimizer = torch.optim.Adam(params=total_param, lr=lr)

        self._eps = 1e-25
        self._mse = torch.nn.MSELoss()
        
    def get_action(self, state):
        with torch.no_grad():
            logits = self.policy(state)
            dist = Categorical(logits=logits)
            a = dist.sample()  # sample action from softmax policy
        return a
```

가장 중요한 업데이트는 어떻게 생겼을까요?

```python
    def update(self, state, action, reward, next_state, done):
        # compute targets
        with torch.no_grad():
            td_target = reward + self.gamma * self.value_net(next_state) * (1-done)
            td_error = td_target - self.value_net(state)

        # compute log probabilities
        dist = Categorical(logits=self.policy_net(state))
        prob = dist.probs.gather(1, action)

        # compute the values of current states
        v = self.value_net(state)

        loss = -torch.log(prob + self._eps) * td_error + self._mse(v, td_target)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
```

In [9]:
policy_net = MLP(s_dim, a_dim, [128])
value_net = MLP(s_dim, 1, [128])

agent = TDActorCritic(policy_net, value_net)
ema = EMAMeter()

In [11]:
n_eps = 10000
print_every = 500

for ep in range(n_eps):
    s = env.reset()
    cum_r = 0

    while True:
        s = to_tensor(s, size=(1, 4))
        a = agent.get_action(s)
        ns, r, done, info = env.step(a.item())
        
        ns = to_tensor(ns, size=(1,4))
        agent.update(s, a, r, ns, done)
        
        s = ns.numpy()
        cum_r += r
        if done:
            break

    ema.update(cum_r)
    if ep % print_every == 0:
        print("Episode {} || EMA: {} ".format(ep, ema.s))

    

RuntimeError: invalid argument 4: Index tensor must have same dimensions as input tensor at C:\w\1\s\tmp_conda_3.7_055457\conda\conda-bld\pytorch_1565416617654\work\aten\src\TH/generic/THTensorEvenMoreMath.cpp:453