In [4]:
import sys
sys.path.append('../')
from deep_rl import *
from deep_rl.network.network_utils import *

class SF_FCBody(nn.Module):
    """Fully connected feature body: a stack of layers, each followed by ``gate``.

    Args:
        state_dim: Width of the input features.
        hidden_units: Output width of each layer, in order. Any iterable of
            ints is accepted (tuple, list, ...); an empty one produces a body
            with no layers.
        gate: Activation applied to the output of every layer.
        noisy_linear: If True the layers are ``NoisyLinear`` instances,
            otherwise ``layer_init(nn.Linear(...))``.

    Attributes:
        layers: ``nn.ModuleList`` holding the layers in application order.
        feature_dim: Output width, i.e. the last entry of ``hidden_units``
            (or ``state_dim`` when ``hidden_units`` is empty).
    """

    def __init__(self, state_dim, hidden_units=(400, 300), gate=F.relu, noisy_linear=False):
        super(SF_FCBody, self).__init__()
        # Coerce to a tuple so that a list (or any other iterable) of widths
        # concatenates cleanly instead of raising TypeError.
        dims = (state_dim,) + tuple(hidden_units)
        layer_shapes = zip(dims[:-1], dims[1:])
        if noisy_linear:
            layers = [NoisyLinear(dim_in, dim_out) for dim_in, dim_out in layer_shapes]
        else:
            layers = [layer_init(nn.Linear(dim_in, dim_out)) for dim_in, dim_out in layer_shapes]
        self.layers = nn.ModuleList(layers)

        self.gate = gate
        self.feature_dim = dims[-1]
        self.noisy_linear = noisy_linear

    def reset_noise(self):
        """Call ``reset_noise()`` on every layer; does nothing unless ``noisy_linear`` is set."""
        if self.noisy_linear:
            for layer in self.layers:
                layer.reset_noise()

    def forward(self, x):
        """Pass ``x`` through each layer followed by ``gate`` and return the result."""
        for layer in self.layers:
            x = self.gate(layer(x))
        return x

In [None]:
class critic(nn.Module):
    """Placeholder for a critic network.

    The original cell stopped at the class header (no colon, no body), which
    is a SyntaxError. TODO: implement ``__init__`` and ``forward``.
    """

In [5]:
# Feature body over 11-dimensional inputs, using the default hidden sizes.
model = SF_FCBody(state_dim=11)

In [8]:
# Shape check: push an all-zeros batch of 20 rows x 11 features through the body.
data = tensor(np.zeros((20, 11)))
model(data).shape

torch.Size([20, 300])

In [31]:
# Build the leaf tensor directly. Wrapping an existing tensor in torch.tensor(...)
# copy-constructs it and emits a UserWarning.
x = torch.arange(4.0, requires_grad=True)

  """Entry point for launching an IPython kernel.


In [32]:
# Two separate scalar losses built from the same leaf tensor: x.x and 2 * (x.x).
loss = x.dot(x)
loss1 = 2 * x.dot(x)

In [33]:
# d(x.x)/dx = 2x; the recorded output below is exactly 2x for x = [0, 1, 2, 3].
loss.backward()
x.grad

tensor([0., 2., 4., 6.])

In [34]:
# backward() adds into x.grad instead of overwriting it: loss1 contributes 4x on
# top of the 2x already stored, so the recorded output below is 6x.
loss1.backward()
x.grad

tensor([ 0.,  6., 12., 18.])

In [35]:
class DeterministicActorCriticNet(nn.Module):
    """Deterministic actor-critic network with a shared feature body.

    Observations go through ``phi_body``; the actor head maps the features to a
    tanh-squashed action, and the critic head scores the features concatenated
    with an action.

    Args:
        state_dim: Width of the raw observation (used for the default ``phi_body``).
        action_dim: Width of the action vector.
        actor_opt_fn: Callable taking a parameter list and returning the actor optimizer.
        critic_opt_fn: Callable taking a parameter list and returning the critic optimizer.
        phi_body: Feature body; defaults to ``DummyBody(state_dim)``.
        actor_body: Actor trunk applied to the features; defaults to a
            ``DummyBody`` sized to ``phi_body.feature_dim``.
        critic_body: Critic trunk applied to ``cat([phi, a], dim=1)``; defaults
            to a ``DummyBody`` sized to ``phi_body.feature_dim + action_dim``.

    Each body must expose a ``feature_dim`` attribute giving its output width.
    Both optimizers also receive the ``phi_body`` parameters.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 actor_opt_fn,
                 critic_opt_fn,
                 phi_body=None,
                 actor_body=None,
                 critic_body=None):
        super(DeterministicActorCriticNet, self).__init__()
        if phi_body is None:
            phi_body = DummyBody(state_dim)
        if actor_body is None:
            actor_body = DummyBody(phi_body.feature_dim)
        if critic_body is None:
            # critic() feeds this body the features concatenated with the
            # action, so its width must include action_dim; otherwise fc_critic
            # is built for the wrong input size.
            # NOTE(review): assumes DummyBody passes its input through and
            # reports its constructor argument as feature_dim - confirm.
            critic_body = DummyBody(phi_body.feature_dim + action_dim)
        self.phi_body = phi_body
        self.actor_body = actor_body
        self.critic_body = critic_body
        self.fc_action = layer_init(nn.Linear(actor_body.feature_dim, action_dim), 1e-3)
        self.fc_critic = layer_init(nn.Linear(critic_body.feature_dim, 1), 1e-3)

        self.actor_params = list(self.actor_body.parameters()) + list(self.fc_action.parameters())
        self.critic_params = list(self.critic_body.parameters()) + list(self.fc_critic.parameters())
        self.phi_params = list(self.phi_body.parameters())

        # The shared feature parameters are handed to both optimizers.
        self.actor_opt = actor_opt_fn(self.actor_params + self.phi_params)
        self.critic_opt = critic_opt_fn(self.critic_params + self.phi_params)
        self.to(Config.DEVICE)

    def forward(self, obs):
        """Return the actor's action for ``obs``."""
        phi = self.feature(obs)
        action = self.actor(phi)
        return action

    def feature(self, obs):
        """Convert ``obs`` with ``tensor`` and run it through ``phi_body``."""
        obs = tensor(obs)
        return self.phi_body(obs)

    def actor(self, phi):
        """Map features to an action; tanh keeps every component in [-1, 1]."""
        return torch.tanh(self.fc_action(self.actor_body(phi)))

    def critic(self, phi, a):
        """Score features ``phi`` and action ``a``, concatenated along dim 1."""
        return self.fc_critic(self.critic_body(torch.cat([phi, a], dim=1)))


In [36]:
# Network for 11-dimensional states and 3-dimensional actions. The critic body
# is 11 + 3 wide because it takes the features concatenated with the action.
DDPG = DeterministicActorCriticNet(
    state_dim=11,
    action_dim=3,
    actor_opt_fn=lambda params: torch.optim.Adam(params, lr=1e-3),
    critic_opt_fn=lambda params: torch.optim.Adam(params, lr=1e-3),
    actor_body=FCBody(11, (400, 300), gate=F.relu),
    critic_body=FCBody(11 + 3, (400, 300), gate=F.relu),
)


In [None]:
# One random transition (batch size 1). np.random is a module and cannot be
# called; np.random.rand(d0, d1) draws uniform [0, 1) samples of that shape.
states = tensor(np.random.rand(1, 11))
actions = tensor(np.random.rand(1, 3))
rewards = tensor(np.random.rand(1, 1))
next_states = tensor(np.random.rand(1, 11))
# Single False entry reshaped to (1, 1).
mask = tensor(np.array([False]).reshape(1, -1))


# TODO: the code below still needs to be adapted to this notebook.
# NOTE(review): `self` and `config` are not defined in any cell visible here;
# presumably this was pasted from an agent method - confirm before running.

# Target value: this part has no update effect (it has a gradient, but the
# target network is never updated directly).
phi_next = self.target_network.feature(next_states)
a_next = self.target_network.actor(phi_next)
q_next = self.target_network.critic(phi_next, a_next)


# Bootstrapped target: rewards + discount * mask * Q_target(next_state, next_action),
# detached so no gradient flows into the target computation.
q_next = config.discount * mask * q_next
q_next.add_(rewards)
q_next = q_next.detach()


# Critic update: half squared error between Q(states, actions) and the target.
phi = self.network.feature(states)
q = self.network.critic(phi, actions)
critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()

# Clear old gradients first, then step only the critic optimizer.
self.network.zero_grad()
critic_loss.backward()
self.network.critic_opt.step()

# Actor update: maximise the critic's score of the actor's own action.
# The critic's phi input is detached; `action` is still computed from the
# undetached phi.
phi = self.network.feature(states)
action = self.network.actor(phi)
policy_loss = -self.network.critic(phi.detach(), action).mean()

# Clear gradients again, then step only the actor optimizer.
self.network.zero_grad()
policy_loss.backward()
self.network.actor_opt.step()

In [None]:
# np.zeros takes the shape as one tuple; np.zeros(1, 11) would pass 11 as the
# dtype and raise TypeError.
state = tensor(np.zeros((1, 11)))
# Shape (1,), same as the original np.zeros(1, ).
# NOTE(review): other cells use 3-dimensional actions - confirm the intended shape.
action = tensor(np.zeros((1,)))
reward = 10

In [37]:
# Fresh 2x2 leaf tensor; the gradient of sum(x^2) is 2x.
x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
out = (x ** 2).sum()
out.backward()
x.grad

tensor([[ 2., -2.],
        [ 2.,  2.]])

In [39]:
# x is not re-created here (the line below stays commented out), so x.grad keeps
# whatever earlier backward() calls stored and this cell adds to it.
# x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
out = x.pow(2).sum()

# d(2 * sum(x^2))/dx = 4x, so every run of this cell adds 4x to x.grad.
# NOTE(review): the recorded output is 10x = 2x + 4x + 4x, which suggests this
# cell was executed twice (the execution count jumps from 37 to 39); a fresh
# top-to-bottom run would presumably show 6x - confirm.
out2 = out * 2
out2.backward()
x.grad

tensor([[ 10., -10.],
        [ 10.,  10.]])