# DDPG

Задаем структуру аппроксимаций $\pi^\eta(s)$, $Q^\theta(s,a)$ и начальные вектора параметров $\eta$, $\theta$.

Для каждого эпизода делаем:

   Пока эпизод не закончен делаем:

- Находясь в состоянии $S_t$ совершаем действие

    $$
    A_t = \pi^\eta(S_t) + Noise,
    $$

    получаем награду $R_t$  переходим в состояние $S_{t+1}$. Сохраняем 
    $(S_t,A_t,R_t,D_t,S_{t+1}) \Rightarrow Memory$


- Берем $\{(s_i,a_i,r_i,d_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$, определяем значения

    $$
    y_i = r_i + (1 - d_i) \gamma Q^\theta(s'_i,\pi^\eta(s'_i))
    $$
    функции потерь

    $$
    Loss_1(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2,\quad Loss_2(\eta) = -\frac{1}{n}\sum\limits_{i=1}^n Q^\theta(s_i,\pi^\eta(s_i))
    $$

    и обновляем вектор параметров

    $$
    \theta \leftarrow \theta - \alpha \nabla_\theta Loss_1(\theta),\quad \eta \leftarrow \eta - \beta \nabla_\eta Loss_2(\eta),\quad \alpha,\beta > 0
    $$

- Уменьшаем $Noise$


In [1]:
import numpy as np

import torch
import torch.nn as nn

import random
from collections import deque
from copy import deepcopy

In [2]:
# Ornstein-Uhlenbeck process

class OUNoise:
    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu
        
    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

In [32]:
class Net(nn.Module):
    def __init__(self, input_dim, layer1_dim, layer2_dim, output_dim, output_tanh):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(input_dim, layer1_dim)
        self.layer2 = nn.Linear(layer1_dim, layer2_dim)
        self.layer3 = nn.Linear(layer2_dim, output_dim)
        self.relu = nn.ReLU(True)
        self.tanh = nn.Tanh()
        self.output_tanh = output_tanh
    
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.layer3(x)
        
        if self.output_tanh:
            return self.tanh(x)
        else:
            return x
        
        
class DDPG():
    def __init__(self, state_dim, action_dim, action_scale, noise_decrease, 
                 gamma=0.99, batch_size=64, q_lr=1e-3, pi_lr=1e-4, tau=1e-2, memory_size=100000):
        
        self.state_dim = state_dim
        self.action_dim = action_dim
                
        self.action_scale = action_scale
        
        self.pi_model = Net(self.state_dim, 400, 300, self.action_dim, output_tanh=True)
        self.q_model = Net(self.state_dim + self.action_dim, 400, 300, 1, output_tanh=False)
        self.pi_target_model = deepcopy(self.pi_model)
        self.q_target_model = deepcopy(self.q_model)
        
        
        self.noise = OUNoise(self.action_dim)
        self.noise_treshold = 1
        self.noise_decrease = noise_decrease
        
        self.tau = tau
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.q_optimizer = torch.optim.Adam(self.q_model.parameters(), lr=q_lr)
        self.pi_optimizer = torch.optim.Adam(self.q_model.parameters(), lr=pi_lr)
        
        
    def get_action(self, state):
        pred_action = self.pi_model(torch.FloatTensor(state)).detach().numpy()
        action = self.action_scale * (pred_action + self.noise_treshold * self.noise.sample())
        return np.clip(action, -self.action_scale, self.action_scale)
    
    def update_target_model(self, target_model, model, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        for target_param, param in zip(target_model.parameters(), model.parameters()):
            target_param.data.copy_((1 - self.tau) * target_param.data + self.tau * param.data)
    
    def fit(self, state, action, reward, done, next_state):
        self.memory.append([state, action, reward, done, next_state])
        
        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = map(torch.FloatTensor, zip(*batch))
            rewards = rewards.reshape(self.batch_size, 1)
            dones = dones.reshape(self.batch_size, 1)
            
            pred_next_actions = self.action_scale * self.pi_target_model(next_states)
            next_states_and_pred_next_actions = torch.cat((next_states, pred_next_actions), dim=1)
            targets = reward + self.gamma * (1 - dones) * self.q_target_model(next_states_and_pred_next_actions)
            
            states_and_actions = torch.cat((states, actions), dim=1)
            temp = (self.q_model(states_and_actions) - targets.detach())
            q_loss = torch.mean((targets.detach() - self.q_model(states_and_actions)) ** 2)
            self.update_target_model(self.q_target_model, self.q_model, self.q_optimizer, q_loss)
            
            pred_actions = self.action_scale * self.pi_model(states)
            states_and_pred_actions = torch.cat((states, pred_actions), dim=1)
            pi_loss = - torch.mean(self.q_model(states_and_pred_actions))
            self.update_target_model(self.pi_target_model, self.pi_model, self.pi_optimizer, pi_loss)
            
            
        if self.noise_treshold > 0:
            self.noise_trashold = max(0, self.noise_treshold - self.noise_decrease)
            

In [33]:
import gym

env = gym.make("Pendulum-v1")

state_dim = 3
action_dim = 1
action_scale = 2


episode_n = 200
trajectory_len = 200

noise_decrease = 1 / (episode_n * trajectory_len)

agent = DDPG(state_dim, action_dim, action_scale, noise_decrease)


for episode in range(episode_n):
    
    total_reward = 0
    state = env.reset()
    for _ in range(trajectory_len):
        
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        
        agent.fit(state, action, reward, done, next_state)
        total_reward += reward
        if done:
            break
            
        state = next_state
    print(f'{episode = } \t {total_reward = }')

  deprecation(
  deprecation(


episode = 0 	 total_reward = -1243.5950399682877
episode = 1 	 total_reward = -997.3028878367038
episode = 2 	 total_reward = -1236.9298215834053
episode = 3 	 total_reward = -776.8356955464615
episode = 4 	 total_reward = -1109.8703457349945
episode = 5 	 total_reward = -1196.0624739765751
episode = 6 	 total_reward = -1184.3271069733041
episode = 7 	 total_reward = -1014.8327159695363
episode = 8 	 total_reward = -1195.8955020200124
episode = 9 	 total_reward = -1747.2333444486503
episode = 10 	 total_reward = -1080.580981253478
episode = 11 	 total_reward = -1219.1792376183548
episode = 12 	 total_reward = -1409.2831809868342
episode = 13 	 total_reward = -1169.6283520749369
episode = 14 	 total_reward = -1389.2537090027438
episode = 15 	 total_reward = -1451.9970700056485
episode = 16 	 total_reward = -1267.369915679609
episode = 17 	 total_reward = -1444.8755375650105
episode = 18 	 total_reward = -1135.8981503020616
episode = 19 	 total_reward = -1105.7439530699992
episode = 20 	