# DQN

Задаем структуру аппроксимации $Q^\theta$, начальный вектор параметров $\theta$ и вероятность исследования среды $\varepsilon = 1$.

Для каждого эпизода $k$ делаем:

Пока эпизод не закончен, делаем:

- Находясь в состоянии $S_t$, совершаем действие $A_t \sim \pi(\cdot|S_t)$, где $\pi = \varepsilon\text{-greedy}(Q^\theta)$, получаем награду $R_t$ и переходим в состояние $S_{t+1}$. Сохраняем $(S_t,A_t,R_t,S_{t+1}) \rightarrow Memory$.


- Берем $\{(s_i,a_i,r_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$, определяем целевые значения

$$
y_i =
\left\{
\begin{array}{ll}
r_i, &\text{ если } s'_i\text{ -терминальное},\\[0.0cm]
 r_i + \gamma \max\limits_{a'} Q^\theta(s'_i,a'), &\text{ иначе}
\end{array}
\right.
$$

функцию потерь $Loss(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2$
и обновляем вектор параметров

$$
\theta \leftarrow \theta - \alpha \nabla_\theta Loss(\theta)
$$

- Уменьшаем $\varepsilon$


In [8]:
import numpy as np
import random
import torch
import torch.nn as nn

class Qfunction(nn.Module):
    """Fully connected network that maps a state to one value per action.

    The network is three linear layers (``state_dim -> 64 -> 64 ->
    action_dim``) with a ReLU after each of the first two layers.
    """

    def __init__(self, state_dim, action_dim):
        """Create the layers.

        Args:
            state_dim: Number of input features of a state.
            action_dim: Number of outputs, one per action.
        """
        super().__init__()
        hidden_dim = 64
        # Layers are created in input-to-output order so that seeded
        # initialisation stays reproducible.
        self.linear_1 = nn.Linear(state_dim, hidden_dim)
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, action_dim)
        self.activation = nn.ReLU()

    def forward(self, states):
        """Return the per-action outputs for ``states``.

        Args:
            states: Tensor whose last dimension has size ``state_dim``.

        Returns:
            Tensor with the same leading dimensions as ``states`` and a last
            dimension of size ``action_dim``.
        """
        features = self.activation(self.linear_1(states))
        features = self.activation(self.linear_2(features))
        return self.linear_3(features)

In [53]:
class DQN:
    """Deep Q-learning agent with an epsilon-greedy policy and replay memory.

    Every call to :meth:`fit` stores one transition and, once the memory
    holds more than ``batch_size`` transitions, performs a single gradient
    step on a uniformly sampled batch and then lowers ``epsilon``.

    The misspelled names ``epilon_min`` and ``optimzaer`` are kept as they
    are so that existing callers keep working.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01):
        """Build the Q-network, its optimizer and an empty replay memory.

        Args:
            state_dim: Number of features in a state.
            action_dim: Number of discrete actions.
            gamma: Discount factor used in the bootstrapped target.
            lr: Learning rate of the Adam optimizer.
            batch_size: Number of transitions sampled per gradient step.
            epsilon_decrease: Amount subtracted from ``epsilon`` after each
                gradient step.
            epilon_min: Lower bound for ``epsilon``.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = Qfunction(self.state_dim, self.action_dim)
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        self.memory = []
        self.optimzaer = torch.optim.Adam(self.q_function.parameters(), lr=lr)

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy.

        Args:
            state: Array-like state with ``state_dim`` features.

        Returns:
            Index of the sampled action.
        """
        # Action selection never needs gradients.
        with torch.no_grad():
            q_values = self.q_function(torch.as_tensor(state, dtype=torch.float32))
        greedy_action = int(torch.argmax(q_values))

        # Spread epsilon uniformly over all actions, then give the remaining
        # probability mass to the greedy one.
        probs = self.epsilon * np.ones(self.action_dim) / self.action_dim
        probs[greedy_action] += 1 - self.epsilon
        return np.random.choice(self.action_dim, p=probs)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, if enough data is available, train.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Whether ``next_state`` is terminal.
            next_state: State reached after the transition.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) <= self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Stack through NumPy and cast explicitly: observations may arrive as
        # float64, while the network parameters are float32.
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.as_tensor(np.array(actions), dtype=torch.int64)
        rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
        dones = torch.as_tensor(np.array(dones), dtype=torch.float32)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)

        # Targets are treated as constants, so no graph is built for them.
        with torch.no_grad():
            next_q_max = self.q_function(next_states).max(dim=1).values
            targets = rewards + self.gamma * (1 - dones) * next_q_max

        # Pick the value of the action that was actually taken in each row.
        q_values = self.q_function(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        loss = torch.mean((q_values - targets) ** 2)
        self.optimzaer.zero_grad()
        loss.backward()
        self.optimzaer.step()

        # Clamp at the floor: plain subtraction can undershoot epilon_min and,
        # through float drift, even go negative, which would produce invalid
        # probabilities in get_action.
        if self.epsilon > self.epilon_min:
            self.epsilon = max(self.epilon_min, self.epsilon - self.epsilon_decrease)

In [54]:
import gym

# Build the CartPole environment and read the sizes the agent needs.
env = gym.make('CartPole-v1')
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]

agent = DQN(state_dim, action_dim)

# Number of training episodes and the step cap per episode.
episode_n, t_max = 100, 500

for episode in range(episode_n):
    # NOTE(review): this unpacking assumes reset() returns just the
    # observation and step() returns a 4-tuple -- confirm against the
    # installed gym version.
    state = env.reset()
    total_reward = 0

    for t in range(t_max):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Store the transition and let the agent take one training step.
        agent.fit(state, action, reward, done, next_state)

        total_reward += reward
        state = next_state

        if done:
            break

    print(f'episode: {episode}, total_reward: {total_reward}')

episode: 0, total_reward: 23.0
episode: 1, total_reward: 22.0
episode: 2, total_reward: 25.0
episode: 3, total_reward: 35.0
episode: 4, total_reward: 10.0
episode: 5, total_reward: 16.0
episode: 6, total_reward: 11.0
episode: 7, total_reward: 10.0
episode: 8, total_reward: 10.0
episode: 9, total_reward: 10.0
episode: 10, total_reward: 9.0
episode: 11, total_reward: 10.0
episode: 12, total_reward: 9.0
episode: 13, total_reward: 8.0
episode: 14, total_reward: 8.0
episode: 15, total_reward: 9.0
episode: 16, total_reward: 9.0
episode: 17, total_reward: 10.0
episode: 18, total_reward: 9.0
episode: 19, total_reward: 9.0
episode: 20, total_reward: 10.0
episode: 21, total_reward: 9.0
episode: 22, total_reward: 8.0
episode: 23, total_reward: 9.0
episode: 24, total_reward: 24.0
episode: 25, total_reward: 26.0
episode: 26, total_reward: 33.0
episode: 27, total_reward: 92.0
episode: 28, total_reward: 141.0
episode: 29, total_reward: 75.0
episode: 30, total_reward: 51.0
episode: 31, total_reward: 3