## DQN

В данном пункте мы будем использовать библиотеку pytorch для обучения нейронной сети, хотя можно использовать и любую другую библиотеку.

In [1]:
# Detect whether the notebook runs inside Google Colab: the import only succeeds there.
try:
    import google.colab
    COLAB = True
except ModuleNotFoundError:
    COLAB = False
    pass

# On Colab, install the packages listed below via notebook shell magics.
if COLAB:
    !pip -q install "gymnasium[classic-control, atari, accept-rom-license]"
    !pip -q install piglet
    !pip -q install imageio_ffmpeg
    !pip -q install moviepy==1.0.3

In [2]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'gymnasium'

<img src="https://www.researchgate.net/publication/362568623/figure/fig5/AS:1187029731807278@1660021350587/Screen-capture-of-the-OpenAI-Gym-CartPole-problem-with-annotations-showing-the-cart.png" />

In [None]:
# Create the CartPole environment and display the first element returned by reset()
# (the initial observation).
env = gym.make("CartPole-v1")
env.reset()[0]

Т.к. описание состояния в задаче с маятником представляет собой не "сырые" признаки, а уже предобработанные (координаты, углы), нам не нужна для начала сложная архитектура, начнем с такой:
<img src="https://raw.githubusercontent.com/Tviskaron/mipt/master/2020/RL/figures/DQN.svg">
Для начала попробуйте использовать только полносвязные слои (``torch.nn.Linear``) и простые активационные функции (``torch.nn.ReLU``).

Будем приближать Q-функцию агента, минимизируя среднеквадратичную TD-ошибку:
$$
\delta = Q_{\theta}(s, a) - [r(s, a) + \gamma \cdot \max_{a'} Q_{-}(s', a')] \\
L = \frac{1}{N} \sum_i \delta_i^2,
$$
где
* $s, a, r, s'$ — состояние, действие, вознаграждение и следующее состояние;
* $\gamma$ — дисконтирующий множитель.

Основная тонкость состоит в использовании $Q_{-}(s',a')$. Это та же самая функция, что и $Q_{\theta}$, которая является выходом нейронной сети, но при обучении сети мы не пропускаем через эти слои градиенты. В статьях можно обнаружить следующее обозначение для остановки градиента: $SG(\cdot)$.

In [None]:
import torch
import torch.nn as nn
from collections import deque

In [None]:
# Re-create the environment to read the sizes of its action and observation spaces.
env = gym.make("CartPole-v1")

env.reset()

# These module-level names are reused by cells further down (e.g. `n_actions`).
n_actions = env.action_space.n
state_dim = env.observation_space.shape

print(f'Action_space: {n_actions} \nState_space: {env.observation_space.shape}')

env.close()

Задавайте небольшой размер скрытых слоев, например не больше 200.
Определяем граф вычислений:

In [None]:
def create_network(input_dim, hidden_dims, output_dim):
    """Build a fully connected network with ReLU between consecutive layers.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dims: Sequence of hidden-layer widths, in order. Any number of
            hidden layers is supported; an empty sequence yields a single
            linear layer from ``input_dim`` to ``output_dim``.
        output_dim: Size of the output vector.

    Returns:
        An ``nn.Sequential`` of ``nn.Linear`` layers separated by ``nn.ReLU``.
        The last linear layer has no activation after it.
    """
    layer_sizes = [input_dim, *hidden_dims, output_dim]
    layers = []
    for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
    # The output layer must stay linear, so drop the trailing activation.
    layers.pop()
    return nn.Sequential(*layers)

In [None]:
def select_action_eps_greedy(network, state, epsilon):
    """Choose an action epsilon-greedily.

    Args:
        network: Callable mapping a state tensor to a tensor of Q-values.
        state: Current state; converted to a float32 tensor unless it already is a tensor.
        epsilon: Exploration threshold compared against a uniform random draw.

    Returns:
        The chosen action as a Python ``int``.
    """
    state_tensor = state if isinstance(state, torch.Tensor) else torch.tensor(state, dtype=torch.float32)
    q_values = network(state_tensor).detach().numpy()

    # Explore unless the uniform draw exceeds epsilon.
    explore = not (epsilon < np.random.random())
    if explore:
        chosen = np.random.choice(q_values.shape[-1])
    else:
        chosen = np.argmax(q_values)

    return int(chosen)

In [None]:
def compute_td_loss(
        network, states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False, regularizer=.1
):
    """Compute the one-step TD loss for a batch of transitions using torch operations only.

    Args:
        network: Callable mapping a batch of states to Q-values for every action.
        states: Batch of states, shape [batch_size, state_size].
        actions: Batch of taken action indices, shape [batch_size].
        rewards: Batch of rewards, shape [batch_size].
        next_states: Batch of successor states, shape [batch_size, state_size].
        is_done: Batch of boolean flags; where True the target is the reward alone.
        gamma: Discount factor.
        check_shapes: When True, assert the ranks of the intermediate tensors.
        regularizer: Weight of the mean-Q term added to the loss.

    Returns:
        ``(loss, losses)``: the scalar objective (mean squared TD error plus
        ``regularizer`` times the mean of Q(s, a)) and the per-sample squared TD errors.
    """
    # Convert the inputs to tensors.
    state_batch = torch.tensor(np.array(states), dtype=torch.float32)
    action_batch = torch.tensor(actions, dtype=torch.long)
    reward_batch = torch.tensor(rewards, dtype=torch.float32)
    next_state_batch = torch.tensor(np.array(next_states), dtype=torch.float32)
    done_mask = torch.tensor(is_done, dtype=torch.bool)

    # Q-values of the actions that were actually taken.
    q_all = network(state_batch)
    q_taken = q_all[range(state_batch.shape[0]), action_batch]

    # V*(s') = max_a' Q(s', a'); detached so no gradient flows through the target.
    predicted_next_qvalues = network(next_state_batch)
    next_state_values = predicted_next_qvalues.detach().max(dim=-1).values

    assert next_state_values.dtype == torch.float32

    # For flagged transitions s' does not exist, so the target reduces to the reward.
    bootstrapped = reward_batch + gamma * next_state_values
    target_qvalues_for_actions = torch.where(done_mask, reward_batch, bootstrapped)

    losses = (q_taken - target_qvalues_for_actions.detach()) ** 2

    # Mean squared error plus a regularisation term on the Q-values.
    loss = losses.mean() + regularizer * q_taken.mean()

    if check_shapes:
        assert predicted_next_qvalues.dim() == 2, \
            "убедитесь, что вы предсказали q-значения для всех действий в следующем состоянии"
        assert next_state_values.dim() == 1, \
            "убедитесь, что вы вычислили V (s ') как максимум только по оси действий, а не по всем осям"
        assert target_qvalues_for_actions.dim() == 1, \
            "что-то не так с целевыми q-значениями, они должны быть вектором"

    return loss, losses

## Simple DQN

Немного модифицированная версия кода, запускающего обучение Q-learning из прошлой тетрадки

In [None]:
def generate_session(env, network, opt, t_max=300, epsilon=0, train=False):
    """Play one episode and, when ``train`` is set, do one TD update after every step.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        network: Q-network passed to action selection and to the loss.
        opt: Optimizer for ``network``; only used when ``train`` is True.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to update the network while playing.

    Returns:
        The sum of rewards collected during the episode.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy(network, s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)

        if train:
            opt.zero_grad()
            # Only a real termination removes the bootstrap term; a truncated
            # episode still has a valid next state to bootstrap from.
            loss, _ = compute_td_loss(network, [s], [a], [r], [next_s], [terminated])
            loss.backward()
            opt.step()

        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if terminated or truncated:
            break

    return total_reward

In [None]:
def test_dqn():
    """Train a DQN on the module-level ``env`` with per-step updates and print evaluation progress.

    Every ``eval_every`` episodes one evaluation episode is played; training
    stops once the mean of the last five evaluation rewards reaches 200.
    """
    learning_rate = .0001
    eps = .5
    eps_decay = .998
    n_train_episodes = 10000
    eval_every = 50
    recent_eval_rewards = deque(maxlen=5)

    env.reset()
    network = create_network(env.observation_space.shape[0], [128, 128], env.action_space.n)
    opt = torch.optim.Adam(network.parameters(), lr=learning_rate)

    for ep in range(n_train_episodes):
        generate_session(env, network, opt, epsilon=eps, train=True)

        is_eval_episode = (ep + 1) % eval_every == 0
        if is_eval_episode:
            ep_rew = generate_session(env, network, opt, epsilon=eps, train=False)
            recent_eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(recent_eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))

            if recent_eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps = eps * eps_decay

In [None]:
# Run the per-step DQN training experiment.
test_dqn()

## DQN with Experience Replay

Теперь попробуем добавить поддержку памяти прецедентов (Replay Buffer), которая будет из себя представлять очередь из наборов: $\{(s, a, r, s', done)\}$.

Тогда во время обучения каждый новый переход будет добавляться в память, а обучение будет целиком производиться на переходах, просэмплированных из памяти прецедентов.

In [None]:
def sample_batch(replay_buffer, n_samples):
    """Draw ``n_samples`` transitions uniformly at random (with replacement) from the buffer.

    Args:
        replay_buffer: Indexable collection of ``(s, a, r, next_s, done)`` tuples.
        n_samples: Number of transitions to draw.

    Returns:
        Five numpy arrays: states, actions, rewards, next states and done flags.
    """
    picked = [replay_buffer[idx] for idx in np.random.choice(len(replay_buffer), n_samples)]

    # Split the list of transitions into one array per field.
    state_arr = np.array([transition[0] for transition in picked])
    action_arr = np.array([transition[1] for transition in picked])
    reward_arr = np.array([transition[2] for transition in picked])
    next_state_arr = np.array([transition[3] for transition in picked])
    done_arr = np.array([transition[4] for transition in picked])
    return state_arr, action_arr, reward_arr, next_state_arr, done_arr

In [None]:
def generate_session_rb(
        env, network, opt, replay_buffer, glob_step,
        train_schedule, batch_size,
        t_max=300, epsilon=0, train=False
):
    """Play one episode, storing transitions in the replay buffer and training on sampled batches.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        network: Q-network passed to action selection and to the loss.
        opt: Optimizer for ``network``; only used when ``train`` is True.
        replay_buffer: Collection of ``(s, a, r, next_s, done)`` tuples, appended to in place.
        glob_step: Global step counter at the start of the episode.
        train_schedule: A gradient step is made whenever ``glob_step`` is a multiple of it.
        batch_size: Number of transitions sampled per gradient step.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to fill the buffer and update the network.

    Returns:
        ``(total_reward, glob_step)`` with the counter advanced by the steps played.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy(network, s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)

        if train:
            # Only a real termination removes the bootstrap term; a truncated
            # episode still has a valid next state to bootstrap from.
            replay_buffer.append((s, a, r, next_s, terminated))

            if replay_buffer and glob_step % train_schedule == 0:
                train_batch = sample_batch(replay_buffer, batch_size)
                states, actions, rewards, next_states, is_done = train_batch

                opt.zero_grad()
                loss, _ = compute_td_loss(network, states, actions, rewards, next_states, is_done)
                loss.backward()
                opt.step()

        glob_step += 1
        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if terminated or truncated:
            break

    return total_reward, glob_step

После проверки скорости обучения можете поэкспериментировать с различными `train_schedule`, `batch_size`, а также с размером буфера `replay_buffer`

In [None]:
def test_dqn_replay_buffer():
    """Train a DQN with a uniform replay buffer on the module-level ``env``.

    Returns:
        List with the running mean evaluation reward recorded at each evaluation.
    """
    learning_rate = .0001
    eps = .5
    eps_decay = .998
    n_train_episodes = 10000
    eval_every = 50
    train_schedule = 4
    batch_size = 32
    replay_buffer = deque(maxlen=4000)
    recent_eval_rewards = deque(maxlen=5)
    glob_step = 0

    env.reset()
    network = create_network(env.observation_space.shape[0], [128, 128], env.action_space.n)
    opt = torch.optim.Adam(network.parameters(), lr=learning_rate)

    reward_log = []
    for ep in range(n_train_episodes):
        _, glob_step = generate_session_rb(
            env, network, opt, replay_buffer, glob_step, train_schedule, batch_size, epsilon=eps, train=True
        )

        is_eval_episode = (ep + 1) % eval_every == 0
        if is_eval_episode:
            # The evaluation episode gets its own throw-away step counter.
            ep_rew, _ = generate_session_rb(
                env, network, opt, replay_buffer, 0, train_schedule, batch_size, epsilon=eps, train=False
            )
            recent_eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(recent_eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))
            reward_log.append(running_avg_rew)

            if recent_eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps = eps * eps_decay
    return reward_log

In [None]:
# Run the replay-buffer experiment and plot the logged running mean evaluation reward.
reward_log = test_dqn_replay_buffer()
plt.plot(reward_log)

## DQN with Prioritized Experience Replay

Добавим каждому переходу, хранящемуся в памяти, значение приоритета. Популярным вариантом является абсолютное значение TD-ошибки.

Однако, нужно помнить, что это значение быстро устаревает, если его не обновлять. Но и обновлять для всей памяти каждый раз - накладно. Приходится искать баланс между точностью и скоростью.

Здесь мы будем делать следующее:

- использовать TD-ошибку в качестве приоритета
- после использования батча при обучении, обновляем значения приоритета для этого батча в памяти
- будем периодически сортировать память для того, чтобы новые переходы заменяли собой те переходы, у которых наименьшие значения ошибки (т.е. наименьший приоритет)

In [None]:
def softmax(xs, temp=1000.):
    """Return the temperature-scaled softmax of ``xs``.

    Args:
        xs: Array or sequence of scores.
        temp: Temperature dividing the scores before exponentiation.

    Returns:
        Numpy array of probabilities that sums to one.
    """
    scores = xs if isinstance(xs, np.ndarray) else np.array(xs)

    # Note how large the default temperature is!
    # The maximum is subtracted before exponentiating.
    shifted = (scores - scores.max()) / temp
    weights = np.exp(shifted)
    return weights / weights.sum()

def sample_prioritized_batch(replay_buffer, n_samples):
    """Draw ``n_samples`` transitions with probabilities given by a softmax over their stored losses.

    Args:
        replay_buffer: Indexable collection of ``(loss, s, a, r, next_s, done)`` tuples.
        n_samples: Number of transitions to draw (with replacement).

    Returns:
        ``(batch, indices)``: ``batch`` is a tuple of five numpy arrays (states,
        actions, rewards, next states, done flags) and ``indices`` holds the
        buffer positions of the drawn transitions.
    """
    priorities = [entry[0] for entry in replay_buffer]
    indices = np.random.choice(len(replay_buffer), n_samples, p=softmax(priorities))

    # Drop the stored loss (field 0) and split the rest into one array per field.
    picked = [replay_buffer[idx][1:] for idx in indices]
    batch = tuple(
        np.array([transition[field] for transition in picked])
        for field in range(5)
    )
    return batch, indices

def update_batch(replay_buffer, indices, batch, new_losses):
    """Write the batch back into the buffer at ``indices`` with refreshed loss values.

    Args:
        replay_buffer: Mutable indexable collection of ``(loss, s, a, r, next_s, done)`` tuples.
        indices: Buffer positions of the batch elements.
        batch: Tuple ``(states, actions, rewards, next_states, is_done)``.
        new_losses: New loss for every element of the batch.
    """
    states, actions, rewards, next_states, is_done = batch

    for pos, buffer_idx in enumerate(indices):
        replay_buffer[buffer_idx] = (
            new_losses[pos], states[pos], actions[pos], rewards[pos], next_states[pos], is_done[pos]
        )

def sort_replay_buffer(replay_buffer):
    """Return a new deque with the samples ordered by ascending loss.

    Samples with the smallest loss end up at the left end, so a bounded deque
    evicts them first when new samples are appended. The input is not modified.
    """
    def stored_loss(sample):
        return sample[0]

    return deque(sorted(replay_buffer, key=stored_loss), maxlen=replay_buffer.maxlen)

In [None]:
def generate_session_prioritized_rb(
        env, network, opt, replay_buffer, glob_step,
        train_schedule, batch_size,
        t_max=300, epsilon=0, train=False
):
    """Play one episode with a prioritized replay buffer, training on loss-weighted batches.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        network: Q-network passed to action selection and to the loss.
        opt: Optimizer for ``network``; only used when ``train`` is True.
        replay_buffer: Deque of ``(loss, s, a, r, next_s, done)`` tuples, modified in place.
        glob_step: Global step counter at the start of the episode.
        train_schedule: A gradient step is made every ``train_schedule`` global steps;
            the buffer is re-sorted every ``25 * train_schedule`` global steps.
        batch_size: Number of transitions sampled per gradient step.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to fill the buffer and update the network.

    Returns:
        ``(total_reward, glob_step)`` with the counter advanced by the steps played.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy(network, s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)

        if train:
            # The squared TD error of the new transition is its initial priority.
            # Only a real termination removes the bootstrap term.
            with torch.no_grad():
                _, losses = compute_td_loss(network, [s], [a], [r], [next_s], [terminated])

            replay_buffer.append((losses.numpy()[0], s, a, r, next_s, terminated))

            can_sample = len(replay_buffer) >= batch_size
            if can_sample and (glob_step + 1) % train_schedule == 0:
                train_batch, indices = sample_prioritized_batch(replay_buffer, batch_size)
                states, actions, rewards, next_states, is_done = train_batch

                opt.zero_grad()
                loss, _ = compute_td_loss(network, states, actions, rewards, next_states, is_done)
                loss.backward()
                opt.step()

                with torch.no_grad():
                    # Refresh the priorities of the batch with the post-update losses.
                    _, losses = compute_td_loss(network, states, actions, rewards, next_states, is_done)
                    update_batch(replay_buffer, indices, train_batch, losses.numpy())

            # Periodically re-sort so that the lowest-loss samples are evicted first.
            # The modulus is parenthesised (`%` and `*` share precedence), and the
            # buffer is refilled in place so the caller's deque sees the new order
            # and keeps receiving the transitions appended afterwards.
            if can_sample and (glob_step + 1) % (25 * train_schedule) == 0:
                sorted_buffer = sort_replay_buffer(replay_buffer)
                replay_buffer.clear()
                replay_buffer.extend(sorted_buffer)

        glob_step += 1
        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if terminated or truncated:
            break

    return total_reward, glob_step

In [None]:
def test_dqn_prioritized_replay_buffer():
    """Train a DQN with a prioritized replay buffer on the module-level ``env``.

    Returns:
        List with the running mean evaluation reward recorded at each evaluation.
    """
    learning_rate = .0001
    eps = .5
    eps_decay = .998
    n_train_episodes = 10000
    eval_every = 50
    train_schedule = 4
    batch_size = 32
    replay_buffer = deque(maxlen=4000)
    recent_eval_rewards = deque(maxlen=5)
    glob_step = 0
    reward_log = []

    env.reset()
    network = create_network(env.observation_space.shape[0], [128, 128], env.action_space.n)
    opt = torch.optim.Adam(network.parameters(), lr=learning_rate)

    for ep in range(n_train_episodes):
        _, glob_step = generate_session_prioritized_rb(
            env, network, opt, replay_buffer, glob_step, train_schedule, batch_size, epsilon=eps, train=True
        )

        is_eval_episode = (ep + 1) % eval_every == 0
        if is_eval_episode:
            # The evaluation episode gets its own throw-away step counter.
            ep_rew, _ = generate_session_prioritized_rb(
                env, network, opt, replay_buffer, 0, train_schedule, batch_size, epsilon=eps, train=False
            )
            recent_eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(recent_eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))
            reward_log.append(running_avg_rew)

            if recent_eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps = eps * eps_decay
    return reward_log

In [None]:
# Run the prioritized-replay experiment and plot the logged running mean evaluation reward.
reward_log2 = test_dqn_prioritized_replay_buffer()
plt.plot(reward_log2)

<h1>Double DQN</h1>

In [None]:
def select_action_eps_greedy_double(networks, state, epsilon):
    """Choose an action epsilon-greedily from the element-wise minimum of two Q-networks.

    Args:
        networks: Pair of callables, each mapping a state tensor to a tensor of Q-values.
        state: Current state; converted to a float32 tensor unless it already is a tensor.
        epsilon: Exploration threshold compared against a uniform random draw.

    Returns:
        The chosen action as a Python ``int``.
    """
    state_tensor = state if isinstance(state, torch.Tensor) else torch.tensor(state, dtype=torch.float32)

    q_first = networks[0](state_tensor).detach().numpy()
    q_second = networks[1](state_tensor).detach().numpy()
    # Use the more pessimistic of the two estimates for every action.
    q_min = np.stack([q_first, q_second]).min(axis=0)

    # Explore unless the uniform draw exceeds epsilon.
    explore = not (epsilon < np.random.random())
    if explore:
        chosen = np.random.choice(q_min.shape[-1])
    else:
        chosen = np.argmax(q_min)

    return int(chosen)

In [None]:
def generate_session_double_dqn(env, networks, opts, t_max=300, epsilon=0, train=False):
    """Play one episode with two Q-networks and, when ``train`` is set, update both after every step.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        networks: Pair of Q-networks; actions are chosen from their element-wise minimum.
        opts: Pair of optimizers, one per network; only used when ``train`` is True.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to update the networks while playing.

    Returns:
        The sum of rewards collected during the episode.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy_double(networks, s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)

        if train:
            # Each network takes its own TD step on the same transition.
            for network, opt in zip(networks, opts):
                opt.zero_grad()
                # Only a real termination removes the bootstrap term; a truncated
                # episode still has a valid next state to bootstrap from.
                loss, _ = compute_td_loss(network, [s], [a], [r], [next_s], [terminated])
                loss.backward()
                opt.step()

        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if terminated or truncated:
            break

    return total_reward

In [None]:
def test_double_dqn():
    """Train the two-network DQN variant on the module-level ``env``.

    Returns:
        List with the running mean evaluation reward recorded at each evaluation.
    """
    learning_rate = .0001
    eps = .5
    eps_decay = .998
    n_train_episodes = 10000
    eval_every = 50
    recent_eval_rewards = deque(maxlen=5)
    reward_log = []

    env.reset()
    # Two networks with identical architecture, each with its own optimizer.
    networks = [
        create_network(env.observation_space.shape[0], [128, 128], env.action_space.n)
        for _ in range(2)
    ]
    opts = [torch.optim.Adam(net.parameters(), lr=learning_rate) for net in networks]

    for ep in range(n_train_episodes):
        generate_session_double_dqn(env, networks, opts, epsilon=eps, train=True)

        is_eval_episode = (ep + 1) % eval_every == 0
        if is_eval_episode:
            ep_rew = generate_session_double_dqn(env, networks, opts, epsilon=eps, train=False)
            recent_eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(recent_eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))
            reward_log.append(running_avg_rew)

            if recent_eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps = eps * eps_decay
    return reward_log

In [None]:
# Run the two-network experiment and plot the logged running mean evaluation reward.
reward_log = test_double_dqn()
plt.plot(reward_log)
plt.show()

<h1>Monte Carlo</h1>

In [None]:
def select_action_eps_greedy_mc(network, state, epsilon):
    """Choose an action epsilon-greedily with a network that scores (state, one-hot action) pairs.

    The number of actions is read from the module-level ``env``.

    Args:
        network: Callable mapping rows of ``[state, one-hot action]`` to one score per row.
        state: Current state; converted to a float32 tensor unless it already is a tensor.
        epsilon: Exploration threshold compared against a uniform random draw.

    Returns:
        The chosen action as a Python ``int``.
    """
    state_tensor = state if isinstance(state, torch.Tensor) else torch.tensor(state, dtype=torch.float32)

    # One input row per candidate action: the state followed by that action's one-hot code.
    action_count = env.action_space.n
    tiled_states = torch.vstack([state_tensor] * action_count)
    state_action_rows = torch.hstack([tiled_states, torch.eye(action_count)])
    scores = np.ravel(network(state_action_rows).detach().numpy())

    # Explore unless the uniform draw exceeds epsilon.
    explore = not (epsilon < np.random.random())
    if explore:
        chosen = np.random.choice(scores.shape[-1])
    else:
        chosen = np.argmax(scores)

    return int(chosen)
def compute_mc_loss(
        network, states, actions, disco_rewards, check_shapes=False, regularizer=.1
):
    """Compute the Monte Carlo regression loss using torch operations only.

    The network scores ``[state, one-hot action]`` rows and is fitted to the
    sampled discounted returns. The number of actions is read from the
    module-level ``n_actions``.

    Args:
        network: Callable mapping ``[batch_size, state_size + n_actions]`` inputs
            to one value per row (a trailing size-1 dimension is squeezed).
        states: Batch of states, shape [batch_size, state_size].
        actions: Batch of taken action indices, shape [batch_size].
        disco_rewards: Batch of discounted returns, shape [batch_size].
        check_shapes: When True, assert the shapes of the intermediate tensors.
        regularizer: Weight of the mean-Q term added to the loss.

    Returns:
        ``(loss, losses)``: the scalar objective and the per-sample squared errors.
    """
    # Convert the inputs to tensors.
    states = torch.tensor(np.array(states), dtype=torch.float32)    # shape: [batch_size, state_size]
    actions = torch.tensor(actions, dtype=torch.long)     # shape: [batch_size]
    disco_rewards = torch.tensor(disco_rewards, dtype=torch.float32)  # shape: [batch_size]

    # One-hot encode each row's OWN action. Indexing with `[:, actions]` would
    # set the same columns on every row instead.
    batch_size = states.shape[0]
    actions_onehot = torch.zeros([batch_size, n_actions])
    actions_onehot[torch.arange(batch_size), actions] = 1
    sa = torch.hstack([states, actions_onehot])

    # Squeeze [batch_size, 1] -> [batch_size]; otherwise subtracting the
    # [batch_size] returns would broadcast to [batch_size, batch_size].
    predicted_qvalues_for_actions = network(sa).squeeze(-1)

    losses = (predicted_qvalues_for_actions - disco_rewards) ** 2

    # Mean squared error plus a regularisation term on the Q-values.
    loss = torch.mean(losses)
    loss += regularizer * predicted_qvalues_for_actions.mean()

    if check_shapes:
        assert sa.dim() == 2, \
            "вход сети должен быть матрицей [batch_size, state_size + n_actions]"
        assert predicted_qvalues_for_actions.dim() == 1, \
            "предсказанные q-значения должны быть вектором"
        assert losses.shape == disco_rewards.shape, \
            "ошибки должны иметь ту же форму, что и дисконтированные награды"

    return loss, losses

In [None]:
def sample_batch_montecarlo(replay_buffer, n_samples, gamma=0.99, planning_horizon=100):
    """Sample transitions and compute their discounted returns from the buffer.

    For every sampled position the rewards of the following transitions are
    summed with discount ``gamma`` until the done flag is met (the flagged
    transition's reward is included), the horizon is exhausted, or the buffer ends.

    NOTE(review): the returns rely on the stored done flag to detect episode
    boundaries; episodes that ended without the flag set will leak rewards of
    the following episode into the return — confirm against the code that
    fills the buffer.

    Args:
        replay_buffer: Sequence of ``(s, a, r, next_s, done)`` tuples in the
            order they were collected.
        n_samples: Number of start positions to draw (uniformly, with replacement).
        gamma: Discount factor.
        planning_horizon: Maximum number of steps summed into one return.

    Returns:
        Three numpy arrays: states, actions and discounted returns.
    """
    indices = np.random.choice(len(replay_buffer), n_samples)
    # Random access into a deque is linear in its length, so index a list copy.
    transitions = list(replay_buffer)
    buffer_len = len(transitions)

    states, actions, disco_rewards = [], [], []
    for start in indices:
        start = int(start)
        r_disco = 0.0
        for offset in range(min(planning_horizon, buffer_len - start)):
            _, _, r, _, done = transitions[start + offset]
            # The final step's reward belongs to the return, so add it before stopping.
            r_disco += r * (gamma ** offset)
            if done:
                break
        s, a, _, _, _ = transitions[start]
        states.append(s)
        actions.append(a)
        disco_rewards.append(r_disco)

    return np.array(states), np.array(actions), np.array(disco_rewards)

In [None]:
def generate_session_montecarlo(
        env, network, opt, replay_buffer, glob_step,
        train_schedule, batch_size,
        t_max=300, epsilon=0, train=False
):
    """Play one episode, storing transitions and fitting the network to Monte Carlo returns.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        network: Network scoring ``[state, one-hot action]`` rows.
        opt: Optimizer for ``network``; only used when ``train`` is True.
        replay_buffer: Collection of ``(s, a, r, next_s, done)`` tuples, appended to in place.
        glob_step: Global step counter at the start of the episode.
        train_schedule: A gradient step is made whenever ``glob_step`` is a multiple of it.
        batch_size: Number of transitions sampled per gradient step.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to fill the buffer and update the network.

    Returns:
        ``(total_reward, glob_step)`` with the counter advanced by the steps played.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy_mc(network, s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)
        episode_over = terminated or truncated

        if train:
            # The stored flag marks the end of the episode so that returns
            # computed from the buffer stop there.
            # NOTE(review): an episode cut off by `t_max` is not flagged.
            replay_buffer.append((s, a, r, next_s, episode_over))

            if replay_buffer and glob_step % train_schedule == 0:
                train_batch = sample_batch_montecarlo(replay_buffer, batch_size)
                states, actions, disco_rewards = train_batch

                opt.zero_grad()
                loss, _ = compute_mc_loss(network, states, actions, disco_rewards)
                loss.backward()
                opt.step()

        glob_step += 1
        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if episode_over:
            break

    return total_reward, glob_step

In [None]:
def test_montecarlo():
    """Train the Monte Carlo agent on the module-level ``env``.

    Returns:
        List with the running mean evaluation reward recorded at each evaluation.
    """
    lr = .0001
    eps, eps_decay = .5, .998
    train_schedule, batch_size = 4, 32
    train_ep_len, eval_schedule = 10000, 50
    replay_buffer = deque(maxlen=4000)
    eval_rewards = deque(maxlen=5)
    reward_log = []

    env.reset()
    # The network scores [state, one-hot action] rows, hence the widened input and single output.
    network = create_network(env.observation_space.shape[0] + env.action_space.n, [128, 128], 1)
    opt = torch.optim.Adam(network.parameters(), lr=lr)
    glob_step = 0

    for ep in range(train_ep_len):
        _, glob_step = generate_session_montecarlo(
            env, network, opt, replay_buffer, glob_step, train_schedule, batch_size, epsilon=eps, train=True
        )

        if (ep + 1) % eval_schedule == 0:
            # The session returns (reward, step counter); only the reward is
            # logged. Appending the whole tuple would mix the step counter
            # into the running mean.
            ep_rew, _ = generate_session_montecarlo(
                env, network, opt, replay_buffer, 0, train_schedule, batch_size, epsilon=eps, train=False
            )
            eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))
            reward_log.append(running_avg_rew)

            if eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps *= eps_decay
    return reward_log

In [None]:
# Run the Monte Carlo experiment and plot the logged running mean evaluation reward.
reward_log = test_montecarlo()
plt.plot(reward_log)
plt.show()

<h1>Dueling Monte Carlo</h1>

In [None]:
def compute_dmc_loss(
        networks, states, actions, disco_rewards, check_shapes=False, regularizer=.1
):
    """Compute the dueling Monte Carlo loss using torch operations only.

    ``networks[0]`` estimates the state value V(s) and is fitted to the sampled
    returns. ``networks[1]`` scores ``[state, one-hot action]`` rows and is
    fitted to the advantage ``return - V(s)``. The number of actions is read
    from the module-level ``n_actions``.

    Args:
        networks: Pair ``(value_network, advantage_network)``.
        states: Batch of states, shape [batch_size, state_size].
        actions: Batch of taken action indices, shape [batch_size].
        disco_rewards: Batch of discounted returns, shape [batch_size].
        check_shapes: When True, assert the shapes of the intermediate tensors.
        regularizer: Weight of the mean-advantage term added to the loss.

    Returns:
        ``(loss, losses_advantage)``: the total scalar objective and the scalar
        mean squared error of the advantage network.
    """
    # Convert the inputs to tensors.
    states = torch.tensor(np.array(states), dtype=torch.float32)    # shape: [batch_size, state_size]
    actions = torch.tensor(actions, dtype=torch.long)     # shape: [batch_size]
    disco_rewards = torch.tensor(disco_rewards, dtype=torch.float32)  # shape: [batch_size]
    batch_size = states.shape[0]

    # State values. Squeeze [batch_size, 1] -> [batch_size] so the subtraction
    # below is element-wise rather than a [batch_size, batch_size] broadcast.
    predicted_values = networks[0](states).squeeze(-1)
    losses_values = 5 * torch.mean((predicted_values - disco_rewards) ** 2)

    # Advantage target A(s, a) = G - V(s). The value estimate is detached so
    # the advantage loss does not move the value network.
    advantages = disco_rewards - predicted_values.detach()

    # One-hot encode each row's OWN action. Indexing with `[:, actions]` would
    # set the same columns on every row instead.
    actions_onehot = torch.zeros([batch_size, n_actions])
    actions_onehot[torch.arange(batch_size), actions] = 1
    sa = torch.hstack([states, actions_onehot])
    predicted_qvalues_for_actions = networks[1](sa).squeeze(-1)

    # Mean squared error of the advantage network.
    losses_advantage = torch.mean((predicted_qvalues_for_actions - advantages) ** 2)
    # Total objective with a regularisation term on the predicted advantages.
    loss = losses_advantage + losses_values + regularizer * predicted_qvalues_for_actions.mean()

    if check_shapes:
        assert predicted_values.dim() == 1, \
            "предсказанные значения V(s) должны быть вектором"
        assert predicted_qvalues_for_actions.dim() == 1, \
            "предсказанные значения преимущества должны быть вектором"
        assert advantages.shape == predicted_qvalues_for_actions.shape, \
            "целевые и предсказанные значения преимущества должны иметь одинаковую форму"

    return loss, losses_advantage
def generate_session_dmontecarlo(
        env, networks, opts, replay_buffer, glob_step,
        train_schedule, batch_size,
        t_max=300, epsilon=0, train=False
):
    """Play one episode, storing transitions and training the value/advantage pair on Monte Carlo returns.

    Args:
        env: Environment with the ``reset()`` / ``step()`` interface used below.
        networks: Pair ``(value_network, advantage_network)``; actions are chosen with the second.
        opts: Pair of optimizers, one per network; only used when ``train`` is True.
        replay_buffer: Collection of ``(s, a, r, next_s, done)`` tuples, appended to in place.
        glob_step: Global step counter at the start of the episode.
        train_schedule: A gradient step is made whenever ``glob_step`` is a multiple of it.
        batch_size: Number of transitions sampled per gradient step.
        t_max: Maximum number of steps in the episode.
        epsilon: Exploration rate; forced to 0 when ``train`` is False.
        train: Whether to fill the buffer and update the networks.

    Returns:
        ``(total_reward, glob_step)`` with the counter advanced by the steps played.
    """
    total_reward = 0
    s, _ = env.reset()
    # Evaluation episodes are always greedy.
    epsilon = epsilon if train else 0.

    for _ in range(t_max):
        a = select_action_eps_greedy_mc(networks[1], s, epsilon=epsilon)
        next_s, r, terminated, truncated, _ = env.step(a)
        episode_over = terminated or truncated

        if train:
            # The stored flag marks the end of the episode so that returns
            # computed from the buffer stop there.
            # NOTE(review): an episode cut off by `t_max` is not flagged.
            replay_buffer.append((s, a, r, next_s, episode_over))

            if replay_buffer and glob_step % train_schedule == 0:
                train_batch = sample_batch_montecarlo(replay_buffer, batch_size)
                states, actions, disco_rewards = train_batch

                for opt in opts:
                    opt.zero_grad()
                loss, _ = compute_dmc_loss(networks, states, actions, disco_rewards)
                loss.backward()
                for opt in opts:
                    opt.step()

        glob_step += 1
        total_reward += r
        s = next_s
        # The environment must be reset after either signal, so stop stepping.
        if episode_over:
            break

    return total_reward, glob_step
def test_dmontecarlo():
    """Train the dueling Monte Carlo agent on the module-level ``env``.

    Returns:
        List with the running mean evaluation reward recorded at each evaluation.
    """
    lr = .001
    eps, eps_decay = .5, .998
    train_schedule, batch_size = 5, 128
    train_ep_len, eval_schedule = 10000, 50
    replay_buffer = deque(maxlen=4000)
    eval_rewards = deque(maxlen=5)
    reward_log = []

    env.reset()
    state_size = env.observation_space.shape[0]
    # networks[0] maps a state to one value; networks[1] scores [state, one-hot action] rows.
    networks = [
        create_network(state_size, [128, 128], 1),
        create_network(state_size + env.action_space.n, [128, 128], 1),
    ]
    opts = [torch.optim.Adam(net.parameters(), lr=lr) for net in networks]
    glob_step = 0

    for ep in range(train_ep_len):
        _, glob_step = generate_session_dmontecarlo(
            env, networks, opts, replay_buffer, glob_step, train_schedule, batch_size, epsilon=eps, train=True
        )

        if (ep + 1) % eval_schedule == 0:
            # The session returns (reward, step counter); only the reward is
            # logged. Appending the whole tuple would mix the step counter
            # into the running mean.
            ep_rew, _ = generate_session_dmontecarlo(
                env, networks, opts, replay_buffer, 0, train_schedule, batch_size, epsilon=eps, train=False
            )
            eval_rewards.append(ep_rew)
            running_avg_rew = np.mean(eval_rewards)
            print("Epoch: #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(ep, running_avg_rew, eps))
            reward_log.append(running_avg_rew)

            if eval_rewards and running_avg_rew >= 200.:
                print("Принято!")
                break

        # Decay exploration after every training episode.
        eps *= eps_decay
    return reward_log
# Run the dueling Monte Carlo experiment and plot the logged running mean evaluation reward.
reward_log = test_dmontecarlo()
plt.plot(reward_log)
plt.show()