# Тетрадка 6. Имитационное обучение: алгоритм SQIL.

Научиться имитировать поведение экспертов из демонстраций может быть непросто, особенно в средах с многомерными непрерывными наблюдениями и неизвестной динамикой. Методы копирования поведения (behavioral cloning) страдают от сдвига распределения (distribution shift): поскольку агент жадно имитирует продемонстрированные действия, он может отклоняться от продемонстрированных состояний, что приводит к накоплению ошибок.

Оригинальная статья: [SQIL: Imitation Learning via Reinforcement Learning with Sparse Rewards](https://arxiv.org/abs/1905.11108)

В данной части мы попробуем применить идею из этой статьи для среды ``LunarLanderContinuous-v2``, воспользовавшись кодом алгоритма DDPG с прошлого семинара.

In [None]:
# @title Установка зависимостей
# Dependency installation cell: detect Google Colab and, only there, install the
# system and Python packages used by the rest of the notebook.
try:
    # The import succeeds only when the google.colab package is present.
    import google.colab
    COLAB = True
except ModuleNotFoundError:
    COLAB = False
    pass

if COLAB:
    # Shell commands (IPython "!" syntax); they run only inside Colab.
    # NOTE(review): swig is presumably required to build box2d-py - confirm.
    !apt -qq update -y
    !apt -qq install swig -y
    !pip -q install box2d-py
    !pip -q install "gymnasium[classic-control, box2d, atari, accept-rom-license]"
    !pip -q install piglet
    !pip -q install imageio_ffmpeg
    !pip -q install moviepy==1.0.3

43 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 43 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Selecting previously unselected package swig4.0.
(Reading database ... 121666 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubuntu1_all.deb ...
Unpacking swig (4.0.2-1ubuntu1) ...
Setting up swig4.0 (4.0.2-1ubuntu1) ...
Setting up swig (4.0.2-1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.5/374.5 kB[0

In [None]:
# @title Импортирование зависимостей
import math
import random

import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

# библиотеки и функции, которые потребуются для показа видео
import glob
import io
import base64
from IPython import display as ipythondisplay
from IPython.display import HTML
import matplotlib.pyplot as plt
from gymnasium.wrappers.record_video import RecordVideo

%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


def show_video(folder="./video"):
    """Embed one ``.mp4`` file from *folder* into the notebook output.

    Among all ``*.mp4`` files in *folder*, the one whose last 15 path
    characters sort highest is chosen (for recorder names such as
    ``rl-video-episode-N.mp4`` this favours the highest episode suffix).
    The file is base64-encoded and displayed through an HTML ``<video>`` tag.

    Args:
        folder: Directory that is searched (non-recursively) for ``.mp4`` files.

    Returns:
        None. Prints ``"Could not find video"`` when the folder has no ``.mp4``.
    """
    mp4list = glob.glob(folder + '/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    chosen = max(mp4list, key=lambda path: path[-15:])
    # Read-only access is sufficient, and the context manager guarantees the
    # file handle is released even if reading fails.
    with open(chosen, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def show_progress(rewards_batch, log, reward_range=None):
    """Record the mean reward of a batch and redraw the learning curve.

    Args:
        rewards_batch: Rewards of the sessions played in the latest epoch.
        log: Mutable list of ``[mean_reward]`` entries; one entry is appended
            per call and the whole history is plotted.
        reward_range: Accepted for backward compatibility; the current plot
            does not use it.

    Returns:
        None. Clears the cell output and shows a matplotlib figure.
    """
    # Imported locally because the notebook's import cell does not bring
    # ``clear_output`` into scope.
    from IPython.display import clear_output

    mean_reward = np.mean(rewards_batch)
    log.append([mean_reward])

    clear_output(True)
    plt.figure(figsize=[8, 4])
    plt.subplot(1, 2, 1)
    plt.plot([entry[0] for entry in log], label='Mean rewards')
    plt.legend(loc=4)
    # A bare ``plt.grid()`` toggles visibility, so calling it twice hid the
    # grid again; request it explicitly once.
    plt.grid(True)
    plt.show()

# Pick the compute device: CUDA when PyTorch reports it as available, else CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Echo the selected device in the cell output.
print(device)

cuda


# Нормализация действия и добавление шума:

In [None]:
class NormalizedActions(gym.ActionWrapper):
    """Action wrapper that lets the agent act in the normalised range [-1, 1].

    ``action`` rescales a normalised action to the wrapped environment's
    ``[low, high]`` bounds; ``reverse_action`` applies the inverse mapping.
    """

    def action(self, action):
        """Map an action from [-1, 1] to ``[action_space.low, action_space.high]``.

        The result is clipped to the bounds, so out-of-range inputs (for
        example after adding exploration noise) stay valid.
        """
        low_bound = self.action_space.low
        upper_bound = self.action_space.high
        # Affine map: -1 -> low, +1 -> high.
        action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
        action = np.clip(action, low_bound, upper_bound)

        return action

    def reverse_action(self, action):
        """Map an environment-scale action back to the normalised range [-1, 1].

        Previously this was a stub that returned ``None``. It is now the exact
        inverse of :meth:`action` for in-range inputs.
        NOTE(review): assumes ``high > low`` in every dimension - confirm for
        environments with degenerate bounds.
        """
        low_bound = self.action_space.low
        upper_bound = self.action_space.high
        action = 2.0 * (action - low_bound) / (upper_bound - low_bound) - 1.0

        return np.clip(action, -1.0, 1.0)


class GaussNoise:
    """Exploration noise: perturbs an action with independent Gaussian noise."""

    def __init__(self, sigma):
        """Store the standard deviation ``sigma`` of the noise."""
        super().__init__()
        self.sigma = sigma

    def get_action(self, action):
        """Return a sample from a normal distribution centred on *action*.

        The spread is ``self.sigma``; the global NumPy random state is used.
        """
        return np.random.normal(loc=action, scale=self.sigma)

# Value и Policy сети:

In [None]:
class ValueNetwork(nn.Module):
    """Critic Q(s, a): an MLP over the concatenated state and action.

    The network has four Tanh-activated hidden layers of width ``hidden_size``
    followed by a linear layer with a single output.
    """

    def __init__(
            self,
            num_inputs,
            num_actions,
            hidden_size,
    ):
        super().__init__()
        # Layer widths of the hidden stack: input -> 4 x hidden.
        widths = [num_inputs + num_actions] + [hidden_size] * 4
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.Tanh())
        # Scalar value head.
        layers.append(nn.Linear(hidden_size, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return Q-values for batched ``state`` and ``action`` (joined on dim 1)."""
        joint = torch.cat([state, action], 1)
        return self.net(joint)


class PolicyNetwork(nn.Module):
    """Deterministic actor: an MLP mapping a state to an action in [-1, 1].

    The network has four Tanh-activated hidden layers of width ``hidden_size``
    and a Tanh-squashed output layer with ``num_actions`` units.
    """

    def __init__(
            self,
            num_inputs,
            num_actions,
            hidden_size,
    ):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, num_actions),
            nn.Tanh()
        )

    def forward(self, state):
        """Return the batched actions for a batch of states."""
        return self.net(state)

    def get_action(self, state):
        """Select an action for a single, unbatched observation.

        Args:
            state: One observation (array-like) without a batch dimension.

        Returns:
            A NumPy array with ``num_actions`` entries, clipped to [-1, 1].
        """
        # Use the device the network actually lives on instead of a
        # module-level ``device`` variable, which may be rebound elsewhere.
        own_device = next(self.parameters()).device
        state = torch.as_tensor(
            state, dtype=torch.float32, device=own_device
        ).unsqueeze(0)
        # Action selection needs no gradients; skip building the graph.
        with torch.no_grad():
            action = self.forward(state)
        action = action.cpu().numpy()[0]
        action = np.clip(action, -1.0, 1.0)

        return action

# DDPG обновление

<img src="https://spinningup.openai.com/en/latest/_images/math/5811066e89799e65be299ec407846103fcf1f746.svg">

Оригинальная статья:  <a href="https://arxiv.org/abs/1509.02971">Continuous control with deep reinforcement learning Arxiv</a>

In [None]:
def ddpg_update(
        state,
        action,
        reward,
        next_state,
        done,
        gamma=0.99,
        min_value=-np.inf,
        max_value=np.inf,
        soft_tau=0.001,
):
    """Run one DDPG optimisation step on a batch of transitions.

    Uses the module-level ``value_net``, ``policy_net``, their target copies,
    the two optimisers and ``device``.

    Args:
        state, action, next_state: Batched arrays, converted to float32 tensors.
            (assumes the first axis is the batch axis - TODO confirm with caller)
        reward, done: Per-transition values; each gets a trailing unit axis.
        gamma: Discount factor used in the bootstrapped target.
        min_value, max_value: Clamp range applied to the target Q-value.
        soft_tau: Interpolation weight of the soft target-network update.

    Returns:
        None. The online networks are stepped and the targets are updated in place.
    """
    state = torch.tensor(state, dtype=torch.float32).to(device)
    next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
    action = torch.tensor(action, dtype=torch.float32).to(device)
    # Add a trailing axis so reward/done line up with the critic's single output column.
    reward = torch.tensor(reward, dtype=torch.float32).unsqueeze(1).to(device)
    done = torch.tensor(np.float32(done)).unsqueeze(1).to(device)

    # Actor loss: maximise the critic's value of the actor's own actions.
    ####### Your code here ########
    policy_loss = -value_net(state, policy_net(state)).mean()
    ##############################

    next_action = target_policy_net(next_state)
    target_value = target_value_net(next_state, next_action.detach())
    # Bootstrapped target Q-value; (1 - done) drops the bootstrap term for
    # transitions flagged as done.
    ####### Your code here ########
    expected_value = reward + gamma * target_value * (1.0 - done)
    ##############################
    expected_value = torch.clamp(expected_value, min_value, max_value)

    # Critic loss: MSE between Q(s, a) and the detached target.
    value = value_net(state, action)
    value_loss = nn.MSELoss()(value, expected_value.detach())


    # Actor step. The backward pass also deposits gradients in value_net,
    # but they are cleared by value_optimizer.zero_grad() below.
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    # Critic step.
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    # Soft update of the target critic: target <- (1 - tau) * target + tau * online.
    for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

    # Soft update of the target actor with the same rule.
    for target_param, param in zip(target_policy_net.parameters(), policy_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

# Стандартная и комбинированная память прецедентов:  

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of ``(state, action, reward, next_state, done)``.

    Once ``capacity`` transitions are stored, each new one overwrites the oldest.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # slot that the next push writes to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest slot when full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still growing: the write position is the end of the list.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and stack them field by field.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of NumPy arrays
            whose first axis has length ``batch_size``.
        """
        picked = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = (
            np.stack(column) for column in zip(*picked)
        )
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class CombinedReplayBuffer:
    """Pair of replay buffers implementing the SQIL reward relabelling.

    Demonstration transitions are stored with reward 1 and the agent's own
    transitions with reward 0; the reward passed by the caller is ignored.
    Batches are drawn roughly half from each buffer.
    """

    def __init__(self, capacity):
        """Create the demo and agent buffers, each holding up to ``capacity`` items."""
        self.capacity = capacity
        self.demo = ReplayBuffer(self.capacity)
        self.agent = ReplayBuffer(self.capacity)

    def push_demo(self, state, action, reward, next_state, smart_done):
        """Store an expert transition; its reward is replaced by the constant 1."""
        reward = 1
        self.demo.push(state, action, reward, next_state, smart_done)

    def push(self, state, action, reward, next_state, smart_done):
        """Store an agent transition; its reward is replaced by the constant 0."""
        reward = 0
        self.agent.push(state, action, reward, next_state, smart_done)

    def sample(self, batch_size):
        """Draw a mixed batch: agent transitions first, then demonstrations.

        At most ``batch_size // 2`` items come from the demo buffer (fewer if
        it holds less); the remainder comes from the agent buffer.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of arrays.

        Raises:
            ValueError: if ``batch_size`` is not positive or the agent buffer
                holds fewer transitions than its share of the batch.
        """
        demo_batch_size = min(batch_size // 2, len(self.demo))
        agent_batch_size = batch_size - demo_batch_size

        # A zero-sized sub-sample cannot be stacked into five arrays, so only
        # query the buffers that actually contribute to this batch. This lets
        # the class work with an empty demo buffer or with batch_size == 1.
        parts = []
        if agent_batch_size > 0:
            parts.append(self.agent.sample(agent_batch_size))
        if demo_batch_size > 0:
            parts.append(self.demo.sample(demo_batch_size))

        if not parts:
            raise ValueError("batch_size must be a positive integer")
        if len(parts) == 1:
            return parts[0]
        return tuple(np.concatenate(field) for field in zip(*parts))

    def __len__(self):
        """Return the combined number of stored demo and agent transitions."""
        return len(self.demo) + len(self.agent)

 # Метод ``generate_session``


In [None]:
def generate_session(train=False):
    """Play one episode with the current policy and optionally train on it.

    Uses the module-level ``env``, ``policy_net``, ``noise``, ``replay_buffer``,
    ``replay_buffer_size``, ``batch_size`` and ``ddpg_update``.

    Args:
        train: When true, exploration noise is added to every action, each
            transition is pushed to the replay buffer, and a DDPG update runs
            per step once the buffer is large enough.

    Returns:
        The sum of environment rewards collected during the episode.
    """
    total_reward = 0
    # gymnasium returns (observation, info) from reset(); legacy gym returns
    # just the observation. Accept both.
    # NOTE(review): assumes observations themselves are never tuples.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    done = False
    while not done:
        action = policy_net.get_action(state)
        if train:
            action = noise.get_action(action)

        step_result = env.step(action)
        if len(step_result) == 5:
            # gymnasium API: separate termination and truncation flags.
            next_state, reward, term, trunc, _ = step_result
        else:
            # Legacy 4-tuple API: recover truncation from the TimeLimit info key.
            next_state, reward, legacy_done, info = step_result
            trunc = bool(info.get("TimeLimit.truncated", False))
            term = bool(legacy_done) and not trunc
        done = term or trunc

        if train:
            # Only true termination is stored, so the critic keeps
            # bootstrapping through time-limit cut-offs.
            replay_buffer.push(state, action, reward, next_state, term)
            # The combined length counts demo and agent transitions together;
            # with a full demo buffer this waits for more than 250 agent steps.
            if len(replay_buffer) > replay_buffer_size + 250:
                ddpg_update(*replay_buffer.sample(batch_size))
        total_reward += reward
        state = next_state

    return total_reward

In [None]:
# Shell commands (IPython "!" syntax): install system OpenGL/OSMesa development
# libraries and patchelf, then mujoco_py, free-mujoco-py and D4RL from the
# master branch of its GitHub repository.
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf
!pip install mujoco_py==2.0.2.8
!pip install free-mujoco-py
!pip install git+https://github.com/Farama-Foundation/d4rl@master#egg=d4rl


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
software-properties-common is already the newest version (0.99.22.8).
The following additional packages will be installed:
  libegl-dev libgl-dev libgles-dev libgles1 libglu1-mesa libglu1-mesa-dev libglvnd-core-dev
  libglvnd-dev libglx-dev libopengl-dev libosmesa6
The following NEW packages will be installed:
  libegl-dev libgl-dev libgl1-mesa-dev libgl1-mesa-glx libgles-dev libgles1 libglew-dev
  libglu1-mesa libglu1-mesa-dev libglvnd-core-dev libglvnd-dev libglx-dev libopengl-dev libosmesa6
  libosmesa6-dev
0 upgraded, 15 newly installed, 0 to remove and 43 not upgraded.
Need to get 3,952 kB of archives.
After this operation, 18.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libglx-dev amd64 1.4.0-1 [14.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libgl-dev amd64 1.4.0-1 [101 kB]
Get:3 http://archive.ubuntu.com/ubuntu 

In [None]:
%env D4RL_SUPPRESS_IMPORT_ERROR=1
import os
import sys
import random
import csv
from datetime import datetime
import pickle
import collections
import math

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import d4rl


ENV_NAMES = ['halfcheetah']  # walker2d, halfcheetah, hopper
DATASET_TYPES = ['medium-expert']  # medium, medium-expert, medium-replay


def _split_into_episodes(dataset, max_episode_len=1000):
    """Cut a flat D4RL transition dict into a list of per-episode dicts of arrays."""
    keys = ['observations', 'next_observations', 'actions', 'rewards', 'terminals']
    use_timeouts = 'timeouts' in dataset
    num_transitions = dataset['rewards'].shape[0]

    paths = []
    data_ = collections.defaultdict(list)
    episode_step = 0
    for i in range(num_transitions):
        done_bool = bool(dataset['terminals'][i])
        if use_timeouts:
            final_timestep = bool(dataset['timeouts'][i])
        else:
            # Fallback when the dataset has no timeout flags: cut after a
            # fixed number of steps.
            final_timestep = (episode_step == max_episode_len - 1)

        for k in keys:
            data_[k].append(dataset[k][i])

        if done_bool or final_timestep:
            # Keys keep the insertion order of ``keys`` above.
            paths.append({k: np.array(v) for k, v in data_.items()})
            data_ = collections.defaultdict(list)
            # Restart the counter at 0 for the next episode. Previously it was
            # reset and then immediately incremented, so every episode after
            # the first was cut one step short in the fallback branch.
            episode_step = 0
        else:
            episode_step += 1

    # Transitions after the last episode boundary are dropped.
    return paths


def download_d4rl_data():
    """Fetch the configured D4RL datasets and pickle them as episode lists.

    For every combination of ``ENV_NAMES`` and ``DATASET_TYPES`` the dataset
    ``{env}-{type}-v2`` is loaded through ``gym.make(...).get_dataset()``,
    split into episodes and written to ``data/{env}-{type}-v2.pkl``. Summary
    statistics of the episode returns are printed.

    Returns:
        None.
    """
    data_dir = 'data/'
    print(data_dir)

    os.makedirs(data_dir, exist_ok=True)

    for env_name in ENV_NAMES:
        for dataset_type in DATASET_TYPES:

            name = f'{env_name}-{dataset_type}-v2'
            pkl_file_path = os.path.join(data_dir, name)

            print("processing: ", name)

            env = gym.make(name)
            dataset = env.get_dataset()

            paths = _split_into_episodes(dataset)

            returns = np.array([np.sum(p['rewards']) for p in paths])
            num_samples = np.sum([p['rewards'].shape[0] for p in paths])
            print(f'Number of samples collected: {num_samples}')
            print(
                f'Trajectory returns: mean = {np.mean(returns)}, std = {np.std(returns)}, max = {np.max(returns)}, min = {np.min(returns)}')

            with open(f'{pkl_file_path}.pkl', 'wb') as f:
                pickle.dump(paths, f)


download_d4rl_data()

env: D4RL_SUPPRESS_IMPORT_ERROR=1
Compiling /usr/local/lib/python3.10/dist-packages/mujoco_py/cymj.pyx because it changed.
[1/1] Cythonizing /usr/local/lib/python3.10/dist-packages/mujoco_py/cymj.pyx


INFO:root:running build_ext
INFO:root:building 'mujoco_py.cymj' extension
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-310
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-310/usr
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-310/usr/local
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-310/usr/local/lib
INFO:root:creating /usr/local/lib/python3.10/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-31

data/
processing:  halfcheetah-medium-expert-v2
Downloading dataset: http://rail.eecs.berkeley.edu/datasets/offline_rl/gym_mujoco_v2/halfcheetah_medium_expert-v2.hdf5 to /root/.d4rl/datasets/halfcheetah_medium_expert-v2.hdf5


load datafile: 100%|██████████| 9/9 [00:05<00:00,  1.59it/s]


Number of samples collected: 2000000
Trajectory returns: mean = 7713.38037109375, std = 2970.242431640625, max = 11252.03515625, min = -310.23419189453125


In [None]:
# Experiment configuration: dataset selection, evaluation/training
# hyper-parameters, file locations and the compute device.
dataset = "medium-expert"       # medium / medium-replay / medium-expert
rtg_scale = 1000                # scale to normalize returns to go

env_name = 'HalfCheetah-v3'
rtg_target = 6000
env_d4rl_name = f'halfcheetah-{dataset}-v2'



max_eval_ep_len = 1000      # max len of one evaluation episode
num_eval_ep = 10            # num of evaluation episodes per iteration

batch_size = 64             # training batch size
lr = 1e-4                   # learning rate
wt_decay = 1e-4             # weight decay
warmup_steps = 10000        # warmup steps for lr scheduler

# total updates = max_train_iters x num_updates_per_iter
max_train_iters = 200
num_updates_per_iter = 100

context_len = 20        # K in decision transformer
n_blocks = 3            # num of transformer blocks
embed_dim = 128         # embedding (hidden) dim of transformer
n_heads = 1             # num of transformer heads
dropout_p = 0.1         # dropout probability



# load data from this file
dataset_path = f'data/{env_d4rl_name}.pkl'

# saves model and csv in this directory
log_dir = "./dt_runs/"


# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(log_dir, exist_ok=True)


# training and evaluation device: fall back to the CPU when CUDA is not
# available instead of failing later on the first ``.to(device)`` call
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print("device set to: ", device)

device set to:  cuda


In [None]:
# Load the list of per-episode dicts from the pickle at ``dataset_path``.
# NOTE(review): pickle.load executes arbitrary code from the file - only use it
# on files produced locally, never on untrusted downloads.
with open(dataset_path, 'rb') as f:
    trajectories = pickle.load(f)

In [None]:
# Fill the demonstration half of the SQIL buffer from the loaded trajectories.
replay_buffer_size = 100000
replay_buffer = CombinedReplayBuffer(replay_buffer_size)

for dd in trajectories:
    # Address the fields by key rather than by the position of dd.values(),
    # so a different key order in the pickle cannot silently swap e.g.
    # actions and next observations.
    transitions = zip(
        dd['observations'],
        dd['actions'],
        dd['next_observations'],
        dd['terminals'],
    )
    for state, action, next_state, terminated in transitions:
        # The reward argument is a placeholder: push_demo overrides it.
        replay_buffer.push_demo(state, action, 1, next_state, terminated)

In [None]:
# Environment, exploration noise, networks and optimisers for this run.
max_steps = 1000
env = NormalizedActions(gym.make(env_name, max_episode_steps=max_steps))
noise = GaussNoise(sigma=0.001)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
hidden_dim = 128

# Online networks first, then their target copies (same construction order).
value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

# Start every target network as an exact copy of its online counterpart.
for target_net, online_net in (
        (target_value_net, value_net),
        (target_policy_net, policy_net),
):
    for target_param, param in zip(target_net.parameters(), online_net.parameters()):
        target_param.data.copy_(param.data)

value_lr = 1e-3
policy_lr = 1e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr, weight_decay=1e-6)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr, weight_decay=1e-6)

batch_size = 256

  finally:


In [None]:
def generate_session(train=False):
    """Play one episode with the current policy and optionally train on it.

    Uses the module-level ``env``, ``policy_net``, ``noise``, ``replay_buffer``,
    ``replay_buffer_size``, ``batch_size`` and ``ddpg_update``.

    Args:
        train: When true, exploration noise is added to every action, each
            transition is pushed to the replay buffer, and a DDPG update runs
            per step once the buffer is large enough.

    Returns:
        The sum of environment rewards collected during the episode.
    """
    total_reward = 0
    # Legacy gym returns the observation from reset(); gymnasium returns
    # (observation, info). Accept both.
    # NOTE(review): assumes observations themselves are never tuples.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    done = False
    while not done:
        action = policy_net.get_action(state)
        if train:
            action = noise.get_action(action)

        step_result = env.step(action)
        if len(step_result) == 5:
            # 5-tuple API: separate termination and truncation flags.
            next_state, reward, terminated, truncated, info = step_result
        else:
            # 4-tuple API: the TimeLimit wrapper marks pure time-outs in info.
            next_state, reward, legacy_done, info = step_result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(legacy_done) and not truncated
        done = terminated or truncated

        if train:
            # Store only true termination. Flagging a time-limit cut-off as
            # terminal would zero the bootstrap term for a state that is not
            # actually absorbing.
            replay_buffer.push(state, action, reward, next_state, terminated)
            # The combined length counts demo and agent transitions together;
            # with a full demo buffer this waits for more than 1500 agent steps.
            if len(replay_buffer) > replay_buffer_size + 1500:
                ddpg_update(*replay_buffer.sample(batch_size))
        total_reward += reward
        state = next_state

    return total_reward

In [None]:
len(replay_buffer)

100000

In [None]:
generate_session(train=False)

-2.2861037507085555

In [None]:
# Training driver: up to 100 epochs of 10 training sessions each, stopping
# early once the epoch's mean reward exceeds 7500.
env = NormalizedActions(gym.make(env_name, max_episode_steps=max_steps))

valid_mean_rewards = []
for i in range(100):
    session_rewards_train = [generate_session(train=True) for _ in range(10)]
    mean_reward = np.mean(session_rewards_train)

    print(f"epoch #{i:02d}\tmean reward (train) = {mean_reward:.3f}\t")

    # Keep training until the target score is exceeded.
    if not mean_reward > 7500:
        continue
    print("Выполнено!")
    break

env.close()

  finally:


epoch #00	mean reward (train) = -421.217	
epoch #01	mean reward (train) = -455.620	
epoch #02	mean reward (train) = 306.871	
epoch #03	mean reward (train) = 878.872	
epoch #04	mean reward (train) = 948.230	
epoch #05	mean reward (train) = 859.924	
epoch #06	mean reward (train) = 1954.258	
epoch #07	mean reward (train) = 2338.029	
epoch #08	mean reward (train) = 2488.746	
epoch #09	mean reward (train) = 2108.793	
epoch #10	mean reward (train) = 3000.234	
epoch #11	mean reward (train) = 2758.159	
epoch #12	mean reward (train) = 4275.054	
epoch #13	mean reward (train) = 3048.987	
epoch #14	mean reward (train) = 4752.983	
epoch #15	mean reward (train) = 5272.843	
epoch #16	mean reward (train) = 5715.157	
epoch #17	mean reward (train) = 4839.628	
epoch #18	mean reward (train) = 4715.176	
epoch #19	mean reward (train) = 4918.997	
epoch #20	mean reward (train) = 4262.915	


# Задание гиперпараметров и инициализация всего и вся:

In [None]:
# Environment, exploration noise, networks and optimisers for the
# LunarLanderContinuous-v2 experiment.
env_name = "LunarLanderContinuous-v2"

max_steps = 350
env = NormalizedActions(gym.make(env_name, max_episode_steps=max_steps))

noise = GaussNoise(sigma=0.1)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
hidden_dim = 512

# Online networks first, then their target copies (same construction order).
value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

# Start every target network as an exact copy of its online counterpart.
for target_net, online_net in (
        (target_value_net, value_net),
        (target_policy_net, policy_net),
):
    for target_param, param in zip(target_net.parameters(), online_net.parameters()):
        target_param.data.copy_(param.data)

value_lr = 1e-3
policy_lr = 1e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr, weight_decay=1e-6)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr, weight_decay=1e-6)

batch_size = 128

### Генерация экспертных данных:

In [None]:
from gymnasium.envs.box2d.lunar_lander import heuristic

# Collect demonstrations: roll out the built-in heuristic controller with
# Gaussian action noise until the combined buffer holds replay_buffer_size items.
replay_buffer_size = 100000
replay_buffer = CombinedReplayBuffer(replay_buffer_size)

noise = GaussNoise(sigma=0.1)
episodes = 0
while len(replay_buffer) < replay_buffer_size:
    episodes += 1
    episode_reward = 0
    state, _ = env.reset()
    done = False
    while not done:
        action = noise.get_action(heuristic(env, state))
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The reward argument is a placeholder: push_demo overrides it.
        replay_buffer.push_demo(state, action, 1, next_state, terminated)
        episode_reward += reward
        state = next_state
        done = terminated or truncated
    # Progress report every 100 episodes.
    if episodes % 100 == 0:
        print(f"episode: {episodes}, reward: {episode_reward}, replay_size:", len(replay_buffer))

# Use a much smaller noise level for the agent's own training afterwards.
noise = GaussNoise(sigma=0.001)
env.close()

  logger.warn(


episode: 100, reward: 299.05841105292996, replay_size: 20271
episode: 200, reward: 286.6264456101353, replay_size: 40888
episode: 300, reward: 271.27487454272466, replay_size: 60650
episode: 400, reward: 239.37888402131665, replay_size: 80747


In [None]:
# Training driver: up to 100 epochs of 10 training sessions each, stopping
# early once the epoch's mean reward exceeds 200.
env = NormalizedActions(gym.make(env_name, max_episode_steps=max_steps))

valid_mean_rewards = []
for i in range(100):
    session_rewards_train = [generate_session(train=True) for _ in range(10)]
    mean_reward = np.mean(session_rewards_train)

    print(f"epoch #{i:02d}\tmean reward (train) = {mean_reward:.3f}\t")

    # Keep training until the target score is exceeded.
    if not mean_reward > 200:
        continue
    print("Выполнено!")
    break

env.close()

epoch #00	mean reward (train) = -227.405	
epoch #01	mean reward (train) = -347.010	
epoch #02	mean reward (train) = -190.934	
epoch #03	mean reward (train) = 34.860	
epoch #04	mean reward (train) = 33.180	
epoch #05	mean reward (train) = 229.317	
Выполнено!


# Посмотрим за полетом:

In [None]:
# Record one episode of the trained policy (no exploration noise) to ./video
# and display the resulting clip inline.
env = RecordVideo(
    NormalizedActions(
        gym.make(env_name, max_episode_steps=max_steps, render_mode='rgb_array')
    ),
    "./video",
)

state, info = env.reset()
done = False

while not done:
    state, _, term, trunc, info = env.step(policy_net.get_action(state))
    done = term or trunc

# Close the environment before looking for the video file.
env.close()
show_video()

Moviepy - Building video /content/video/rl-video-episode-0.mp4.
Moviepy - Writing video /content/video/rl-video-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/video/rl-video-episode-0.mp4
