In [1]:
import torch.nn as nn
import torch
from torch import optim
import copy
import pickle
from sim_data_generation import  StateActionTransition
import random
# Grid dimensions used when drawing random actions in select_action.
# NOTE(review): WEIGHT is presumably a typo for WIDTH; the name is kept because
# other cells reference it.
WEIGHT, HEIGHT = 40, 80

# Prefer the GPU but fall back to the CPU so the notebook also runs on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load previously saved results from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that come from a trusted source.
with open("saved_results_5.pkl", "rb") as f:
    data = pickle.load(f)

In [3]:
# Flatten the nested collection loaded above into a single flat list.
res = []
for group in data:
    res.extend(group)

In [4]:
len(res)

128

In [5]:
# Pick one flattened item to use as a sample for the experiments below.
example_dqn = res[1]

In [6]:
# Sample state; later cells apply permute(2, 0, 1) to it, so it is presumably
# laid out channels-last (H, W, C) -- TODO confirm.
example_field = example_dqn.state

In [40]:
def get_conv_block(in_channel, out_channel, kernel_size=(3, 3)):
    """Build a two-stage convolutional block.

    Each stage is ``Conv2d -> BatchNorm2d -> ReLU`` with ``padding=1``. The
    first stage maps ``in_channel`` to ``out_channel`` channels, the second
    keeps ``out_channel`` channels.

    Args:
        in_channel: number of channels of the incoming feature map.
        out_channel: number of channels produced by both convolutions.
        kernel_size: kernel size handed to both ``nn.Conv2d`` layers.

    Returns:
        An ``nn.Sequential`` holding the six layers in order.
    """
    def stage(channels_in):
        # One conv -> norm -> activation triple.
        return [
            nn.Conv2d(channels_in, out_channel, kernel_size=kernel_size, padding=1),
            nn.BatchNorm2d(out_channel),
            nn.ReLU(),
        ]

    return nn.Sequential(*stage(in_channel), *stage(out_channel))

In [41]:
def get_deconv_block(in_channel, out_channel, kernel_size=(3, 3)):
    """Build a two-stage transposed-convolution block.

    Each stage is ``ConvTranspose2d -> BatchNorm2d -> ReLU`` with
    ``padding=1``. The first stage maps ``in_channel`` to ``out_channel``
    channels, the second keeps ``out_channel`` channels.

    Args:
        in_channel: number of channels of the incoming feature map.
        out_channel: number of channels produced by both layers.
        kernel_size: kernel size handed to both ``nn.ConvTranspose2d`` layers.

    Returns:
        An ``nn.Sequential`` holding the six layers in order.
    """
    def stage(channels_in):
        # One transposed-conv -> norm -> activation triple.
        return [
            nn.ConvTranspose2d(channels_in, out_channel, kernel_size=kernel_size, padding=1),
            nn.BatchNorm2d(out_channel),
            nn.ReLU(),
        ]

    return nn.Sequential(*stage(in_channel), *stage(out_channel))

In [90]:
class Unet(nn.Module):
    """Three-level encoder-decoder that upsamples with ``MaxUnpool2d``.

    The encoder applies three conv blocks (8 -> 16 -> 16 -> 32 channels), each
    followed by a 2x2 max-pool that also returns the argmax indices. The
    decoder mirrors it: every level un-pools with the indices saved by the
    matching pool and then applies a transposed-conv block
    (32 -> 16 -> 16 -> 1 channels).

    The output has a single channel and the same height/width as the input.

    NOTE(review): ``get_deconv_block`` ends with ``ReLU``, so the output is
    non-negative -- confirm this is intended for the values being predicted.
    """

    def __init__(self):
        super().__init__()

        # encoder (downsampling): every pool halves height and width
        self.enc_conv0 = get_conv_block(8, 16)
        self.pool0 = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)

        self.enc_conv1 = get_conv_block(16, 16)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)

        self.enc_conv2 = get_conv_block(16, 32)
        self.pool2 = nn.MaxPool2d(kernel_size=2, return_indices=True)

        # decoder (upsampling): every un-pool doubles height and width
        self.upsample0 = nn.MaxUnpool2d(kernel_size=2)
        self.dec_conv0 = get_deconv_block(32, 16)

        self.upsample1 = nn.MaxUnpool2d(kernel_size=2)
        self.dec_conv1 = get_deconv_block(16, 16)

        self.upsample2 = nn.MaxUnpool2d(kernel_size=2)
        self.dec_conv2 = get_deconv_block(16, 1)

    def forward(self, x):
        """Map an ``(N, 8, H, W)`` tensor to an ``(N, 1, H, W)`` tensor."""
        # encoder
        e0, ind0 = self.pool0(self.enc_conv0(x))
        e1, ind1 = self.pool1(self.enc_conv1(e0))
        e2, ind2 = self.pool2(self.enc_conv2(e1))

        # decoder
        # Max-pooling floors odd sizes, so each un-pool is told the size of the
        # map that was pooled. The conv blocks keep height/width (3x3 kernels,
        # padding 1), so the block input has the right spatial size.
        d0 = self.dec_conv0(self.upsample0(e2, indices=ind2, output_size=e1.size()))
        d1 = self.dec_conv1(self.upsample1(d0, indices=ind1, output_size=e0.size()))
        # The last level previously omitted output_size and therefore could not
        # restore inputs with an odd height or width.
        d2 = self.dec_conv2(self.upsample2(d1, indices=ind0, output_size=x.size()))
        return d2

In [112]:
class conv2DBatchNormRelu(nn.Module):
    """``Conv2d -> BatchNorm2d -> ReLU(inplace)`` packaged as a single module.

    Args:
        in_channels: input channel count (coerced with ``int``).
        n_filters: output channel count (coerced with ``int``).
        k_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
    """

    def __init__(self, in_channels, n_filters, k_size, stride, padding):
        super().__init__()

        channels_in = int(in_channels)
        channels_out = int(n_filters)
        layers = [
            nn.Conv2d(channels_in, channels_out, kernel_size=k_size, padding=padding, stride=stride),
            nn.BatchNorm2d(channels_out),
            nn.ReLU(inplace=True),
        ]
        # Kept under the attribute name ``unit`` so state-dict keys are unchanged.
        self.unit = nn.Sequential(*layers)

    def forward(self, inputs):
        """Apply the three layers to ``inputs`` and return the result."""
        return self.unit(inputs)


In [119]:
class UNet2(nn.Module):
    """U-Net style network with learned (strided-convolution) resampling.

    Encoder: four stacks of ``conv2DBatchNormRelu`` units
    (8 -> 64 -> 128 -> 256 -> 512 channels), each followed by a stride-2
    3x3 convolution that downsamples. A 1x1-convolution bottleneck
    (512 -> 1024 -> 512) follows. Decoder: four stride-2 transposed
    convolutions, each concatenated with the matching encoder output along the
    channel axis and reduced by another stack, ending in a single channel.
    """

    @staticmethod
    def _stack(widths, k_size=3, padding=1):
        """Return stride-1 ``conv2DBatchNormRelu`` units chained through ``widths``."""
        return [
            conv2DBatchNormRelu(c_in, c_out, k_size, 1, padding)
            for c_in, c_out in zip(widths, widths[1:])
        ]

    @staticmethod
    def _down(channels):
        """Stride-2 3x3 convolution used in place of pooling."""
        return nn.Conv2d(channels, channels, kernel_size=3, stride=2, padding=1)

    @staticmethod
    def _up(channels):
        """Stride-2 3x3 transposed convolution used for upsampling."""
        return nn.ConvTranspose2d(channels, channels, kernel_size=3, stride=2, padding=1)

    def __init__(self):
        super().__init__()

        # encoder
        self.enc_conv0 = nn.Sequential(*self._stack((8, 64, 64)))
        self.pool0 = self._down(64)

        self.enc_conv1 = nn.Sequential(*self._stack((64, 128, 128)))
        self.pool1 = self._down(128)

        self.enc_conv2 = nn.Sequential(*self._stack((128, 256, 256, 256)))
        self.pool2 = self._down(256)

        self.enc_conv3 = nn.Sequential(*self._stack((256, 512, 512, 512)))
        self.pool3 = self._down(512)

        # bottleneck: two 1x1 units, widening to 1024 channels and back
        self.bottle_neck = nn.Sequential(*self._stack((512, 1024, 512), k_size=1, padding=0))

        # decoder: each stack receives the upsampled map concatenated with the
        # encoder output of the same level, hence the doubled input width
        self.upsample3 = self._up(512)
        self.dec_conv3 = nn.Sequential(*self._stack((512 * 2, 256, 256, 256)))

        self.upsample2 = self._up(256)
        self.dec_conv2 = nn.Sequential(*self._stack((256 * 2, 128, 128, 128)))

        self.upsample1 = self._up(128)
        self.dec_conv1 = nn.Sequential(*self._stack((128 * 2, 64, 64)))

        self.upsample0 = self._up(64)
        self.dec_conv0 = nn.Sequential(
            *self._stack((64 * 2, 1, 1)),
            # final projection: convolution + batch norm without an activation
            nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(1),
        )

    def forward(self, x):
        """Run the encoder, the bottleneck and the skip-connected decoder on ``x``."""
        # encoder: keep the pre-downsampling output of every level as a skip
        skip0 = self.enc_conv0(x)
        skip1 = self.enc_conv1(self.pool0(skip0))
        skip2 = self.enc_conv2(self.pool1(skip1))
        skip3 = self.enc_conv3(self.pool2(skip2))

        # bottleneck
        out = self.bottle_neck(self.pool3(skip3))

        # decoder: upsample to the skip's size, concatenate on channels, reduce
        levels = (
            (self.upsample3, self.dec_conv3, skip3),
            (self.upsample2, self.dec_conv2, skip2),
            (self.upsample1, self.dec_conv1, skip1),
            (self.upsample0, self.dec_conv0, skip0),
        )
        for upsample, decode, skip in levels:
            grown = upsample(out, output_size=skip.size())
            out = decode(torch.cat([grown, skip], 1))

        return out

In [232]:
# Re-import unet_model so that edits made to the module on disk are picked up
# without restarting the kernel.
import importlib
import unet_model
unet_module = importlib.reload(unet_model)

In [233]:
# NOTE: this instantiates UNet2 from the unet_model module, not the UNet2 class
# defined earlier in this notebook.
model = unet_model.UNet2()

In [234]:
# Build a single-item batch: move the last axis to the front, then prepend a
# batch axis.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# later cells refer to it.
input = torch.FloatTensor(example_field).permute(2, 0, 1).unsqueeze(0)

In [235]:
input.shape

torch.Size([1, 8, 40, 80])

In [236]:
model

UNet2(
  (bottle_neck): Sequential(
    (0): conv2DBatchNormRelu(
      (unit): Sequential(
        (0): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
    )
  )
  (last_block): Sequential(
    (0): Conv2d(1, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [237]:
# Shape check: run a forward pass, take the max over dim 0 (the batch axis) and
# look at the first slice of what remains.
model(input).max(0).values[0, ...].shape

torch.Size([40, 80])

In [225]:
import numpy as np

# Count the scalar entries of every parameter that takes part in training.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
params

263692

In [198]:
model.bottle_neck

Sequential(
  (0): conv2DBatchNormRelu(
    (unit): Sequential(
      (0): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
      (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
  )
)

In [96]:
def create_new_model(model_class):
    """Create the online network, its target copy and an optimizer.

    Args:
        model_class: a zero-argument callable (typically an ``nn.Module``
            subclass) that builds the network. An already constructed
            ``nn.Module`` instance is also accepted; it is deep-copied so the
            caller's object is left untouched.

    Returns:
        tuple: ``(model, target_model, optimizer)`` where both networks live on
        the module-level ``device``, ``target_model`` starts with exactly the
        same weights as ``model``, and ``optimizer`` is Adam (lr=3e-4) over the
        parameters of ``model`` only.
    """
    if isinstance(model_class, nn.Module):
        # Calling an instance would run forward() instead of building a network.
        model = copy.deepcopy(model_class)
    else:
        model = model_class()

    # Move the network to the device chosen at the top of the notebook.
    model.to(device)

    # The target network must start from the online network's weights; building
    # a second, independently initialised instance would give it different ones.
    target_model = copy.deepcopy(model)

    # Optimizer that updates the weights of the online network.
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    return model, target_model, optimizer

In [72]:
import torch.nn.functional as F

gamma = 0.99  # discount factor applied to the bootstrapped next-state value


def fit(batch, model, target_model, optimizer):
    """Run one Q-learning optimisation step on a sampled batch.

    Args:
        batch: five sequences ``(state, action, reward, next_state, done)``,
            each holding one entry per transition.
        model: online network being trained; its output is indexed along
            dim 1 by ``action``.
        target_model: network used to compute the bootstrapped targets.
        optimizer: optimizer over the parameters of ``model``.

    Returns:
        float: the mean-squared TD error of the batch before the update.

    NOTE(review): the indexing below (``max(1)`` / ``gather(1, ...)``) assumes
    the networks return ``(batch, n_actions)`` and that ``action`` holds one
    integer index per transition -- confirm against the models and the action
    format actually used.
    """
    state, action, reward, next_state, done = batch

    # Put the batch on the device the online network lives on, so the function
    # does not depend on a global that might disagree with the model.
    model_device = next(model.parameters()).device
    state = torch.as_tensor(state, device=model_device).float()
    next_state = torch.as_tensor(next_state, device=model_device).float()
    reward = torch.as_tensor(reward, device=model_device).float()
    # gather() needs int64 indices.
    action = torch.as_tensor(action, device=model_device).long()
    # Force a boolean mask: integer 0/1 flags would otherwise be interpreted as
    # positions to overwrite instead of a per-item selection.
    done = torch.as_tensor(done, device=model_device).bool()

    # Values the network is expected to produce.
    with torch.no_grad():
        # Best Q-value of the next state; terminal transitions bootstrap from 0.
        target_q = target_model(next_state).max(1)[0].view(-1)
        target_q[done] = 0
    target_q = reward + target_q * gamma

    # Current prediction for the actions that were actually taken.
    q = model(state).gather(1, action.unsqueeze(1))

    loss = F.mse_loss(q, target_q.unsqueeze(1))

    # Clear stale gradients, then back-propagate the error.
    optimizer.zero_grad()
    loss.backward()

    # Clamp every gradient entry to [-1, 1] so updates cannot become too large;
    # this helper skips parameters that received no gradient.
    torch.nn.utils.clip_grad_value_(model.parameters(), 1.0)

    # Optimisation step.
    optimizer.step()

    return loss.item()

In [122]:

def select_action(state, epsilon, model):
    """Choose an ``(x, y)`` action with an epsilon-greedy policy.

    Args:
        state: array-like observation; its last axis is moved to the front
            before it is given to ``model``.
        epsilon: probability of returning a uniformly random action.
        model: network whose output unpacks into exactly two values.

    Returns:
        tuple: ``(x, y)``. Random actions are floats with
        ``0 <= x < WEIGHT`` and ``0 <= y < HEIGHT``; greedy actions are the
        rounded model outputs.
    """
    # Decide on exploration first so no tensor work is done for random actions.
    if random.random() < epsilon:
        # random.randint includes its upper bound, which allowed the
        # out-of-range values WEIGHT and HEIGHT; randrange excludes it.
        return float(random.randrange(WEIGHT)), \
               float(random.randrange(HEIGHT))

    model_input = torch.FloatTensor(state).to(device)
    model_input = model_input.permute(2, 0, 1)

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        model_output = model(model_input).to("cpu").detach().numpy()
    x, y = model_output
    return round(x), round(y)

In [123]:
# NOTE(review): `big_model` is not defined in any of the visible cells, so this
# cell depends on leftover kernel state and will fail on Restart & Run All.
model = big_model.to(device)
select_action(example_field, model=model, epsilon=0.05)

(0, 0)

In [76]:
class Memory:
    """Fixed-capacity ring buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of stored elements
        self.memory = []          # backing list; grows until it holds `capacity` items
        self.position = 0         # index the next push writes to

    def push(self, element: StateActionTransition):
        """Store ``element``, overwriting the oldest entry once the buffer is full."""
        buffer = self.memory
        if len(buffer) < self.capacity:
            # Still growing: reserve the slot that is written just below.
            buffer.append(None)
        write_at = self.position
        buffer[write_at] = element
        self.position = (write_at + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct elements and return them regrouped by field."""
        chosen = random.sample(self.memory, batch_size)
        return [*zip(*chosen)]

    def __len__(self):
        return len(self.memory)

In [None]:


def train():
    """Train a model with an epsilon-greedy DQN loop and report greedy returns.

    Every ``target_update`` steps the target network is refreshed from the
    online network and one greedy episode is played with it; the total reward
    of that episode is recorded.

    Relies on the module-level names ``env`` and ``big_model``.
    NOTE(review): neither is defined in the visible cells -- confirm they exist
    before running.

    Returns:
        list: total reward of each greedy evaluation episode, in order.
    """
    # Number of model updates between two target-model refreshes.
    target_update = 500

    # Size of one batch fed to the model.
    batch_size = 64

    # Number of environment steps.
    max_steps = 5000

    # Bounds of the exploration coefficient.
    max_epsilon = 0.5
    min_epsilon = 0.1

    # Create the model and the replay buffer.
    memory = Memory(2000)
    model, target_model, optimizer = create_new_model(big_model)
    rewards_by_target_updates = []

    # Start the first episode; without this `state` was read before it had
    # ever been assigned.
    state = env.reset()

    for step in range(max_steps):
        # Take one step in the environment; epsilon decays linearly.
        epsilon = max_epsilon - (max_epsilon - min_epsilon) * step / max_steps
        action = select_action(state, epsilon, model)
        new_state, reward, done, _ = env.step(action)

        # Remember the experience and restart the environment if needed.
        memory.push((state, action, reward, new_state, done))
        if done:
            state = env.reset()
        else:
            state = new_state

        # Gradient descent.
        if step > batch_size:
            fit(memory.sample(batch_size), model, target_model, optimizer)

        if step % target_update == 0:
            target_model = copy.deepcopy(model)

            # Exploitation: play one greedy episode with the fresh target model.
            state = env.reset()
            total_reward = 0
            episode_done = False
            while not episode_done:
                action = select_action(state, 0, target_model)
                state, reward, episode_done, _ = env.step(action)
                total_reward += reward

            # Training resumes from a fresh episode.
            state = env.reset()
            rewards_by_target_updates.append(total_reward)

    return rewards_by_target_updates