In [3]:
import torch.nn as nn
import torch
from torch import optim
import copy
import pickle
import random
import numpy as np

from sim_data_generation import  StateActionTransition
from src.envs.envs import BlackOilEnv

# Extent of the action grid along its two axes. Actions in this notebook are
# (x, y) pairs with x < WEIGHT and y < HEIGHT, flattened as x * HEIGHT + y.
# NOTE(review): "WEIGHT" looks like a typo for "WIDTH" — confirm before renaming.
WEIGHT = 80
HEIGHT = 40

# Device that model inputs and training tensors are moved to.
device = torch.device("cuda")

In [4]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("saved_results_5.pkl", "rb") as f:
    data = pickle.load(f)

# The loaded object is iterated two levels deep; flatten it into one list.
res = [entry for group in data for entry in group]

# Every flattened entry exposes a `.state` attribute.
example_fields = [entry.state for entry in res]
example_dqn = res[1]
example_field = example_dqn.state

In [5]:
# Display the example state taken from the second loaded entry.
example_field

array([[[1.80579588e+00, 1.56305696e+00, 3.25635336e-12, ...,
         9.35601278e-24, 2.82256183e+00, 0.00000000e+00],
        [1.80394850e+00, 1.55744098e+00, 2.43528046e-11, ...,
         5.22175017e-22, 2.80954332e+00, 0.00000000e+00],
        [1.80210807e+00, 1.55167870e+00, 1.61212636e-10, ...,
         2.28362679e-20, 2.79629271e+00, 0.00000000e+00],
        ...,
        [1.77916384e+00, 1.39540629e+00, 3.59415496e-06, ...,
         1.22040102e-11, 2.48265640e+00, 0.00000000e+00],
        [1.77989426e+00, 1.39377292e+00, 1.01229167e-06, ...,
         9.68349180e-13, 2.48076843e+00, 0.00000000e+00],
        [1.78098566e+00, 1.39266113e+00, 2.45708847e-07, ...,
         5.70606246e-14, 2.48030950e+00, 0.00000000e+00]],

       [[1.80381408e+00, 1.55993263e+00, 1.32069747e-11, ...,
         1.54102199e-22, 2.81382845e+00, 0.00000000e+00],
        [1.80174353e+00, 1.55418424e+00, 9.87702476e-11, ...,
         8.60044887e-21, 2.80024139e+00, 0.00000000e+00],
        [1.79966098e+00, 

In [6]:
import importlib
import unet_model
# Re-execute the model module so that edits to its source are picked up.
unet_module = importlib.reload(unet_model)

model = unet_model.UNet2()
model.to_cuda()

import numpy as np
# Sum the element counts of every parameter that requires gradients.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
params

263692

In [22]:
def create_new_model(ModelClass: unet_model.UNet2):
    """Build the online network, its target network and the optimizer.

    Args:
        ModelClass: Zero-argument callable (the model class itself, not an
            instance). Its instances must provide ``to_cuda()``,
            ``parameters()``, ``state_dict()`` and ``load_state_dict()``.

    Returns:
        Tuple ``(model, target_model, optimizer)``. The optimizer is Adam with
        learning rate 3e-4 over the parameters of ``model`` only.
    """
    model = ModelClass()
    target_model = ModelClass()

    # The two networks are constructed independently, so presumably they start
    # from different random weights. Copy the online weights into the target
    # network so the first bootstrapped targets come from the same function
    # that is being trained.
    target_model.load_state_dict(model.state_dict())

    # Move both networks with the model's own helper.
    # NOTE(review): to_cuda() presumably targets the GPU unconditionally and
    # ignores the module-level `device` — confirm in unet_model.
    model.to_cuda()
    target_model.to_cuda()

    # Optimizer that updates the online network's weights.
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    return model, target_model, optimizer


def run_model(state, model):
    """Feed a batch of states through ``model`` and flatten each sample's output.

    ``state`` is converted to a float32 tensor, moved to ``device`` and its
    axes are reordered with ``permute(0, 3, 1, 2)`` before the forward pass.
    Assumption (TODO confirm): ``state`` is laid out as
    (batch, dim1, dim2, channels).

    Returns:
        The model output with every axis after the first one flattened.
    """
    batch = torch.FloatTensor(state).to(device).permute(0, 3, 1, 2)
    return model(batch).flatten(start_dim=1)


def select_action(state, model, epsilon=0.05):
    """Pick an epsilon-greedy grid action for every state in the batch.

    Args:
        state: Batch of states; the first axis is the batch axis.
        model: Network handed to ``run_model`` to score the actions.
        epsilon: Probability that the whole batch gets random actions instead
            of the greedy ones.

    Returns:
        List with one ``(x, y)`` tuple of Python ints per batch element, where
        the flat output index of the model equals ``x * HEIGHT + y``.
    """
    if random.random() < epsilon:
        # One random action per batch element, so both branches return a list
        # of the same length.
        return [
            (random.randint(0, WEIGHT - 1), random.randint(0, HEIGHT - 1))
            for _ in range(len(state))
        ]

    # Choosing an action never needs gradients; skip building the autograd graph.
    with torch.no_grad():
        flat_indexes = run_model(state, model).cpu().numpy().argmax(1).tolist()

    # divmod undoes the x * HEIGHT + y flattening; tolist() already produced
    # plain Python ints, matching the type returned by the random branch.
    return [divmod(index, HEIGHT) for index in flat_indexes]

def get_max_q(state, model):
    """Return the largest model output per sample as a 1-D tensor.

    The maximum over the flattened outputs is used as the Q-value of the
    state; the result has one entry per batch element.
    """
    best_values, _ = run_model(state, model).max(dim=1)
    return best_values.view(-1)    # [BATCH_SIZE]

In [11]:
# Single-sample batch: prepend a batch axis to the example state.
# (The former np.stack assignment was overwritten immediately and had no effect.)
input_example = example_field[np.newaxis, ...]

In [12]:
# Smoke test: action selection on the single-sample batch with the default epsilon.
select_action(input_example, model)

[(2, 2)]

In [13]:
# Smoke test: maximum model output for the single-sample batch.
get_max_q(input_example, model)

tensor([2.9302], device='cuda:0', grad_fn=<ViewBackward0>)

In [23]:
import torch.nn.functional as F


# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99


def fit(batch, model, target_model, optimizer):
    """Run one DQN gradient step on a batch of transitions.

    Args:
        batch: Sequence ``(state, action, reward, next_state, done)`` in which
            every entry holds one value per transition; each action is an
            ``(x, y)`` pair.
        model: Online network whose weights are updated.
        target_model: Network used to evaluate the next states; it is not
            modified here.
        optimizer: Optimizer that steps the parameters of ``model``.
    """
    state, action, reward, next_state, done = batch

    # States stay NumPy arrays; run_model converts them to tensors itself.
    state = np.array(state)
    next_state = np.array(next_state)
    reward = torch.tensor(reward, dtype=torch.float32, device=device)
    action = torch.tensor(action, dtype=torch.long, device=device)
    # Force a boolean mask: an integer tensor would be treated as positions to
    # index rather than as a per-sample flag.
    done = torch.tensor(done, dtype=torch.bool, device=device)

    # Values the network is expected to output; no gradients flow through them.
    with torch.no_grad():
        # Best Q-value reachable from the next state.
        next_q = get_max_q(next_state, target_model)
        # Terminal transitions have no future value.
        next_q[done] = 0
        target_q = reward + gamma * next_q

    # Flat index of the taken action (x * HEIGHT + y), computed on the whole
    # batch at once instead of element by element in Python.
    flatten_index = (action[:, 0] * HEIGHT + action[:, 1]).unsqueeze(-1)

    q = run_model(state, model).gather(1, flatten_index)

    loss = F.mse_loss(q, target_q.unsqueeze(1))

    # Clear the gradients currently stored in the network.
    optimizer.zero_grad()

    # Backpropagate the loss.
    loss.backward()

    # Clamp every gradient element to [-1, 1] so updates do not get too large.
    # Unlike a manual loop over param.grad, this skips parameters that
    # received no gradient instead of failing on None.
    torch.nn.utils.clip_grad_value_(model.parameters(), 1.0)

    # Take the optimisation step.
    optimizer.step()

    print("model update... Ok")

In [24]:
class Memory:
    """Fixed-capacity cyclic buffer of transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, element: StateActionTransition):
        """Store an element in the cyclic buffer."""
        has_room = len(self.memory) < self.capacity
        if has_room:
            # Grow by one placeholder slot; it is filled right below.
            self.memory.append(None)
        self.memory[self.position] = element
        # Advance the write cursor, wrapping around at capacity.
        self.position += 1
        self.position %= self.capacity

    def sample(self, batch_size):
        """Return a random sample of the given size, regrouped field by field."""
        picked = random.sample(self.memory, batch_size)
        return [column for column in zip(*picked)]

    def __len__(self):
        return len(self.memory)

In [25]:
def make_reward_number(reward) -> float:
    """Reduce a reward to a single number.

    Scalars (Python or NumPy numbers, 0-d arrays) are returned unchanged;
    anything with at least one dimension yields its first element.
    """
    # np.ndim replaces the isinstance check against np.float_, an alias that
    # was removed in NumPy 2.0. It also recognises ints and float32 scalars,
    # which previously fell through to reward[0] and raised.
    if np.ndim(reward) == 0:
        return reward
    return reward[0]

In [26]:
# Quick check of the buffer: sample() regroups the stored items position by position.
mem = Memory(10)
for demo_item in ([1, 2, 3], [2, 3, 4], [3, 5, 6]):
    mem.push(demo_item)
mem.sample(3)

[(1, 3, 2), (2, 5, 3), (3, 6, 4)]

In [27]:
from tqdm import tqdm

# Most recent batch drawn from the replay buffer, kept at module level so it
# can be inspected after (or during) training.
sampled_batch = None


def _run_greedy_episode(env, model):
    """Play one episode from a fresh reset with epsilon=0; return the summed reward."""
    state = env.reset()
    done = False
    total_reward = 0
    # Evaluation only reads the network; no autograd graph is needed.
    with torch.no_grad():
        while not done:
            model_input = state[np.newaxis, ...]        # add the batch axis
            action = select_action(model_input, model, epsilon=0)[0]
            state, reward, done = env.step(action)
            total_reward += make_reward_number(reward)
    return total_reward


def train(
    env: BlackOilEnv,
    max_steps=1000,
    batch_size=8,
    target_update=10,
    epsilon=0.25,
    memory_capacity=200,
):
    """Train a DQN agent on ``env`` and return the periodic evaluation rewards.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(state, reward, done)`` and an ``observation`` attribute.
        max_steps: Number of environment steps taken during training.
        batch_size: Number of transitions sampled for each gradient step.
        target_update: Number of steps between target-network refreshes; a
            greedy evaluation episode is played after each refresh.
        epsilon: Exploration probability used while collecting experience.
        memory_capacity: Capacity of the replay buffer.

    Returns:
        List with the total reward of every greedy evaluation episode.

    Raises:
        ValueError: If the replay buffer can never hold a full batch.
    """
    global sampled_batch

    if memory_capacity < batch_size:
        raise ValueError("memory_capacity must be at least batch_size")

    # Create the networks and the replay buffer.
    memory = Memory(memory_capacity)
    model, target_model, optimizer = create_new_model(unet_model.UNet2)
    rewards_by_target_updates = []

    env.reset()
    for step in tqdm(range(max_steps)):
        state = env.observation

        # Take one step in the environment.
        model_input = state[np.newaxis, ...]        # add the batch axis
        action = select_action(model_input, model, epsilon)[0]

        new_state, reward, done = env.step(action)
        reward = make_reward_number(reward)

        # Remember the transition and restart the environment if needed.
        memory.push((state, action, reward, new_state, done))
        if done:
            env.reset()

        # Gradient step as soon as the buffer itself holds a full batch; the
        # buffer length is checked directly instead of the step counter.
        if len(memory) >= batch_size:
            sampled_batch = memory.sample(batch_size)
            fit(sampled_batch, model, target_model, optimizer)

        if (step + 1) % target_update == 0:
            target_model = copy.deepcopy(model)

            # Exploitation: one greedy episode with the refreshed target network.
            # NOTE(review): this reuses the training env, so the episode being
            # collected is cut short at every evaluation — consider a separate env.
            total_reward = _run_greedy_episode(env, target_model)
            # Leave the environment freshly reset for the next training step.
            env.reset()
            print(f"Testing... Get reward: {total_reward}")
            rewards_by_target_updates.append(total_reward)

    return rewards_by_target_updates

In [28]:
from src.envs.envs import BlackOilEnv
# Environment instance passed to train().
# NOTE(review): `days=3` presumably sets the simulated episode length — confirm in BlackOilEnv.
env = BlackOilEnv(days=3)

In [None]:
# Start training; the returned list of evaluation rewards becomes the cell output.
train(env)

  0%|          | 0/1000 [00:00<?, ?it/s]