In [93]:
import torch.nn as nn
import torch
from torch import optim
import copy
import pickle
import random
import numpy as np

from sim_data_generation import  StateActionTransition
from src.envs.envs import BlackOilEnv

# Grid dimensions of the Q-value map.
# NOTE(review): `WEIGHT` is used as the horizontal extent (80 columns) by the
# code below — presumably a typo for WIDTH; kept because every cell refers to it.
WEIGHT = 80
HEIGHT = 40

# Device that tensors built in this notebook are moved to.
device = torch.device("cuda")

In [94]:
# SECURITY: pickle.load can execute arbitrary code stored in the file —
# only open pickles that come from a trusted source.
with open("saved_results_5.pkl", "rb") as f:
    data = pickle.load(f)

# `data` is a collection of collections; flatten it into one list of records.
res = [record for group in data for record in group]

# Each record exposes a `.state` attribute; keep all states plus one sample record.
example_fields = [record.state for record in res]
example_dqn = res[1]
example_field = example_dqn.state

In [95]:
example_field

array([[[1.80579588e+00, 1.56305696e+00, 3.25635336e-12, ...,
         9.35601278e-24, 2.82256183e+00, 0.00000000e+00],
        [1.80394850e+00, 1.55744098e+00, 2.43528046e-11, ...,
         5.22175017e-22, 2.80954332e+00, 0.00000000e+00],
        [1.80210807e+00, 1.55167870e+00, 1.61212636e-10, ...,
         2.28362679e-20, 2.79629271e+00, 0.00000000e+00],
        ...,
        [1.77916384e+00, 1.39540629e+00, 3.59415496e-06, ...,
         1.22040102e-11, 2.48265640e+00, 0.00000000e+00],
        [1.77989426e+00, 1.39377292e+00, 1.01229167e-06, ...,
         9.68349180e-13, 2.48076843e+00, 0.00000000e+00],
        [1.78098566e+00, 1.39266113e+00, 2.45708847e-07, ...,
         5.70606246e-14, 2.48030950e+00, 0.00000000e+00]],

       [[1.80381408e+00, 1.55993263e+00, 1.32069747e-11, ...,
         1.54102199e-22, 2.81382845e+00, 0.00000000e+00],
        [1.80174353e+00, 1.55418424e+00, 9.87702476e-11, ...,
         8.60044887e-21, 2.80024139e+00, 0.00000000e+00],
        [1.79966098e+00, 

In [98]:
import importlib
import unet_model
# Reload the module so that changes to its source are picked up without
# restarting the kernel.
unet_module = importlib.reload(unet_model)

model = unet_model.UNet2()
model.to_cuda()

import numpy as np

# Count trainable parameters: total number of elements over all tensors
# that require gradients.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum([np.prod(p.size()) for p in model_parameters])
params

263692

In [99]:
def create_new_model(ModelClass: unet_model.UNet2):
    """Build the online network, its target network and the optimizer.

    Args:
        ModelClass: zero-argument callable (the model class) whose instances
            provide ``to_cuda()``, ``parameters()`` and ``state_dict()`` /
            ``load_state_dict()`` (assumes a ``torch.nn.Module`` subclass).

    Returns:
        Tuple ``(model, target_model, optimizer)``. The optimizer is Adam with
        ``lr=3e-4`` over the parameters of ``model`` only.
    """
    model = ModelClass()
    target_model = ModelClass()

    # The target network must start as a copy of the online network; two
    # independently initialised networks would make the first bootstrap
    # targets come from an unrelated random model.
    target_model.load_state_dict(model.state_dict())

    # Move both networks to the GPU through the model's own helper.
    model.to_cuda()
    target_model.to_cuda()

    # Optimizer used to update the weights of the online network.
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    return model, target_model, optimizer


def run_model(state, model):
    """Run `model` on a batch of states and return one flat row per sample.

    `state` is converted to a float tensor on `device`, its last axis is moved
    to position 1 (``permute(0, 3, 1, 2)``), the model is applied, and all
    output axes after the batch axis are flattened into one.
    """
    batch = torch.FloatTensor(state).to(device).permute(0, 3, 1, 2)
    return torch.flatten(model(batch), start_dim=1)


def select_action(state, model, epsilon=0.05):
    """Choose grid cells epsilon-greedily.

    Args:
        state: batch of states accepted by ``run_model``.
        model: network producing one Q-value per grid cell.
        epsilon: probability of returning a uniformly random action instead
            of the greedy one.

    Returns:
        List of ``(x, y)`` tuples of Python ints with ``0 <= x < WEIGHT`` and
        ``0 <= y < HEIGHT``. The random branch returns exactly one action
        regardless of the batch size; the greedy branch returns one action
        per batch row.
    """
    if random.random() < epsilon:
        return [(random.randint(0, WEIGHT - 1), random.randint(0, HEIGHT - 1))]

    # Action selection never needs gradients.
    with torch.no_grad():
        flat_indexes = run_model(state, model).detach().cpu().numpy().argmax(1)

    # Decode the row-major flat index (flat = y * WEIGHT + x) back into the
    # same (x, y) convention the random branch uses. Assumes the model emits
    # a [batch, 1, HEIGHT, WEIGHT] map, as the shapes inspected further down
    # in the notebook show. The previous decode
    # (index // WEIGHT, index % HEIGHT) was not the inverse of that layout.
    return [(int(flat % WEIGHT), int(flat // WEIGHT)) for flat in flat_indexes]

def get_max_q(state, model):
    """Return, for every batch row, the largest value of the flattened model output."""
    q_values = run_model(state, model)

    # The per-row maximum is the Q-value estimate of the best action.
    best_q, _ = q_values.max(1)
    return best_q.view(-1)    # [BATCH_SIZE]

In [100]:
# Add a leading batch axis so the single example can be fed to the model.
# (A previous np.stack([...]) assignment was dropped: it was overwritten
# immediately and never used.)
input_example = example_field[np.newaxis, ...]

In [101]:
select_action(input_example, model)

[(1, 3)]

In [102]:
get_max_q(input_example, model)

tensor([3.8901], device='cuda:0', grad_fn=<ViewBackward0>)

In [103]:
import torch.nn.functional as F


gamma = 0.99
def fit(batch, model, target_model, optimizer):
    """Perform one DQN gradient step on a sampled batch.

    Args:
        batch: ``(state, action, reward, next_state, done)``; every item is a
            sequence over the batch, and each action is an ``(x, y)`` pair.
        model: online network that is trained.
        target_model: network used to compute the bootstrap targets.
        optimizer: optimizer holding the parameters of ``model``.
    """
    state, action, reward, next_state, done = batch

    # States stay numpy arrays here; run_model converts them to tensors.
    state = np.array(state)
    next_state = np.array(next_state)
    reward = torch.tensor(reward).to(device).float()
    action = torch.tensor(action).to(device).long()
    # Force a boolean mask: an integer tensor would be interpreted as indexes
    # by `target_q[done]` below.
    done = torch.tensor(done).to(device).bool()

    # Compute the values the network is expected to output.
    with torch.no_grad():
        # Maximum Q-value of the next state; terminal transitions bootstrap from 0.
        target_q = get_max_q(next_state, target_model)
        target_q[done] = 0

    target_q = reward + target_q * gamma

    # Row-major index into the flattened Q-map: flat = y * WEIGHT + x.
    # Assumes the model emits a [batch, 1, HEIGHT, WEIGHT] map (see the shapes
    # inspected further down in the notebook). The earlier `y * HEIGHT + x`
    # used the wrong stride, so distinct actions collided.
    flatten_index = (action[:, 1] * WEIGHT + action[:, 0]).unsqueeze(-1)

    q = run_model(state, model).gather(1, flatten_index)

    loss = F.mse_loss(q, target_q.unsqueeze(1))

    # Clear the gradients accumulated so far.
    optimizer.zero_grad()

    # Back-propagate the error.
    loss.backward()

    # Clamp every gradient element to [-1, 1] so updates do not get too large;
    # unlike the manual loop this skips parameters whose .grad is None.
    torch.nn.utils.clip_grad_value_(model.parameters(), 1.0)

    # Take the optimisation step.
    optimizer.step()

In [104]:
class Memory:
    """Fixed-capacity replay buffer that overwrites its oldest entries."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Slot the next push will write to.
        self.position = 0

    def push(self, element: StateActionTransition):
        """Store an element in the circular buffer."""
        if len(self.memory) < self.capacity:
            # Still growing: the write slot is always the end of the list.
            self.memory.append(element)
        else:
            self.memory[self.position] = element
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return a random sample of the given size, transposed field by field."""
        picked = random.sample(self.memory, batch_size)
        return [*zip(*picked)]

    def __len__(self):
        return len(self.memory)

In [105]:
def make_reward_number(reward) -> float:
    """Reduce an environment reward to a single number.

    Scalars (Python ints/floats, any NumPy integer/floating scalar, 0-d
    arrays) are returned as a Python ``float``; anything else is treated as
    an indexable container and its first element is returned unchanged.

    The previous check relied on ``np.float_``, which no longer exists in
    NumPy 2.0, and sent ints and ``np.float32`` values down the indexing
    path where they raise.
    """
    if isinstance(reward, (int, float, np.integer, np.floating)):
        return float(reward)
    if isinstance(reward, np.ndarray) and reward.ndim == 0:
        return float(reward)
    return reward[0]

In [106]:
# Smoke test: sample() returns the stored entries transposed field by field.
mem = Memory(10)
for item in ([1, 2, 3], [2, 3, 4], [3, 5, 6]):
    mem.push(item)
mem.sample(3)

[(3, 2, 1), (5, 3, 2), (6, 4, 3)]

In [110]:
from tqdm import tqdm

# Last batch drawn by train(); exposed at module level for interactive debugging.
sampled_batch = None


def _run_greedy_episode(env, model):
    """Play one episode with epsilon=0 and return the accumulated reward."""
    # Read the state the same way the training loop does: reset, then take
    # the environment's current observation.
    env.reset()
    state = env.observation
    done = False
    total_reward = 0
    while not done:
        model_input = state[np.newaxis, ...]        # add the batch axis

        # select_action returns a list of actions; the environment takes one.
        action = select_action(model_input, model, epsilon=0)[0]

        state, reward, done = env.step(action)
        total_reward += make_reward_number(reward)
    return total_reward


def train(env: BlackOilEnv, max_steps=5000, batch_size=64, target_update=100,
          epsilon=0.25, memory_capacity=2000):
    """Train a DQN agent on `env` and return the periodic evaluation rewards.

    Args:
        env: environment providing ``reset()``, ``observation`` and
            ``step(action) -> (state, reward, done)``.
        max_steps: number of environment steps to train for.
        batch_size: number of transitions per gradient step.
        target_update: number of steps between target-network refreshes
            (a greedy evaluation episode runs at each refresh).
        epsilon: exploration probability during training.
        memory_capacity: size of the replay buffer.

    Returns:
        List with the total reward of each greedy evaluation episode.

    Raises:
        ValueError: if ``memory_capacity`` is smaller than ``batch_size``
            (a batch could never be sampled).
    """
    global sampled_batch

    if memory_capacity < batch_size:
        raise ValueError("memory_capacity must be at least batch_size")

    # Create the networks and the replay buffer.
    memory = Memory(memory_capacity)
    model, target_model, optimizer = create_new_model(unet_model.UNet2)
    rewards_by_target_updates = []

    env.reset()
    for step in tqdm(range(max_steps)):
        state = env.observation

        # Take one step in the environment.
        model_input = state[np.newaxis, ...]        # add the batch axis
        action = select_action(model_input, model, epsilon)[0]

        new_state, reward, done = env.step(action)
        reward = make_reward_number(reward)

        # Store the transition and restart the environment when the episode ends.
        memory.push((state, action, reward, new_state, done))
        if done:
            env.reset()

        # Gradient step once enough transitions are available.
        if len(memory) >= batch_size:
            sampled_batch = memory.sample(batch_size)
            fit(sampled_batch, model, target_model, optimizer)

        if (step + 1) % target_update == 0:
            target_model = copy.deepcopy(model)

            # Exploitation: evaluate the refreshed target network greedily.
            total_reward = _run_greedy_episode(env, target_model)

            # Start the next training episode from a fresh environment.
            env.reset()
            print(f"Testing... Get reward: {total_reward}")
            rewards_by_target_updates.append(total_reward)

    return rewards_by_target_updates

In [115]:
from src.envs.envs import BlackOilEnv
# Environment instance handed to train(); `days=3` is forwarded to the constructor as-is.
env = BlackOilEnv(days=3)

In [None]:
# Bind the evaluation rewards to a name: the run takes hours, and a bare call
# would leave the result only in the kernel's output history.
rewards_by_target_updates = train(env)
rewards_by_target_updates

  0%|          | 25/5000 [02:25<7:45:31,  5.61s/it]

In [30]:
49 * WEIGHT + HEIGHT

3960

In [338]:
tmp

tensor([[-0.6131,  0.9447, -0.7776,  ..., -0.7172, -2.0972, -3.4451],
        [-0.6149,  0.9476, -0.7789,  ..., -0.7172, -2.0973, -3.4451]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)

In [339]:
tmp2

tensor([4358,  198], device='cuda:0')

In [344]:
tmp2.shape

torch.Size([2])

In [348]:
tmp.shape

torch.Size([2, 3200])

In [352]:
tmp2 =

In [353]:
tmp.gather(0, tmp2)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [346]:
tmp.shape

torch.Size([2, 3200])

In [284]:
tmp

tensor([[-0.2551, -0.2551, -0.4403,  ..., -1.5179, -0.0023, -0.5512],
        [-0.2551, -0.2551, -0.4403,  ..., -1.5179, -0.0023, -0.5512]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)

In [287]:
tmp2

tensor([[ 3, 27],
        [ 3, 27]], device='cuda:0')

In [286]:
tmp.shape

torch.Size([2, 3200])

In [240]:
tmp.shape

torch.Size([2, 1, 40, 80])

In [241]:
tmp2

tensor([[47,  2],
        [74,  2]], device='cuda:0')

In [None]:
tmp.ga

In [None]:
.gather(1, action.unsqueeze(1))

In [171]:
sampled_batch

[(array([[[1.70479845e+00, 1.44145642e+00, 1.49586320e-13, ...,
           2.06077189e-26, 2.45739268e+00, 0.00000000e+00],
          [1.69915830e+00, 1.43821367e+00, 7.21193420e-13, ...,
           4.78113715e-25, 2.44375270e+00, 0.00000000e+00],
          [1.69374213e+00, 1.43540000e+00, 2.99446835e-12, ...,
           8.23033150e-24, 2.43119745e+00, 0.00000000e+00],
          ...,
          [1.52183354e+00, 1.67589795e+00, 2.46495097e-16, ...,
           4.71720555e-32, 2.55043771e+00, 0.00000000e+00],
          [1.52151019e+00, 1.69245141e+00, 8.47813235e-17, ...,
           5.54770486e-33, 2.57508207e+00, 0.00000000e+00],
          [1.52125012e+00, 1.70843928e+00, 2.56838963e-17, ...,
           5.06255724e-34, 2.59896346e+00, 0.00000000e+00]],
  
         [[1.70209903e+00, 1.44059180e+00, 1.62220348e-12, ...,
           2.41584525e-24, 2.45202990e+00, 0.00000000e+00],
          [1.69622813e+00, 1.43725424e+00, 7.82131066e-12, ...,
           5.60464293e-23, 2.43791106e+00, 0.0000

In [193]:
# Scratch cell: reproduces the target computation of fit() step by step on the
# batch that train() last stored in the module-level `sampled_batch`.
state, action, reward, next_state, done = sampled_batch

# Move the batch to the device selected earlier
# [BATCH_SIZE, ...]

# states are converted inside the helper functions
# state = torch.tensor(state).to(device).float()
# next_state = torch.tensor(next_state).to(device).float()

state = np.array(state)
next_state = np.array(next_state)
reward = torch.tensor(reward).to(device).float()
action = torch.tensor(action).to(device)
done = torch.tensor(done).to(device)

print("next state: ", next_state.shape)
# Compute the values the network is expected to output
# target_q = torch.zeros(reward.size()[0]).float().to(device)
with torch.no_grad():
    # Take the maximum Q-function value for the next state
    model_input = next_state
    print(model_input.shape)

    # NOTE(review): fit() evaluates `target_model` here; this cell uses the
    # module-level `model` instead — confirm that is intended.
    target_q = get_max_q(model_input, model)
    # Terminal transitions bootstrap from 0.
    target_q[done] = 0
target_q = reward + target_q * gamma

next state:  (2, 40, 80, 8)
(2, 40, 80, 8)


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [195]:
target_q.device

NameError: name 'target_q' is not defined

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 5 is not equal to len(dims) = 4

In [161]:
# Scratch cell: unpack the batch last stored by train() and convert the
# non-state fields to tensors on `device`.
state, action, reward, next_state, done = sampled_batch

# Move the batch to the device selected earlier
# [BATCH_SIZE, ...]

# states are converted inside the helper functions
# state = torch.tensor(state).to(device).float()
# next_state = torch.tensor(next_state).to(device).float()

reward = torch.tensor(reward).to(device).float()
action = torch.tensor(action).to(device)
done = torch.tensor(done).to(device)

In [164]:
len(sampled_batch[0])

2

In [None]:

# with torch.no_grad():
#     # Take the maximum Q-function value for the next state
#     model_input = next_state[np.newaxis, ...]
#     target_q = get_max_q(model_input, target_model)
#     target_q[done] = 0
# target_q = reward + target_q * gamma