In [0]:
import numpy as np
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
import cv2
import json
import glob

import matplotlib.pyplot as plt
%matplotlib inline

from gym.spaces import Discrete, Box

import torch
import torchvision

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import imageio

import pickle

from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


# Environment

In [0]:
###
# Global experiment configuration.
# Number of cells in the 1-D segment; also sizes the navigation network's input/output.
SEGMENT_LENGTH = 20
# NOTE(review): not referenced anywhere in the visible notebook — the environment
# is later constructed with a literal expl_cost=0.05. Confirm which value is intended.
EXPLORATION_COST = 0.1
# Scale applied by `oracle` to its per-position probabilities.
# Evaluates to 1.0 for SEGMENT_LENGTH = 20.
PRED_REWARD = SEGMENT_LENGTH*0.1/2
###

In [0]:
def generate_a_segment(length=50, noise=0.0, free_location=False):
  """Sample a random binary signal of `length` cells, optionally with noise.

  free_location=False: zeros followed by ones; the first 1 sits at a random
  index `right` drawn from [1, length-1].
  free_location=True: a block of ones covering [left, right) surrounded by
  zeros, where the two bounds come from two independent draws in [1, length-2]
  (the larger draw is incremented by one to form the exclusive end `right`).

  Uniform noise in [0, noise) is added to every cell (the random draw happens
  even when noise is 0).

  Returns:
    (segment, right): the float array of shape (length,) and `right` as a float.
  """
  cells = np.arange(length)
  if free_location:
    first_draw = np.random.randint(1, length-1)
    second_draw = np.random.randint(1, length-1)
    left = min(first_draw, second_draw)
    right = max(first_draw, second_draw) + 1
    clean = ((cells >= left) & (cells < right)).astype(int)
  else:
    right = np.random.randint(1, length)
    clean = (cells >= right).astype(int)
  jitter = noise * np.random.random(length)
  return clean + jitter, float(right)

In [0]:
class ImgEnv(object):
    """1-D exploration environment over a hidden binary segment.

    The agent repeatedly picks a cell to look at (paying `expl_cost`) and
    finally commits to a predicted index; the episode ends on that prediction,
    which earns `pred_reward` when it equals the segment's `target`.

    `self.state` is an int array of shape (2, segment_length):
    row 0 marks the cells observed so far, row 1 holds the observed values.
    Because the array is integer-typed, noisy values are truncated when they
    are copied into row 1.
    """

    def __init__(self, max_steps=1000, window=1, segment_length=50, noise=0.0, free_location=False,
                 expl_cost=0.05, pred_reward=1):
        self.pred_reward = pred_reward
        self.free_location = free_location
        self.noise = noise
        self.segment_length = segment_length
        # NOTE(review): `step` only distinguishes action[0] == 0 from anything else,
        # so the first component is effectively binary — confirm whether
        # Discrete(2) was intended here.
        self.action_space = (Discrete(self.segment_length), Discrete(self.segment_length))
        self.observation_space = Discrete(self.segment_length)
        # Number of neighbours revealed on each side of the inspected cell.
        self.window = window
        self.max_steps = max_steps
        # One RGB row per step, filled by get_frame() and rendered by draw().
        self.to_draw = np.zeros((self.max_steps+1, 1, self.segment_length, 3)).astype(int)
        self.expl_cost = expl_cost

    def seed(self, seed):
        """Seed NumPy's global RNG, which drives segment generation."""
        np.random.seed(seed)

    def reset(self, NEXT=True):
        """Start a new episode and return the (all-zero) initial state.

        NEXT: when true a fresh segment is generated; when false the current
        segment is kept and only the observations are cleared.
        """
        self.pos = None
        self.num_steps = 0
        if NEXT:
            self.curr_img, self.target = generate_a_segment(
                length=self.segment_length, noise=self.noise, free_location=self.free_location)

        self.state = np.zeros((2, self.segment_length)).astype(int)
        return self.state

    def _true_image(self):
        """RGB row of the full segment: blue where the cell is 1, red where it is 0."""
        true_image = np.zeros((1, self.segment_length, 3)).astype(int)
        true_image[:, self.curr_img==1, 2] = 255
        true_image[:, self.curr_img==0, 0] = 255
        return true_image

    def get_frame(self,t,pred=None):
        """Store in `to_draw[t]` an RGB row showing what the agent knows.

        Unobserved cells are grey, observed cells take their true colour, the
        last inspected cell is brightened and, if given, the predicted cell
        `pred` gets its green channel boosted.
        """
        true_image = self._true_image()

        segment_plot = np.zeros((1, self.segment_length, 3)).astype(int)+128
        observed = self.state[0]==1
        segment_plot[:, observed, :] = true_image[:, observed, :]
        if self.pos is not None:
            segment_plot[:, self.pos, :] = np.clip(segment_plot[:, self.pos, :] +170, 0, 255)
        if pred is not None:
            segment_plot[:, pred, 1] = np.clip(segment_plot[:, pred, 1] + 170, 0, 255)

        self.to_draw[t,:,:,:] = segment_plot

    def draw(self,e):
        """Render the recorded frames of the current episode to './<e>.gif'.

        Each GIF frame stacks the agent's view (top row) over the true segment
        (bottom row), separated by a horizontal line.
        """
        true_image = self._true_image()

        image_list = []
        for s_plot in self.to_draw[:self.num_steps]:
            fig = plt.figure()
            plt.yticks([])
            plt.imshow(np.vstack([s_plot, true_image]))
            plt.hlines(0.5, -0.5, self.segment_length-0.5)
            fig.canvas.draw()
            # NOTE(review): `tostring_rgb` is deprecated in recent Matplotlib
            # releases — confirm the installed version still provides it.
            image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
            image  = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
            plt.close(fig)
            image_list.append(image)
        imageio.mimsave('./{}.gif'.format(e), image_list, fps=1)

    def step(self, action):
        """Apply `action` and return `(state, reward, done)`.

        action: a pair `(kind, index)` — a 2-element tensor, array, list or tuple.
          * kind == 0: inspect cell `index`; the cells within `window` of it are
            revealed in the state, the reward is `-expl_cost` and the episode
            continues.
          * any other kind: predict `index` as the target; the reward is
            `pred_reward` if it matches `self.target` (0 otherwise) and the
            episode ends.

        The returned state is `self.state` itself, which is updated in place.

        Raises:
          IndexError: if `index` is outside [0, segment_length).
        """
        # Convert to plain ints so tensors, NumPy scalars and Python ints all
        # work (the comparison result previously had to expose `.item()`).
        kind, index = int(action[0]), int(action[1])
        if not 0 <= index < self.segment_length:
            raise IndexError("action index {} is outside [0, {})".format(index, self.segment_length))

        if kind == 0:
            self.pos = index
            lo = max(index - self.window, 0)
            hi = min(index + self.window + 1, self.segment_length)
            self.state[1, lo:hi] = self.curr_img[lo:hi]
            self.state[0, lo:hi] = 1
            reward = -self.expl_cost
            done = False
            pred = None
        else:
            reward = bool(self.target == index) * self.pred_reward
            done = True
            pred = index

        self.get_frame(int(self.num_steps), pred=pred)
        self.num_steps += 1
        return self.state, reward, done

    def get_current_obs(self):
        """Return the current observation array (same object as `self.state`)."""
        return self.state

    def close(self):
        """No resources to release."""
        pass

# Agent

In [0]:
class Agent(object):
    """Base class for epsilon-greedy agents acting on a 1-D segment.

    Subclasses provide `learned_act` (and usually `reinforce`); this class
    only implements the epsilon-greedy action selection.
    """

    def __init__(self, epsilon, segment_length, random_can_stop=False):
        # Probability of taking a random action while training.
        self.epsilon = epsilon
        self.segment_length = segment_length
        # Whether a random action may be a "stop and predict" action.
        self.random_can_stop = random_can_stop

    def set_epsilon(self,e):
        """Replace the exploration probability."""
        self.epsilon = e

    def act(self,s,train=True, must_stop=False):
        """Choose the next action for state `s`.

        Returns a long tensor `[kind, index]` on `device`, where kind 0 means
        "inspect cell `index`" and kind 1 means "stop and predict `index`".

        train: enables epsilon-random exploration.
        must_stop: forces kind 1 (both for random and greedy actions).
        """
        if train and np.random.rand() <= self.epsilon:
            if must_stop:
                kind = 1
            elif self.random_can_stop:
                kind = random.randrange(2)
            else:
                kind = 0
            return torch.tensor([kind, random.randrange(self.segment_length)], device=device, dtype=torch.long)

        a = self.learned_act(s)
        if must_stop:
            # Best index among the "predict" values only.
            return torch.tensor([1, int(torch.argmax(a[:, 1, :]))], device=device, dtype=torch.long)

        # argmax works on the flattened tensor; split the flat index with exact
        # integer arithmetic instead of tensor `/`, whose result type for
        # integer tensors differs between PyTorch versions.
        kind, index = divmod(int(torch.argmax(a)), self.segment_length)
        return torch.tensor([kind, index], device=device, dtype=torch.long)

    def learned_act(self,s):
        """Return the policy's action values for state `s`.

        `act` indexes the result as `a[:, 1, :]` and splits its flat argmax by
        `segment_length`. No-op in the base class.
        """
        pass

    def reinforce(self, s, n_s, a, r, game_over_):
        """Learn from one transition. No-op in the base class.

        NOTE(review): the parameter order here (s, n_s, a, ...) differs from
        `DQN_separated_net.reinforce` and from the positional call in `train`,
        which both use (state, action, next_state, reward, game_over).
        """
        pass

    def save(self):
        """Persist the agent (stats and/or model). No-op in the base class."""
        pass

    def load(self):
        """Restore a previously saved agent. No-op in the base class."""
        pass

In [0]:
# Memory Replay
from collections import namedtuple
import random

# One stored experience: (state, action, next_state, reward, game_over).
Transition = namedtuple(
    'Transition',
    ('state', 'action', 'next_state', 'reward', 'game_over'),
)


class ReplayMemory(object):
    """Fixed-capacity ring buffer of `Transition` tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next transition is written to.
        self.position = 0

    def __len__(self):
        return len(self.memory)

    def remember(self, *args):
        """Store a transition, overwriting the oldest one once the buffer is full."""
        slot = self.position
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` stored transitions.

        Sampling is without replacement when enough transitions are stored,
        and with replacement otherwise.
        """
        if batch_size > self.__len__():
            return random.choices(self.memory, k=batch_size)
        return random.sample(self.memory, batch_size)

In [0]:

import torch
import torchvision

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import imageio

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class nav_and_pred_model(nn.Module):
  """Joint network producing both "inspect" and "predict" values.

  Maps a state batch of shape (batch, 2, segment_length) to a tensor of shape
  (batch, 2, segment_length).

  segment_length: number of cells per state; defaults to the notebook-wide
    SEGMENT_LENGTH. The layer sizes used to be hard-coded for a length of 20
    (60 -> 60 -> 40), which is what this yields when SEGMENT_LENGTH is 20.
  """
  def __init__(self, segment_length=None):
    super(nav_and_pred_model, self).__init__()
    if segment_length is None:
      segment_length = SEGMENT_LENGTH
    # conv3 emits 3 channels, so the flattened feature vector has 3 * length entries.
    flat_features = 3 * segment_length
    self.conv1 = nn.Conv1d(2, 64, kernel_size=3, padding=1)
    self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
    self.conv3 = nn.Conv1d(64, 3, kernel_size=3, padding=1)
    self.linear1 = nn.Linear(flat_features, flat_features)
    # Two output rows of `segment_length` values each.
    self.linear2 = nn.Linear(flat_features, 2 * segment_length)

  def forward(self, x):
    x = F.relu(self.conv1(x))
    x = F.relu(self.conv2(x))
    x = F.relu(self.conv3(x))
    # Flatten channels and positions; padding=1 keeps the length unchanged.
    x = x.view(x.size(0), -1)
    # The two linear layers are applied back to back without a non-linearity.
    x = self.linear1(x)
    x = self.linear2(x)
    return x.view(x.size(0), 2, -1)

class navigation_model(nn.Module):
  """Network scoring every cell as a candidate for the next inspection.

  Maps a state batch of shape (batch, 2, SEGMENT_LENGTH) to values of shape
  (batch, 1, SEGMENT_LENGTH): three length-preserving 1-D convolutions with
  ReLU, followed by one linear layer over the flattened features.
  """
  def __init__(self):
    super(navigation_model, self).__init__()
    channels = 64
    self.conv1 = nn.Conv1d(2, channels, kernel_size=3, padding=1)
    self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
    self.conv3 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
    self.linear1 = nn.Linear(channels*SEGMENT_LENGTH, SEGMENT_LENGTH)

  def forward(self, x):
    for conv in (self.conv1, self.conv2, self.conv3):
      x = F.relu(conv(x))
    batch = x.size(0)
    scores = self.linear1(x.view(batch, -1))
    # Singleton middle axis so the result can be concatenated with the
    # per-position prediction values along dim 1.
    return scores.view(batch, 1, -1)
  
class assembled_model(nn.Module):
  """Parameter-free wrapper that forwards its input to `navigation_model`.

  NOTE(review): `forward` resolves `navigation_model` as a global at call time.
  Later in this cell that name is rebound from the class to an instance, so the
  call below runs that instance's forward pass; before the rebinding it would
  try to construct the class with `s` as an argument instead.
  """
  def __init__(self, oracle=True):
    # `oracle` is accepted but not used anywhere in this class.
    super(assembled_model, self).__init__()
    
  def forward(self, s):
    #print(s.shape)
    x = navigation_model(s)
    #print(x.shape)
    return x

# Instantiate the networks.
# NOTE: the next two lines rebind the class names `navigation_model` and
# `assembled_model` to instances, so the classes are no longer reachable under
# those names afterwards. Other cells use `navigation_model` as the instance.
model_full = nav_and_pred_model()
navigation_model = navigation_model()
assembled_model = assembled_model()

# Move the modules to the GPU when one is available.
if use_cuda:
  print('Using GPU')
  model_full.cuda()
  navigation_model.cuda()
  assembled_model.cuda()
else:
  print('Using CPU')
  
# One RMSprop optimizer per trainable network; only `optimizer_navigation` is
# stepped by DQN_separated_net.reinforce in the visible notebook.
optimizer_full = optim.RMSprop(model_full.parameters(), lr=1e-3)
optimizer_navigation = optim.RMSprop(navigation_model.parameters(), lr=1e-5)

Using GPU


In [0]:
def oracle_array(state):
    """Uniform distribution over the positions still compatible with `state`.

    state: array of shape (2, segment_length); row 0 flags observed cells and
      row 1 holds the observed values.

    The candidates are the cells after the last observed 0 (index 0 if none was
    observed) up to and including the first cell whose value is 1 (the last
    index if there is none). Each candidate gets weight 1/number_of_candidates,
    every other cell gets 0. Returns a float tensor on `device`.
    """
    n_cells = state.shape[-1]
    values, seen = state[1, :], state[0, :]

    ones_at = np.where(values == 1)[0]
    seen_zeros_at = np.where((values == 0) & (seen == 1))[0]

    upper = ones_at[0] if len(ones_at) != 0 else n_cells - 1
    lower = seen_zeros_at[-1] if len(seen_zeros_at) != 0 else 0
    n_candidates = upper - lower

    weights = [0.] * (lower + 1)
    weights += [1.] * n_candidates
    weights += [0.] * (n_cells - upper - 1)
    return torch.tensor(weights, device=device) / n_candidates

def oracle(s_batch):
    """Apply `oracle_array` to every state of a batch and scale by PRED_REWARD.

    s_batch: tensor of states; it is copied to the CPU and iterated along its
      first axis. Returns one row per state.
    """
    states = s_batch.cpu().numpy()
    per_state = [oracle_array(single) for single in states]
    return torch.stack(per_state, dim=0) * PRED_REWARD

In [0]:
# sanity check of the oracle on hand-written states
# `ex` is a batch of two states in which only 1-valued cells have been observed
# (the mask row equals the value row), so no observed 0 narrows the candidates.
ex = np.array([
    
    [[0,0,0,1,0,1],
     [0,0,0,1,0,1]],
    [[0,0,1,0,0,1],
     [0,0,1,0,0,1]]
    
])
# `ex2` is a single state: cells 0 and 1 were observed as 0, cells 3 and 4 as 1,
# which leaves cells 2 and 3 as candidates (0.5 each).
ex2 = np.array([
    
    [1,1,0,1,1,0],
    [0,0,0,1,1,0]
    
])

# NOTE(review): the stored output below (1.6667 over three candidates) implies
# PRED_REWARD was 5 when this cell was last run, not the 1.0 that the constants
# cell currently evaluates to.
print(oracle(torch.tensor(ex, device=device)))
oracle_array(ex2)

tensor([[0.0000, 1.6667, 1.6667, 1.6667, 0.0000, 0.0000],
        [0.0000, 2.5000, 2.5000, 0.0000, 0.0000, 0.0000]], device='cuda:0')


tensor([0.0000, 0.0000, 0.5000, 0.5000, 0.0000, 0.0000], device='cuda:0')

In [0]:
class DQN_separated_net(Agent):
    """DQN agent with a learned navigation head and an oracle prediction head.

    The values of "inspect" actions come from the module-level
    `navigation_model`; the values of "stop and predict" actions come from the
    hand-written `oracle`. Only `navigation_model` is trained, through the
    module-level `optimizer_navigation`, so several agent instances share the
    same network weights.
    """

    def __init__(self, epsilon=0.3, segment_length=50, memory_size=300, batch_size = 16):
        super(DQN_separated_net, self).__init__(epsilon=epsilon, segment_length=segment_length, random_can_stop=False)

        # Replay buffer of past transitions.
        self.memory = ReplayMemory(memory_size)

        # Number of transitions sampled for each learning step.
        self.batch_size = batch_size

    def _action_values(self, s):
        """Concatenate navigation values (row 0) and oracle values (row 1) along dim 1."""
        return torch.cat([navigation_model(s), oracle(s).unsqueeze(1)], 1)

    def learned_act(self, s, pred_oracle=True, requ_grad=False):
        """Return action values of shape (batch, 2, segment_length) for states `s`.

        requ_grad: when false the computation runs under `torch.no_grad()`.
        pred_oracle: only the oracle-based prediction head exists.

        Raises:
          NotImplementedError: if `pred_oracle` is false (this used to return
            None silently, which only failed later in the caller).
        """
        if not pred_oracle:
            raise NotImplementedError("prediction without the oracle is not implemented")
        if requ_grad:
            return self._action_values(s)
        with torch.no_grad():
            return self._action_values(s)

    def reinforce(self, s_, a_, n_s_, r_, game_over_):
        """Store one transition, then run one Q-learning step on a sampled batch.

        Arguments are batched tensors for a single transition: state, action,
        next state, reward and terminal flag. Targets are undiscounted:
        reward + max over next-state action values. Terminal transitions are
        excluded from the loss; in this notebook they are the prediction
        actions, whose values come from the non-trainable oracle.

        Returns the loss as a float, or 0.0 when the sampled batch holds only
        terminal transitions and no update is performed.
        """
        self.memory.remember(s_, a_, n_s_, r_, game_over_)

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # True for transitions that did not end the episode.
        non_final_mask = torch.logical_not(torch.cat(batch.game_over)).to(device)
        if not bool(non_final_mask.any()):
            # The loss would be a mean over an empty selection, i.e. NaN.
            return 0.0

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).view(-1, 2).long()
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]

        # Value of the action actually taken in each sampled state.
        state_values = self.learned_act(state_batch, requ_grad=True)
        rows = torch.arange(state_values.size(0), device=state_values.device)
        state_action_values = state_values[rows, action_batch[:, 0], action_batch[:, 1]]

        # Best achievable value from each non-terminal next state (no gradient).
        next_state_values = torch.zeros(len(transitions), device=device)
        next_values = self.learned_act(non_final_next_states, requ_grad=False)
        next_state_values[non_final_mask] = next_values.max(2)[0].max(1)[0]
        expected_state_action_values = next_state_values + reward_batch

        loss = F.smooth_l1_loss(state_action_values[non_final_mask],
                                expected_state_action_values[non_final_mask])

        # Optimize the navigation network, clamping each gradient entry to
        # [-1, 1] to avoid exploding updates.
        optimizer_navigation.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(navigation_model.parameters(), 1)
        optimizer_navigation.step()
        return float(loss)

    # save()/load() are not implemented for this agent; the no-op versions
    # inherited from Agent apply.

In [0]:
# sanity check: action values for a completely unobserved state.
# `state_test` is also read by `train`, which prints one of its Q-values as a
# progress probe.
state_test = torch.zeros((1, 2, SEGMENT_LENGTH), device=device)
#state_test[0, :, int(SEGMENT_LENGTH/2):SEGMENT_LENGTH] = torch.ones((2, SEGMENT_LENGTH-int(SEGMENT_LENGTH/2)), device=device)


#state_test[0, 0, 0:5] = torch.ones(5, device=device)

print(state_test)
# `agent` is only created by the training cells further down, so referencing it
# here fails on a fresh kernel. Build a throwaway agent instead; its values
# come from the module-level navigation network, not from per-agent state.
DQN_separated_net(segment_length=SEGMENT_LENGTH).learned_act(state_test)#[0][0][10].item()

# Train & Test

In [0]:
def train(agent,env,epoch,prefix='', draw=False, next_after_e=True,
          save_dir="/content/drive/My Drive/CEPchallenge"):
    """Train `agent` on `env` for `epoch` episodes and pickle the statistics.

    agent: provides `act`, `reinforce` and `learned_act`.
    env: provides `reset`, `step`, `draw`, `num_steps` and `max_steps`.
    prefix: tag used in the GIF and pickle file names.
    draw: when true, one GIF per episode is rendered through `env.draw`.
    next_after_e: forwarded to `env.reset(NEXT=...)`.
    save_dir: directory receiving `rewards_<prefix>.pickle` and
      `losses_<prefix>.pickle` (defaults to the original Drive folder).

    Every 50 episodes a progress line is printed together with one action
    value of the module-level `state_test` probe state.
    """
    loss_list = []
    reward_list = []
    loss = 0
    for e in range(epoch):
        # Each episode starts from a freshly reset environment.
        state = env.reset(NEXT=next_after_e)
        state = torch.tensor(state, device=device, dtype=torch.float).unsqueeze(0)
        # This assumes that the games will terminate
        game_over = False

        tot_reward = 0
        tot_loss = 0
        while not game_over:
            # Force a prediction once the step budget is exhausted.
            action = agent.act(state, must_stop=env.num_steps >= env.max_steps)
            prev_state = state
            state, reward, game_over = env.step(action)
            tot_reward += reward

            # Batch-of-one tensors for the replay memory.
            state = torch.tensor(state, device=device, dtype=torch.float).unsqueeze(0)
            reward = torch.tensor(reward, device=device, dtype=torch.float).unsqueeze(0)
            game_over_tensor = torch.tensor(game_over, device=device).unsqueeze(0)

            # Apply the reinforcement strategy
            loss = agent.reinforce(prev_state, action, state, reward, game_over_tensor)
            tot_loss += loss

        # Render once per episode. This used to run after every step, each
        # time overwriting the same file with a longer GIF.
        if draw:
            env.draw(prefix+str(e))

        # Update stats
        loss_list.append(tot_loss/env.num_steps)
        reward_list.append(tot_reward)

        if e%50 ==0:
            print("Epoch {:03d}/{:03d} | Loss {:.4f} | reward {}".format(e, epoch, loss, tot_reward))
            # Progress probe: value of inspecting cell 10 in `state_test`.
            print(agent.learned_act(state_test)[0][0][10].item())

    with open("{}/rewards_{}.pickle".format(save_dir, prefix), 'wb') as handle:
        pickle.dump(reward_list, handle)

    with open("{}/losses_{}.pickle".format(save_dir, prefix), 'wb') as handle:
        pickle.dump(loss_list, handle)
        
        
def test(agent, env,epochs, prefix='', next_after_e=True, draw=False, path_save='/drive/My Drive/CEPchallenge/gif_segment/'):
    # Number of won games
    score = 0
    loss_list = []
    reward_list = []
    for e in range(epochs):
        state = env.reset(NEXT=next_after_e)
        state = torch.tensor(state, device=device, dtype=torch.float).unsqueeze(0)
        # This assumes that the games will terminate
        game_over = False
        tot_reward = 0
        tot_loss = 0
        while not game_over:
            must_stop = env.num_steps >= env.max_steps
            action = agent.act(state, must_stop=must_stop, train=False)
            state, reward, game_over = env.step(action)
            
            tot_reward += reward
            
            #action.view(1, 2, -1) #########
            state = torch.tensor(state, device=device, dtype=torch.float).unsqueeze(0)
            reward = torch.tensor(reward, device=device, dtype=torch.float).unsqueeze(0)
            game_over_tensor = torch.tensor(game_over, device=device).unsqueeze(0)
                        
        # Save as a mp4
        if draw:
            env.draw(path_save + prefix + str(e))
    
        # Update stats
        score = score + tot_reward
        
        loss_list.append(tot_loss/env.num_steps)
        reward_list.append(tot_reward)
        print("epoch {} : total reward = {}".format(e, tot_reward))
      
        with open("/content/drive/My Drive/CEPchallenge/rewards_test_{}.pickle".format(prefix), 'wb') as handle:
            pickle.dump(reward_list, handle)
  
        with open("/content/drive/My Drive/CEPchallenge/losses_test_{}.pickle".format(prefix), 'wb') as handle:
            pickle.dump(loss_list, handle)
    print('Final score: '+str(score/epochs))

In [0]:
#SEGMENT_LENGTH

# Replay settings shared by every agent created below.
batch_size = 16
memory_size = 50

# Initialize the environment.
# window=0 reveals only the inspected cell. The step budget is SEGMENT_LENGTH+5.
# NOTE(review): expl_cost and pred_reward are literals here rather than the
# EXPLORATION_COST / PRED_REWARD constants defined at the top.
env = ImgEnv(max_steps=SEGMENT_LENGTH+5, window=0, segment_length=SEGMENT_LENGTH, expl_cost=0.05,
            pred_reward=1.)

env.reset()

# Initialize the agent!
#agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.1, memory_size=50)

# Staged training with a decreasing epsilon (0.3 -> 0.2 -> 0.1).
# Each stage builds a new agent, i.e. a new, empty replay memory; the network
# itself is the module-level `navigation_model`, so its weights carry over
# from one stage to the next.
agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.3, memory_size=memory_size, batch_size=batch_size)
train(agent,env,2000,prefix='DQN0_to_2000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN0_to_2000', next_after_e=True, draw=True)

agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.2, memory_size=memory_size, batch_size=batch_size)
train(agent,env,3000,prefix='DQN2000_to_5000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN2000_to_5000', next_after_e=True, draw=True)

agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.1, memory_size=memory_size, batch_size=batch_size)
train(agent,env,10000,prefix='DQN5000_to_15000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN5000_to_15000', next_after_e=True, draw=True)

In [0]:
# Single 2000-episode run with epsilon=0.3, evaluated on 5 episodes.
# Relies on `env`, `memory_size` and `batch_size` from the previous cell, and
# continues from whatever weights `navigation_model` currently holds.
agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.3, memory_size=memory_size, batch_size=batch_size)
train(agent,env,2000,prefix='DQN', next_after_e=True, draw=False)
test(agent,env,5,prefix='DQN', next_after_e=True, draw=True)

epoch 0 : total reward = -10.499999999999979
epoch 1 : total reward = -10.499999999999979
epoch 2 : total reward = -10.499999999999979
epoch 3 : total reward = -10.499999999999979
epoch 4 : total reward = -10.499999999999979
Final score: -10.499999999999979


In [0]:
# Same staged schedule as the earlier training cell, preceded by an evaluation
# before any training in this cell (prefix 'DQN0').
# NOTE(review): apart from that first `test` call, this duplicates the earlier
# cell and reuses the same prefixes, so its pickles and GIFs overwrite the
# earlier ones.
agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.3, memory_size=memory_size, batch_size=batch_size)

test(agent,env,20,prefix='DQN0', next_after_e=True, draw=True)

train(agent,env,2000,prefix='DQN0_to_2000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN0_to_2000', next_after_e=True, draw=True)

agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.2, memory_size=memory_size, batch_size=batch_size)
train(agent,env,3000,prefix='DQN2000_to_5000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN2000_to_5000', next_after_e=True, draw=True)

agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.1, memory_size=memory_size, batch_size=batch_size)
train(agent,env,10000,prefix='DQN5000_to_15000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN5000_to_15000', next_after_e=True, draw=True)

epoch 0 : total reward = -0.05
epoch 1 : total reward = -0.05
epoch 2 : total reward = -0.05
epoch 3 : total reward = -0.05
epoch 4 : total reward = -0.05
epoch 5 : total reward = -0.05
epoch 6 : total reward = -0.05
epoch 7 : total reward = -0.05
epoch 8 : total reward = -0.05
epoch 9 : total reward = -0.05
epoch 10 : total reward = -0.05
epoch 11 : total reward = -0.05
epoch 12 : total reward = -0.05
epoch 13 : total reward = -0.05
epoch 14 : total reward = -0.05
epoch 15 : total reward = -0.05
epoch 16 : total reward = -0.05
epoch 17 : total reward = -0.05
epoch 18 : total reward = -0.05
epoch 19 : total reward = -0.05
Final score: -0.05000000000000001
Epoch 000/2000 | Loss 0.0003 | reward -0.05
-0.0335487425327301




Epoch 050/2000 | Loss 0.0055 | reward 0.0
-0.01738676428794861
Epoch 100/2000 | Loss 0.0003 | reward -0.05
-0.00027565378695726395
Epoch 150/2000 | Loss 0.0003 | reward -0.15000000000000002
0.018405666574835777
Epoch 200/2000 | Loss 0.0027 | reward -0.15000000000000002
0.05279618501663208
Epoch 250/2000 | Loss 0.0009 | reward -0.05
0.09053246676921844
Epoch 300/2000 | Loss 0.0038 | reward -0.05
0.07119772583246231
Epoch 350/2000 | Loss 0.0009 | reward -0.05
0.07513059675693512
Epoch 400/2000 | Loss 0.0025 | reward 0.9
0.08406300842761993
Epoch 450/2000 | Loss 0.0006 | reward -0.25
0.06295250356197357
Epoch 500/2000 | Loss 0.0042 | reward 0.95
0.0986301526427269
Epoch 550/2000 | Loss 0.0012 | reward -0.3
0.07480169832706451
Epoch 600/2000 | Loss 0.0005 | reward -0.05
0.0812639445066452
Epoch 650/2000 | Loss 0.0099 | reward 0.75
0.12117545306682587
Epoch 700/2000 | Loss 0.0331 | reward 0.85
0.1761949360370636
Epoch 750/2000 | Loss 0.0036 | reward -0.2
0.16052378714084625
Epoch 800/2000 |

In [0]:
# Further 10000 episodes at epsilon=0.1 with a larger batch (32 instead of
# `batch_size`); training continues from the current `navigation_model` weights.
agent = DQN_separated_net(segment_length=SEGMENT_LENGTH, epsilon=0.1, memory_size=memory_size, batch_size=32)
train(agent,env,10000,prefix='DQN15000_to_25000', next_after_e=True, draw=False)
test(agent,env,20,prefix='DQN15000_to_25000', next_after_e=True, draw=True)



Epoch 000/10000 | Loss 0.0000 | reward 0.8
0.7571251392364502
Epoch 050/10000 | Loss 0.0001 | reward 0.85
0.7503526210784912
Epoch 100/10000 | Loss 0.0001 | reward 0.7
0.7459627985954285
Epoch 150/10000 | Loss 0.0001 | reward 0.8
0.7430984973907471
Epoch 200/10000 | Loss 0.0002 | reward 0.75
0.7399482727050781
Epoch 250/10000 | Loss 0.0002 | reward 0.75
0.754023551940918
Epoch 300/10000 | Loss 0.0001 | reward 0.65
0.7372941970825195
Epoch 350/10000 | Loss 0.0001 | reward 0.85
0.7458965182304382
Epoch 400/10000 | Loss 0.0005 | reward 0.65
0.7467712163925171
Epoch 450/10000 | Loss 0.0001 | reward 0.75
0.7244037985801697
Epoch 500/10000 | Loss 0.0003 | reward 0.7
0.7393524646759033
Epoch 550/10000 | Loss 0.0001 | reward 0.8
0.755128026008606
Epoch 600/10000 | Loss 0.0000 | reward 0.8
0.7556039094924927
Epoch 650/10000 | Loss 0.0001 | reward 0.8
0.7836644649505615
Epoch 700/10000 | Loss 0.0002 | reward 0.65
0.6995659470558167
Epoch 750/10000 | Loss 0.0001 | reward 0.75
0.741199254989624
Ep