### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import os
# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
#from Env import CabDriver


Using TensorFlow backend.


In [2]:
# Import routines

import numpy as np
import math
import random
from itertools import permutations,product

# Defining hyperparameters
m = 5 # number of cities, ranges from 1 ..... m
t = 24 # number of hours, ranges from 0 .... t-1
d = 7  # number of days, ranges from 0 ... d-1
C = 5 # Per hour fuel and other costs
R = 9 # per hour revenue from a passenger


class CabDriver():

    def __init__(self):
        """initialise your state and define your action space and state space"""
        action_list = list(permutations(range(0,m) ,2))
        action_list.append((0,0))
        self.action_space = np.array(action_list) #action space is unique 2 values(source & destination) + the no op
        self.state_space = list(product(*[list(range(0,m)), list(range(0,t)), list(range(0,d))])) #State space from MDP:
        #𝑠=𝑋𝑖𝑇𝑗𝐷𝑘 𝑤ℎ𝑒𝑟𝑒 𝑖=0…𝑚−1;𝑗=0….𝑡−1;𝑘=0…..𝑑−1, Where 𝑋𝑖 represents a driver’s current location, 𝑇𝑗 represents time component (more specifically hour of the day), 𝐷𝑘 represents the day of the week
        self.state_size = len(self.state_space)
        self.action_size = len(self.action_space)
        self.state_init = random.choice(self.state_space) #Initialises to any random self_space
        self.encode_vector = np.array([24*7, 7, 1]).reshape(3, 1)


        # Start the first round
        self.reset()


    ## Encoding state (or state-action) for NN input

    def state_encod_arch1(self, curr_state, batch_size=1):
        """convert the state into a vector so that it can be fed to the NN. This method converts a given state into a vector format. Hint: The vector is of size m + t + d."""
       
        #Encoded values of m + t + d
        
        curr_state = np.array(curr_state).reshape(1, 3)
        #print(curr_state.shape)
        #enc_mat = self.encode_vector
        # pos = (state[0]*24*7) + (state[1]*7) + state[2]
        
        pos_mat = np.dot(curr_state, self.encode_vector)
        state_encod =  np.zeros((1, self.state_size))
        # state_encod[pos] = 1
        for i in range(batch_size):
            state_encod[i][pos_mat[i]] = 1

        return np.reshape(state_encod, [1, env.state_size])
    


    # Use this function if you are using architecture-2 
    # def state_encod_arch2(self, state, action):
    #     """convert the (state-action) into a vector so that it can be fed to the NN. This method converts a given state-action pair into a vector format. Hint: The vector is of size m + t + d + m + m."""

        
    #     return state_encod


    ## Getting number of requests

    def requests(self, state):
        """Determining the number of requests basis the location. 
        Use the table specified in the MDP and complete for rest of the locations"""
        location = state[0]
        requests = 0
        if location == 0:
            requests = np.random.poisson(2)

        if location == 1:
            requests = np.random.poisson(12)   #MDP Poisson distribution
        
        if location == 2:
            requests = np.random.poisson(4)    #MDP Poisson distribution
            
        if location == 3:
            requests = np.random.poisson(7)    #MDP Poisson distribution

        if location == 4:
            requests = np.random.poisson(8)    #MDP Poisson distribution  
            
        if requests > 15:
            requests = 15

        possible_actions_index = random.sample(range(0, (m-1)*m), requests) # (0,0) is not considered as customer request
        possible_actions_index.append(20) #add the index of No-OP action (0, 0)
        actions = [self.action_space[i] for i in possible_actions_index]

        print('Number of actions available', len(actions))
        return possible_actions_index, actions   



    def reward_func(self, state, action, Time_matrix):
        """Takes in state, action and Time-matrix and returns the reward"""
        if action[0] == action[1]:
            reward = -C 
            return reward

        #print('reward:' ,state, action)
        p = action[0]
        q = action[1]
        i = state[0]
        time = state[1]
        day = state[2]
        #print('reward vals:', (p, q, i, time, day))
        t_pq = Time_matrix[p][q][time][day]
        t_ip = Time_matrix[i][p][time][day]
        
        
        reward = (R*t_pq)-(C*(t_pq+t_ip))
        return reward


    def next_state_func(self, state, action, Time_matrix):
        """Takes state and action as input and returns next state"""
        
        #print('next_state :', state, action)
        p = action[0]
        q = action[1]
        i = state[0]
        time_curr = state[1]
        day_curr = state[2]
        #print('next_state_vals :', (p, q, i, time_curr, day_curr))
        time_next = time_curr + Time_matrix[p][q][time_curr][day_curr]

        day_next = int((day_curr+int(time_next/24)) % 7)
        time_next = int(time_next % 24)
            
        next_state = (q,time_next,day_next)
        return next_state


    def reset(self):
        return self.action_space, self.state_space, self.state_init


#### Defining Time Matrix

In [3]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [4]:
#Defining a function to save the Q-dictionary as a pickle file
def save_pickle(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [5]:
class DQNAgent:
    def __init__(self, state_size, action_size, discount_factor=0.95, learning_rate=0.01,
                       epsilon=0.99, epsilon_decay=0.99, epsilon_min=0.01):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate        
        self.epsilon_max = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model_history = None
        
        self.batch_size = 32
        #self.batch_size = 1
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
    

    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        # Write your code here: Add layers to your neural nets       

        # hidden layers
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))     
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model


    def get_action(self, cstate, all_actions, pos_act_ind):
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment
        actions = all_actions[pos_act_ind]
        q_value = 0
        if np.random.rand() <= self.epsilon_max:
            # explore: choose a random action from all possible actions
            print('Exploring')
            action = random.choice(actions)
        else:
            # choose the action with the highest q(s, a)
            # the first index corresponds to the batch size, so
            # reshape state to (1, state_size) so that the first index corresponds to the batch size
            print('Exploiting')
            #cstate = cstate.reshape(1, self.state_size) 
            q_value = self.model.predict(x=cstate)
            max_index = np.argmax(q_value[0])
            action = all_actions[max_index] if max_index in pos_act_ind else random.choice(actions)
        print('Selected action ', action)    
        return action, q_value
        

    def append_sample(self, state, action, reward, next_state, done):
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state, done))
    
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            
            actions, rewards, done = [], [], []
            
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = mini_batch[i]
                
                # Write your code from here
                # 1. Predict the target from earlier model
                update_input[i] = env.state_encod_arch1(state)
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)
                
            # 2. Get the target for the Q-network
            
            target = self.model.predict(update_input)
            target_qval = self.model.predict(update_output)
            #print(target, target.shape)
            #print(target.shape, target_qval.shape)

            #3. Update your 'update_output' and 'update_input' batch
            for i in range(self.batch_size):
                #print(i, actions[i])
                if done[i]:
                    #target[i][actions[i]] = rewards[i]
                    target[i] = rewards[i]
                else: # non-terminal state
                    #target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])
                    target[i] = rewards[i] + self.discount_factor * np.max(target_qval[i])
                
        # 4. Fit your model and track the loss values
            return self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=1)
            
            
    def save(self, name):
        self.model.save(name)

In [8]:
# to store rewards in each episode
rewards_per_episode, episodes, q_vals_per_episode, loss = [], [], [], []

# make dir to store model weights
if not os.path.exists("saved_model_weights"):
    os.mkdir("saved_model_weights")

# n_episodes
n_episodes = 50

### DQN block

In [9]:
 # Call all the initialised variables of the environment
env = CabDriver()
#Call the DQN agent
dqn = DQNAgent(env.state_size, env.action_size)

for episode in range(n_episodes):

    # Write code here
    # Call the environment
   
    _,_,curr_state = env.reset()
    state_size = env.state_size
    pos_act_ind, actions = env.requests(curr_state)
    action = random.choice(actions)
    #action_size = len(actions)
    reward = 0
    curr_time = 0
    q_val_list = []
    #print(curr_state)
    
    
    terminal_state = False
    print("Episode :", episode)
    
    while not terminal_state:
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        encoded_state = env.state_encod_arch1(curr_state)
        #encoded_state = np.reshape(encoded_state, [1, env.state_size])
        action, q_value = dqn.get_action(encoded_state, env.action_space, pos_act_ind)
        # 2. Evaluate your reward and next state
        reward = reward + env.reward_func(curr_state, action, Time_matrix)
        next_state = env.next_state_func(curr_state,action,Time_matrix)
        
        q_val_list.append(q_value)
        
        p = action[0]
        q = action[1]
        i = curr_state[0]
        time = curr_state[1]
        day = curr_state[2]
        print(action)
        curr_time = curr_time + Time_matrix[i][p][time][day]
        
        
        day = int((day+int(time/24)) % 7)
        time = int(time % 24)
        
        curr_time = curr_time + Time_matrix[p][q][time][day]
        day = int((day+int(time/24)) % 7)
        time = int(time % 24)
        # 3. Append the experience to the memory
        dqn.append_sample(curr_state, action, reward, next_state, terminal_state)
        curr_state = next_state
        
        # 4. Train the model by calling function agent.train_model
        history = dqn.train_model()
        # 5. Keep a track of rewards, Q-values, loss
        
        if history:
            print('rewards: ', reward, 'q-value: ', q_value)
            print('loss:', history.history['loss'])
        
        if curr_time >= 24*30:
            terminal_state = True
    
        # store total reward obtained in this episode
    rewards_per_episode.append(reward)
    episodes.append(episode)
        
    if dqn.epsilon_max > dqn.epsilon_min:
        dqn.epsilon_max *= dqn.epsilon_decay        
        
        

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 32)                26912     
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 21)                693       
Total params: 28,661
Trainable params: 28,661
Non-trainable params: 0
_________________________________________________________________
Number of actions available 13
Episode : 0
Exploring
Selected action  [3 0]
[3 0]
Exploring
Selected action  [0 0]
[0 0]
Exploring
Selected action  [0 0]
[0 0]
Exploring
Selected action  [1 2]
[1 2]
Exploring
Selected action  [1 3]
[1 3]
Exploring
Selected action  [1 2]
[1 2]
Exploring
Selected action  [4 2]
[4 2]
Exploring
Selected action  [1 2]
[1 2]
Exploring
Selected action  [4 2]
[4 2]
Explorin

rewards:  -87.0 q-value:  0
loss: [2619.329833984375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -92.0 q-value:  0
loss: [3141.464599609375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [2344.776123046875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -71.0 q-value:  0
loss: [2387.755859375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -61.0 q-value:  0
loss: [2557.438232421875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -48.0 q-value:  0
loss: [2769.4765625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [3269.598388671875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [2326.53125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -32.0 q-value:  0
loss: [2720.67529296875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [2776.519287109375]
Exploring
Selected action  [2 4]
[2

Epoch 1/1
rewards:  74.0 q-value:  0
loss: [3435.11572265625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  68.0 q-value:  0
loss: [4009.5185546875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  86.0 q-value:  0
loss: [3403.7548828125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  114.0 q-value:  0
loss: [5108.9951171875]
Number of actions available 7
Episode : 1
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [2993.88525390625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [2605.1181640625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  18.0 q-value:  0
loss: [4047.58642578125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  12.0 q-value:  0
loss: [4246.8330078125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  12.0 q-value:  0
loss: [4494.99560546875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -28.0 q-value:  0
loss: [3525.0268

rewards:  -156.0 q-value:  0
loss: [14128.9072265625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -161.0 q-value:  0
loss: [20098.05078125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -158.0 q-value:  0
loss: [14830.4755859375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -160.0 q-value:  0
loss: [15297.3359375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -176.0 q-value:  0
loss: [19351.123046875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -177.0 q-value:  0
loss: [13054.650390625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -178.0 q-value:  0
loss: [12498.2197265625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -166.0 q-value:  0
loss: [22153.826171875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [20494.99609375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -160.0 q-value:  0
loss: [13633.521484375]
Exploring
Selected action  [0 

Epoch 1/1
rewards:  -195.0 q-value:  0
loss: [22205.34375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -185.0 q-value:  0
loss: [27353.18359375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -177.0 q-value:  0
loss: [39006.6328125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -179.0 q-value:  0
loss: [17760.69921875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -186.0 q-value:  0
loss: [16186.4990234375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -188.0 q-value:  0
loss: [35986.2578125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -180.0 q-value:  0
loss: [31172.109375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -186.0 q-value:  0
loss: [35223.7890625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -191.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [21058.28515625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -213.0 q-va

Epoch 1/1
rewards:  -396.0 q-value:  0
loss: [28318.359375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -401.0 q-value:  0
loss: [39078.44921875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -431.0 q-value:  0
loss: [30372.078125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -427.0 q-value:  0
loss: [32530.20703125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -432.0 q-value:  0
loss: [18169.998046875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -432.0 q-value:  0
loss: [20846.37109375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -432.0 q-value:  0
loss: [40392.1875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -437.0 q-value:  0
loss: [27385.673828125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -437.0 q-value:  0
loss: [40680.3515625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -433.0 q-value:  0
loss: [35958.78125]
Exploring
Selected action  [4 1]
[4 1]


rewards:  -30.0 q-value:  0
loss: [59688.80859375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [47449.84375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -41.0 q-value:  0
loss: [57568.5859375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -27.0 q-value:  0
loss: [46500.7265625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -32.0 q-value:  0
loss: [58281.5859375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -37.0 q-value:  0
loss: [32350.984375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [59393.5625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -34.0 q-value:  0
loss: [51239.92578125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -45.0 q-value:  0
loss: [30849.796875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -51.0 q-value:  0
loss: [44718.421875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -52.0 

rewards:  -63.0 q-value:  0
loss: [33099.44921875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -72.0 q-value:  0
loss: [39801.66015625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -72.0 q-value:  0
loss: [42444.4140625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [39056.8359375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -87.0 q-value:  0
loss: [35136.2890625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [66153.53125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -55.0 q-value:  0
loss: [37727.8515625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [30039.25]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -60.0 q-value:  0
loss: [57192.1875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -70.0 q-value:  0
loss: [22813.0234375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -51.0 q-

rewards:  21.0 q-value:  0
loss: [36890.015625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  31.0 q-value:  0
loss: [38477.3125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [32440.341796875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [51025.46484375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [36768.23046875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [29105.43359375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [38959.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [46644.46875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  3.0 q-value:  0
loss: [44055.2578125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -13.0 q-value:  0
loss: [41445.2109375]
Number of actions available 6
Episode : 3
Exploring
Selected action  [1 0]
[1

rewards:  -35.0 q-value:  0
loss: [28978.576171875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [46127.59375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -25.0 q-value:  0
loss: [51118.05859375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -30.0 q-value:  0
loss: [42757.25390625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -30.0 q-value:  0
loss: [26365.81640625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -30.0 q-value:  0
loss: [15914.4248046875]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -26.0 q-value:  0
loss: [46338.09765625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -26.0 q-value:  0
loss: [37704.7734375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [32253.369140625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [31667.587890625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1

rewards:  -27.0 q-value:  0
loss: [43364.33984375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -34.0 q-value:  0
loss: [39583.95703125]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -27.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [50679.703125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -37.0 q-value:  0
loss: [26920.521484375]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -31.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [26516.826171875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -41.0 q-value:  0
loss: [37602.5390625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [26754.259765625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -57.0 q-value:  0
loss: [28074.578125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -62.0 q-value:  0
loss: [15859.5]
Exploring
Selected action 

rewards:  -63.0 q-value:  0
loss: [27457.3203125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [27431.65625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [22057.4375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [12937.845703125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [36498.01171875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [17166.13671875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [18862.6875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -89.0 q-value:  0
loss: [26474.765625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -119.0 q-value:  0
loss: [16703.384765625]
Number of actions available 11
Episode : 4
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [22304.59765625]
Exploring
Selected

rewards:  -133.0 q-value:  0
loss: [30222.4765625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -148.0 q-value:  0
loss: [38794.6796875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -153.0 q-value:  0
loss: [25129.39453125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [5401.2939453125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -114.0 q-value:  0
loss: [29784.5703125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -114.0 q-value:  0
loss: [26785.51171875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -106.0 q-value:  0
loss: [21581.650390625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -110.0 q-value:  0
loss: [19844.30859375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -94.0 q-value:  0
loss: [32801.6328125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -62.0 q-value:  0
loss: [39564.2734375]
Exploring
Selected action  [0 1]
[0 1]
Epoch

rewards:  -79.0 q-value:  0
loss: [19699.21875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -111.0 q-value:  0
loss: [32827.15234375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -91.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [25012.349609375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -93.0 q-value:  0
loss: [19543.58984375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -94.0 q-value:  0
loss: [22819.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [17454.080078125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -83.0 q-value:  0
loss: [23031.09375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [41638.609375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -112.0 q-value:  0
loss: [18022.4375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -117.0 q-value:  0
loss: [28373.511

Epoch 1/1
rewards:  -23.0 q-value:  0
loss: [17812.0390625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -28.0 q-value:  0
loss: [54678.96875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [21011.751953125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -34.0 q-value:  0
loss: [16790.15234375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -35.0 q-value:  0
loss: [22561.494140625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [22826.58984375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [38108.00390625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -54.0 q-value:  0
loss: [36340.90625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [19070.55078125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [16569.9765625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 

Epoch 1/1
rewards:  -196.0 q-value:  0
loss: [22917.046875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -201.0 q-value:  0
loss: [32401.8515625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -206.0 q-value:  0
loss: [13833.9169921875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -207.0 q-value:  0
loss: [40501.8046875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -208.0 q-value:  0
loss: [19119.029296875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -209.0 q-value:  0
loss: [21569.5]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -213.0 q-value:  0
loss: [18886.677734375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -217.0 q-value:  0
loss: [22622.748046875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -222.0 q-value:  0
loss: [28071.306640625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -227.0 q-value:  0
loss: [25267.490234375]
Exploring
Selected action  [4 0]


rewards:  -334.0 q-value:  0
loss: [35461.0703125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -335.0 q-value:  0
loss: [21220.517578125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -340.0 q-value:  0
loss: [16864.04296875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -341.0 q-value:  0
loss: [39755.96875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -341.0 q-value:  0
loss: [30542.34765625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -342.0 q-value:  0
loss: [20072.71875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -342.0 q-value:  0
loss: [20053.94921875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -342.0 q-value:  0
loss: [15498.66796875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -342.0 q-value:  0
loss: [20542.01171875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -347.0 q-value:  0
loss: [20266.484375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/

rewards:  -469.0 q-value:  0
loss: [63426.3359375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -470.0 q-value:  0
loss: [37322.02734375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -471.0 q-value:  0
loss: [30767.1015625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -472.0 q-value:  0
loss: [31019.03125]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -480.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [33311.4375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -482.0 q-value:  0
loss: [23077.798828125]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -484.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27891.71875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -489.0 q-value:  0
loss: [31205.63671875]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -491.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

rewards:  -605.0 q-value:  0
loss: [72206.84375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -609.0 q-value:  0
loss: [66547.15625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -610.0 q-value:  0
loss: [41572.46875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -615.0 q-value:  0
loss: [68833.28125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -618.0 q-value:  0
loss: [38854.234375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -624.0 q-value:  0
loss: [52951.96875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -627.0 q-value:  0
loss: [57327.34375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -630.0 q-value:  0
loss: [35919.953125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -635.0 q-value:  0
loss: [47675.0625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -636.0 q-value:  0
loss: [40834.5625]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -636.0 q-

rewards:  -107.0 q-value:  0
loss: [56878.8125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -119.0 q-value:  0
loss: [60643.625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -124.0 q-value:  0
loss: [55934.15625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [70526.015625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -136.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [53835.1875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -112.0 q-value:  0
loss: [35134.3125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -88.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [72452.34375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -76.0 q-value:  0
loss: [29477.78125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -79.0 q-value:  0
loss: [38282.375]
Exploring
Selected action  [4 0]
[4 0]
Epoch

rewards:  -110.0 q-value:  0
loss: [51586.28515625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -115.0 q-value:  0
loss: [86930.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -120.0 q-value:  0
loss: [62810.34375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -108.0 q-value:  0
loss: [57277.09375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -114.0 q-value:  0
loss: [47739.125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [51495.8828125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -132.0 q-value:  0
loss: [52213.05859375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -124.0 q-value:  0
loss: [68072.5625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -150.0 q-value:  0
loss: [61844.1875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -146.0 q-value:  0
loss: [47198.96875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -140.0

rewards:  10.0 q-value:  0
loss: [56443.32421875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [49648.625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [41795.625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -27.0 q-value:  0
loss: [38626.875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -20.0 q-value:  0
loss: [31093.0390625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [26669.0]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [53852.453125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [44642.57421875]
Number of actions available 6
Episode : 7
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  36.0 q-value:  0
loss: [85898.375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [46094.15625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/

rewards:  -13.0 q-value:  0
loss: [53149.84375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -18.0 q-value:  0
loss: [46296.75]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [79143.0703125]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -28.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [46899.3671875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [81395.25]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -46.0 q-value:  0
loss: [52178.375]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -18.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [51244.8671875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -23.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [67194.5390625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -26.0 q-value:  

rewards:  74.0 q-value:  0
loss: [42118.6875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  102.0 q-value:  0
loss: [70424.21875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  114.0 q-value:  0
loss: [77062.6875]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  111.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [55913.96875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  104.0 q-value:  0
loss: [56357.5]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  105.0 q-value:  0
loss: [39400.9375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  100.0 q-value:  0
loss: [40099.78125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  95.0 q-value:  0
loss: [54227.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  90.0 q-value:  0
loss: [46462.40625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  78.0 q-value:  0
loss: [57119.1875]
Exploring
Selected action  [0

Epoch 1/1
rewards:  -58.0 q-value:  0
loss: [54823.03125]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -34.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [44483.5625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -35.0 q-value:  0
loss: [33630.34765625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [26102.46875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -67.0 q-value:  0
loss: [50364.40625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [44696.96875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -73.0 q-value:  0
loss: [62860.71875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -74.0 q-value:  0
loss: [31846.25]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -70.0 q-value:  0
loss: [50381.46484375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -93.0 q-value:  0
loss: [32499.65625]
E

rewards:  -183.0 q-value:  0
loss: [46817.53125]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -189.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11567.03515625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -191.0 q-value:  0
loss: [53755.59375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -192.0 q-value:  0
loss: [39748.51953125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -189.0 q-value:  0
loss: [42106.96875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -179.0 q-value:  0
loss: [35630.15625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -196.0 q-value:  0
loss: [12460.53125]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -202.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [41371.21875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -199.0 q-value:  0
loss: [61720.8359375]
Exploring
Selected action  

Epoch 1/1
rewards:  -237.0 q-value:  0
loss: [18779.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -209.0 q-value:  0
loss: [51633.3984375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -244.0 q-value:  0
loss: [63127.09375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -240.0 q-value:  0
loss: [40038.8125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -265.0 q-value:  0
loss: [51821.9375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -270.0 q-value:  0
loss: [65819.03125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -275.0 q-value:  0
loss: [20200.6875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -278.0 q-value:  0
loss: [33390.5625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -278.0 q-value:  0
loss: [41238.375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -283.0 q-value:  0
loss: [72984.546875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -26

Epoch 1/1
rewards:  -67.0 q-value:  0
loss: [35817.875]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -97.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27769.1875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -109.0 q-value:  0
loss: [39431.84375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -139.0 q-value:  0
loss: [72003.6875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -169.0 q-value:  0
loss: [75501.625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -169.0 q-value:  0
loss: [38910.15625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -166.0 q-value:  0
loss: [88188.96875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -167.0 q-value:  0
loss: [53642.875]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -172.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [32310.53125]
Exploiting
Selected action  [1 2]
[

rewards:  -283.0 q-value:  0
loss: [49098.4375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -306.0 q-value:  0
loss: [44045.5]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -316.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [67648.1171875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -325.0 q-value:  0
loss: [44596.34375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -364.0 q-value:  0
loss: [14384.7861328125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -358.0 q-value:  0
loss: [66801.9375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -342.0 q-value:  0
loss: [71586.296875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -345.0 q-value:  0
loss: [43875.70703125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -337.0 q-value:  0
loss: [56079.0]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -341.0 q-value:  0
loss: [22839.064453125

rewards:  -414.0 q-value:  0
loss: [48888.21875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -444.0 q-value:  0
loss: [44839.84375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -444.0 q-value:  0
loss: [29029.78125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -442.0 q-value:  0
loss: [24881.90625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -445.0 q-value:  0
loss: [56628.109375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -452.0 q-value:  0
loss: [53847.875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -455.0 q-value:  0
loss: [61131.5]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -456.0 q-value:  0
loss: [79788.8125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -453.0 q-value:  0
loss: [49225.8125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -453.0 q-value:  0
loss: [31145.125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -441.0 q-value:  0


Epoch 1/1
rewards:  -52.0 q-value:  0
loss: [81668.4140625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -66.0 q-value:  0
loss: [41030.5625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -67.0 q-value:  0
loss: [69807.78125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -68.0 q-value:  0
loss: [69214.5]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -69.0 q-value:  0
loss: [36031.5625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -74.0 q-value:  0
loss: [61734.78125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [56243.16015625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -68.0 q-value:  0
loss: [45377.96875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -92.0 q-value:  0
loss: [62530.40625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [38200.8046875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -91.0 q-v

Epoch 1/1
rewards:  -286.0 q-value:  0
loss: [34010.25]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -291.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [43600.96875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -296.0 q-value:  0
loss: [66026.09375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -298.0 q-value:  0
loss: [46807.34375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -272.0 q-value:  0
loss: [50074.40625]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -287.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [68422.5078125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -292.0 q-value:  0
loss: [72363.1640625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -292.0 q-value:  0
loss: [65159.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -297.0 q-value:  0
loss: [87063.640625]
Exploring
Selected acti

rewards:  -61.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [25497.875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [32532.3125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [38439.96875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -80.0 q-value:  0
loss: [29593.84375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -82.0 q-value:  0
loss: [57392.89453125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [74793.65625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -104.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [66122.28125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -106.0 q-value:  0
loss: [74551.5]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -108.0 q-value:  0
loss: [42104.875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1

Epoch 1/1
rewards:  -329.0 q-value:  0
loss: [45088.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -334.0 q-value:  0
loss: [20680.96875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -339.0 q-value:  0
loss: [53262.22265625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -344.0 q-value:  0
loss: [55483.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -349.0 q-value:  0
loss: [55726.95703125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -359.0 q-value:  0
loss: [55547.15234375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -361.0 q-value:  0
loss: [41938.90625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -386.0 q-value:  0
loss: [29912.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -391.0 q-value:  0
loss: [40103.46875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -393.0 q-value:  0
loss: [60629.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewar

rewards:  -694.0 q-value:  0
loss: [55453.625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -694.0 q-value:  0
loss: [62535.375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -699.0 q-value:  0
loss: [48720.09375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -704.0 q-value:  0
loss: [53911.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -709.0 q-value:  0
loss: [61030.625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -714.0 q-value:  0
loss: [59601.75390625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -719.0 q-value:  0
loss: [47480.875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -724.0 q-value:  0
loss: [43411.65625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -724.0 q-value:  0
loss: [67497.125]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -759.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [82038.0]
Exploring
Selec

Epoch 1/1
rewards:  -1039.0 q-value:  0
loss: [75209.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1044.0 q-value:  0
loss: [87794.125]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1049.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [43995.375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1054.0 q-value:  0
loss: [92659.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1059.0 q-value:  0
loss: [60227.03125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1064.0 q-value:  0
loss: [72383.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1069.0 q-value:  0
loss: [63232.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1074.0 q-value:  0
loss: [49516.875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1109.0 q-value:  0
loss: [71294.90625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1109.0 q-value:  0
loss: [92937.75]


rewards:  -1379.0 q-value:  0
loss: [66229.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1384.0 q-value:  0
loss: [205025.625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1419.0 q-value:  0
loss: [51955.3125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1419.0 q-value:  0
loss: [85323.71875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -1424.0 q-value:  0
loss: [82852.34375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1459.0 q-value:  0
loss: [215929.1875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1464.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [230533.21875]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1499.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [108541.34375]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1499.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

Epoch 1/1
rewards:  -1784.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [106245.40625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -1789.0 q-value:  0
loss: [209324.25]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1824.0 q-value:  0
loss: [142651.90625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1824.0 q-value:  0
loss: [117095.9375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -1829.0 q-value:  0
loss: [215726.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -1829.0 q-value:  0
loss: [242831.625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -1864.0 q-value:  0
loss: [143197.46875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -1869.0 q-value:  0
loss: [117675.40625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1874.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [275790.9375]
Exploring
Selec

rewards:  -2384.0 q-value:  0
loss: [188162.875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -2384.0 q-value:  0
loss: [168766.78125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -2419.0 q-value:  0
loss: [219730.375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -2424.0 q-value:  0
loss: [248479.59375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -2424.0 q-value:  0
loss: [259086.15625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -2429.0 q-value:  0
loss: [420352.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -2434.0 q-value:  0
loss: [249557.03125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -2434.0 q-value:  0
loss: [290806.71875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -2469.0 q-value:  0
loss: [163249.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -2469.0 q-value:  0
loss: [414828.21875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
reward

rewards:  92.0 q-value:  0
loss: [299809.875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  100.0 q-value:  0
loss: [295687.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  95.0 q-value:  0
loss: [170170.40625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  90.0 q-value:  0
loss: [172289.75]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  114.0 q-value:  0
loss: [381799.4375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  113.0 q-value:  0
loss: [95137.84375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  108.0 q-value:  0
loss: [278931.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  103.0 q-value:  0
loss: [651533.125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  107.0 q-value:  0
loss: [247117.0625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  95.0 q-value:  0
loss: [157228.625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  90.0 q-value:  0
loss: [42518

rewards:  126.0 q-value:  0
loss: [328219.21875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  150.0 q-value:  0
loss: [288060.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  145.0 q-value:  0
loss: [282586.0]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  143.0 q-value:  0
loss: [624822.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  137.0 q-value:  0
loss: [289221.125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  131.0 q-value:  0
loss: [278056.3125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  126.0 q-value:  0
loss: [352980.71875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  149.0 q-value:  0
loss: [226850.34375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  123.0 q-value:  0
loss: [172242.0625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  118.0 q-value:  0
loss: [211861.96875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  95.0 q-value:  0
loss:

rewards:  56.0 q-value:  0
loss: [256776.203125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  40.0 q-value:  0
loss: [121585.84375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  31.0 q-value:  0
loss: [428057.5]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  28.0 q-value:  0
loss: [301984.875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  40.0 q-value:  0
loss: [490694.0]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  43.0 q-value:  0
loss: [383761.5]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  51.0 q-value:  0
loss: [339658.96875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  77.0 q-value:  0
loss: [178802.25]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  51.0 q-value:  0
loss: [199434.5625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  75.0 q-value:  0
loss: [248860.5]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  103.0 q-value:  0
loss: [235752.9375]
Exp

rewards:  -11.0 q-value:  0
loss: [451158.3125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [345006.03125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  9.0 q-value:  0
loss: [703533.875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [177103.28125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [736420.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [159043.8125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [401187.4375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [245372.34375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -24.0 q-value:  0
loss: [280963.3125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [762540.5]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -22.0 q-value:  0
loss: [

Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [463489.96875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -142.0 q-value:  0
loss: [442560.09375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -143.0 q-value:  0
loss: [163875.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -148.0 q-value:  0
loss: [256105.6875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -153.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [426664.65625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -134.0 q-value:  0
loss: [45223.0]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -126.0 q-value:  0
loss: [385891.8125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -131.0 q-value:  0
loss: [528403.8125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -143.0 q-value:  0
loss: [387204.25]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [401644.5

rewards:  -238.0 q-value:  0
loss: [378699.9375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -243.0 q-value:  0
loss: [342867.28125]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -239.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [909041.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -219.0 q-value:  0
loss: [381199.3125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -204.0 q-value:  0
loss: [221592.34375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -188.0 q-value:  0
loss: [153361.34375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -195.0 q-value:  0
loss: [287268.0625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -183.0 q-value:  0
loss: [381560.4375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -186.0 q-value:  0
loss: [397959.8125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -176.0 q-value:  0
loss: [301413.84375]

Epoch 1/1
rewards:  157.0 q-value:  0
loss: [551671.375]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  177.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [264945.125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  137.0 q-value:  0
loss: [104828.3125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  137.0 q-value:  0
loss: [364171.0]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  140.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [144066.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  135.0 q-value:  0
loss: [608248.875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  155.0 q-value:  0
loss: [91310.0625]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  146.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [481299.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  141.0 q-value:

rewards:  269.0 q-value:  0
loss: [258514.25]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  273.0 q-value:  0
loss: [324616.3125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  284.0 q-value:  0
loss: [168479.1875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  295.0 q-value:  0
loss: [115702.640625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  319.0 q-value:  0
loss: [493207.4375]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  323.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [515517.125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  330.0 q-value:  0
loss: [583093.0]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  329.0 q-value:  0
loss: [322819.03125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  322.0 q-value:  0
loss: [251008.109375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  346.0 q-value:  0
loss: [242575.8125]
Exploring
Sele

rewards:  10.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [152721.4375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  26.0 q-value:  0
loss: [375540.21875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [258063.8125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  14.0 q-value:  0
loss: [287282.84375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [105021.59375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  20.0 q-value:  0
loss: [288764.125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  9.0 q-value:  0
loss: [275901.65625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  32.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [150758.5]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  26.0 q-value:  0
loss: [493457.53125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards

rewards:  -188.0 q-value:  0
loss: [442860.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -194.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [119726.96875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -201.0 q-value:  0
loss: [308662.84375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -193.0 q-value:  0
loss: [572725.0]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -196.0 q-value:  0
loss: [159818.71875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -176.0 q-value:  0
loss: [467715.40625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -186.0 q-value:  0
loss: [309943.0]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -179.0 q-value:  0
loss: [221450.09375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -155.0 q-value:  0
loss: [380063.65625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -149.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0

rewards:  -162.0 q-value:  0
loss: [503023.6875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -151.0 q-value:  0
loss: [467356.375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -149.0 q-value:  0
loss: [266649.6875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [455392.8125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -168.0 q-value:  0
loss: [519272.6875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -160.0 q-value:  0
loss: [126852.8125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -156.0 q-value:  0
loss: [209551.25]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -164.0 q-value:  0
loss: [371025.78125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -171.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [564761.9375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -173.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 

Epoch 1/1
rewards:  124.0 q-value:  0
loss: [540754.5]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  113.0 q-value:  0
loss: [477729.78125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  112.0 q-value:  0
loss: [188015.0]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  94.0 q-value:  0
loss: [433620.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  89.0 q-value:  0
loss: [230869.0]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  109.0 q-value:  0
loss: [113345.28125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  101.0 q-value:  0
loss: [240818.75]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  98.0 q-value:  0
loss: [234169.40625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  96.0 q-value:  0
loss: [360888.78125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  76.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [251867.03125]
Exploring
Sel

rewards:  60.0 q-value:  0
loss: [84399.0]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  58.0 q-value:  0
loss: [51443.1875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  82.0 q-value:  0
loss: [761884.0]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  52.0 q-value:  0
loss: [639023.5]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  16.0 q-value:  0
loss: [322969.25]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  43.0 q-value:  0
loss: [805193.25]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  50.0 q-value:  0
loss: [519788.0625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  60.0 q-value:  0
loss: [377015.0625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  83.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [244436.625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  119.0 q-value:  0
loss: [780129.75]
Exploring
Selected action  [4 3]
[4 3]
E

Epoch 1/1
rewards:  -17.0 q-value:  0
loss: [296445.5625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -22.0 q-value:  0
loss: [359314.15625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -27.0 q-value:  0
loss: [307224.4375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -7.0 q-value:  0
loss: [291146.15625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -12.0 q-value:  0
loss: [335435.875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  2.0 q-value:  0
loss: [350339.84375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  0.0 q-value:  0
loss: [364076.53125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [587727.75]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [569546.8125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -15.0 q-value:  0
loss: [175097.0625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -20.0 q-value: 

Epoch 1/1
rewards:  -75.0 q-value:  0
loss: [549578.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -80.0 q-value:  0
loss: [80646.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -85.0 q-value:  0
loss: [259639.453125]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -90.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [224202.4375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [416201.3125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -102.0 q-value:  0
loss: [53751.375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [903192.5]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -118.0 q-value:  0
loss: [162318.1875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -123.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [444163.125]
Exploiting
Selected action  [0 0]
[0 0

rewards:  -357.0 q-value:  0
loss: [506386.9375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -343.0 q-value:  0
loss: [86857.78125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -345.0 q-value:  0
loss: [347915.5625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -341.0 q-value:  0
loss: [221990.5]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -342.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [76115.40625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -357.0 q-value:  0
loss: [340427.71875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -362.0 q-value:  0
loss: [363726.21875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -338.0 q-value:  0
loss: [375776.8125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -344.0 q-value:  0
loss: [70822.90625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -335.0 q-value:  0
loss: [521113.84375]
Exp

Epoch 1/1
rewards:  -463.0 q-value:  0
loss: [276486.6875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -443.0 q-value:  0
loss: [201724.28125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -446.0 q-value:  0
loss: [510676.9375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -451.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [188129.59375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -456.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [273080.8125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -461.0 q-value:  0
loss: [690347.375]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -489.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [266351.875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -472.0 q-value:  0
loss: [288971.40625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards

Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [224716.9375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -79.0 q-value:  0
loss: [477114.875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -115.0 q-value:  0
loss: [476992.1875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -111.0 q-value:  0
loss: [543831.875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -112.0 q-value:  0
loss: [467499.8125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -133.0 q-value:  0
loss: [750400.5]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -129.0 q-value:  0
loss: [883017.125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -125.0 q-value:  0
loss: [422200.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -130.0 q-value:  0
loss: [222550.4375]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -121.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [420020.90625]

rewards:  -102.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [254186.6875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -105.0 q-value:  0
loss: [585065.25]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [328666.46875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -100.0 q-value:  0
loss: [213515.46875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [388117.5625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -106.0 q-value:  0
loss: [636649.375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -102.0 q-value:  0
loss: [445254.625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [482435.375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [320714.28125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -148.0 q-value:  0
loss: [402820.28125]
Exploit

Epoch 1/1
rewards:  -113.0 q-value:  0
loss: [603585.0625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -118.0 q-value:  0
loss: [650516.75]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -118.0 q-value:  0
loss: [140039.75]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -111.0 q-value:  0
loss: [282307.53125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [361012.96875]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -107.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [639937.875]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -99.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [381331.5]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -104.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [518864.59375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -85.0

Epoch 1/1
rewards:  -10.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [288627.875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -30.0 q-value:  0
loss: [169265.25]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -6.0 q-value:  0
loss: [248961.9375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  3.0 q-value:  0
loss: [269715.90625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  2.0 q-value:  0
loss: [406067.4375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [278123.96875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -19.0 q-value:  0
loss: [368274.09375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -18.0 q-value:  0
loss: [419599.90625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -39.0 q-value:  0
loss: [389904.28125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -38.0 q-value:  0
loss: [473414.4375]
Explori

rewards:  -143.0 q-value:  0
loss: [600289.375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -171.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [475299.375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -151.0 q-value:  0
loss: [195873.34375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -143.0 q-value:  0
loss: [178093.40625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [892919.375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -146.0 q-value:  0
loss: [205866.4375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -122.0 q-value:  0
loss: [314031.75]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -110.0 q-value:  0
loss: [593645.8125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -135.0 q-value:  0
loss: [526157.375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [215461.8125]
Explori

rewards:  -48.0 q-value:  0
loss: [304148.3125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -50.0 q-value:  0
loss: [292882.59375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -42.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [224143.34375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -44.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [274860.03125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [455193.90625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -50.0 q-value:  0
loss: [504957.15625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -52.0 q-value:  0
loss: [261885.65625]
Number of actions available 8
Episode : 20
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [557732.375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [521947.9375]

Epoch 1/1
rewards:  72.0 q-value:  0
loss: [252608.09375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  91.0 q-value:  0
loss: [528472.5]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  99.0 q-value:  0
loss: [30938.28125]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  93.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [273910.9375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  91.0 q-value:  0
loss: [208637.09375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  86.0 q-value:  0
loss: [73984.46875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  58.0 q-value:  0
loss: [485937.5]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  55.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [677895.875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [231007.90625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/

rewards:  -89.0 q-value:  0
loss: [46575.15625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -91.0 q-value:  0
loss: [91457.59375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [310039.1875]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -109.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [264337.96875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -106.0 q-value:  0
loss: [414549.90625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -98.0 q-value:  0
loss: [198846.15625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -102.0 q-value:  0
loss: [570582.25]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [73119.0]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -97.0 q-value:  0
loss: [169640.3125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -125.0 q-value:  0
loss: [65252.9375]
Exploring
Se

rewards:  -145.0 q-value:  0
loss: [115115.09375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [34316.46875]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -148.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [242220.0625]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -145.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [268272.71875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -145.0 q-value:  0
loss: [340020.125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -147.0 q-value:  0
loss: [253023.75]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -149.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [364038.09375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -137.0 q-value:  0
loss: [44656.46875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -140.0 q

Epoch 1/1
rewards:  -276.0 q-value:  0
loss: [149765.03125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -272.0 q-value:  0
loss: [29308.84375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -277.0 q-value:  0
loss: [406269.15625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -273.0 q-value:  0
loss: [557761.875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -275.0 q-value:  0
loss: [211518.0625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -277.0 q-value:  0
loss: [361712.78125]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -287.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [525224.5]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -270.0 q-value:  0
loss: [183002.90625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -275.0 q-value:  0
loss: [315901.9375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -280.0 q-value:  0
loss: [285876

Epoch 1/1
rewards:  -341.0 q-value:  0
loss: [588624.125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -344.0 q-value:  0
loss: [193841.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -349.0 q-value:  0
loss: [306885.78125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -333.0 q-value:  0
loss: [188863.375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -333.0 q-value:  0
loss: [140740.34375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -321.0 q-value:  0
loss: [475948.0]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -314.0 q-value:  0
loss: [52811.09375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -314.0 q-value:  0
loss: [281340.09375]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -318.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [50997.5625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -323.0 q-value:  [[0. 0. 0. 0. 0.

rewards:  -72.0 q-value:  0
loss: [314021.90625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -77.0 q-value:  0
loss: [260297.1875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -94.0 q-value:  0
loss: [141216.28125]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -99.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [307571.21875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -116.0 q-value:  0
loss: [191026.15625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -113.0 q-value:  0
loss: [105136.59375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -115.0 q-value:  0
loss: [415345.28125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -120.0 q-value:  0
loss: [391822.96875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -126.0 q-value:  0
loss: [319358.46875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [402120.25]
E

Epoch 1/1
rewards:  -301.0 q-value:  0
loss: [403066.5]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -306.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [392286.71875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -308.0 q-value:  0
loss: [86657.84375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -313.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [155917.25]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -315.0 q-value:  0
loss: [452162.5625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -317.0 q-value:  0
loss: [610498.625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -322.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [301357.71875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -327.0 q-value:  0
loss: [548507.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -33

[2 4]
Epoch 1/1
rewards:  -520.0 q-value:  0
loss: [62802.40625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -521.0 q-value:  0
loss: [313489.15625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -522.0 q-value:  0
loss: [423562.4375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -523.0 q-value:  0
loss: [559947.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -524.0 q-value:  0
loss: [244114.59375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -525.0 q-value:  0
loss: [268894.59375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -531.0 q-value:  0
loss: [255560.96875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -533.0 q-value:  0
loss: [631059.875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -535.0 q-value:  0
loss: [454565.03125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -537.0 q-value:  0
loss: [196120.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
re

Epoch 1/1
rewards:  -675.0 q-value:  0
loss: [290663.78125]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -683.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [554398.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -688.0 q-value:  0
loss: [414220.53125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -691.0 q-value:  0
loss: [395922.0625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -694.0 q-value:  0
loss: [541050.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -699.0 q-value:  0
loss: [184553.46875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -709.0 q-value:  0
loss: [622893.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -714.0 q-value:  0
loss: [601146.8125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -716.0 q-value:  0
loss: [166650.78125]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -718.0 q-value:  [[0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  20.0 q-value:  0
loss: [173472.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  15.0 q-value:  0
loss: [83002.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [205683.78125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  36.0 q-value:  0
loss: [135877.6875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [535296.5]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [187667.6875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [678769.3125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [527028.625]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  18.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [331183.25]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  12.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Epoch 1/1
rewards:  -42.0 q-value:  0
loss: [318771.34375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -28.0 q-value:  0
loss: [223860.9375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [654300.6875]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -20.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [196842.9375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -25.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [373073.8125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -24.0 q-value:  0
loss: [188635.875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -24.0 q-value:  0
loss: [269394.03125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [485316.9375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [666203.75]
Exploring
Selected action  [0 0]
[0 0]


rewards:  -100.0 q-value:  0
loss: [522299.4375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -108.0 q-value:  0
loss: [443962.9375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -113.0 q-value:  0
loss: [202797.4375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -115.0 q-value:  0
loss: [707807.25]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -107.0 q-value:  0
loss: [450348.90625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -112.0 q-value:  0
loss: [85939.1875]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -104.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [210401.0]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -109.0 q-value:  0
loss: [314554.15625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [399967.25]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [513756.5]
Exploring
Se

rewards:  -42.0 q-value:  0
loss: [99965.84375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -35.0 q-value:  0
loss: [480910.6875]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -40.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [176190.46875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -20.0 q-value:  0
loss: [446222.34375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -10.0 q-value:  0
loss: [350487.125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -52.0 q-value:  0
loss: [134024.15625]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -54.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [199768.6875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [112468.5]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -58.0 q-value:  0
loss: [201751.96875]
Exploring
Selected action  [2 4]
[2 4]
Epoch

Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [508254.03125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  1.0 q-value:  0
loss: [501665.0625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  21.0 q-value:  0
loss: [396551.4375]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  29.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [131418.65625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  37.0 q-value:  0
loss: [310410.34375]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  31.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [136845.71875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  26.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [374797.9375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  34.0 q-value:  0
loss: [280331.4375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  58.0 q-val

Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  115.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [92798.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  110.0 q-value:  0
loss: [345705.09375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  122.0 q-value:  0
loss: [450371.03125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  97.0 q-value:  0
loss: [381003.0625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  120.0 q-value:  0
loss: [979676.125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  152.0 q-value:  0
loss: [129005.59375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  149.0 q-value:  0
loss: [341930.65625]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  147.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [291008.96875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  162.0 q-value:  0
loss: [342010.71875]
Expl

rewards:  -46.0 q-value:  0
loss: [240598.09375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -47.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [336944.84375]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -79.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [236796.9375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [87137.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -69.0 q-value:  0
loss: [305934.65625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -64.0 q-value:  0
loss: [412029.15625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -65.0 q-value:  0
loss: [80828.40625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -66.0 q-value:  0
loss: [109889.0]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -71.0 q-value:  0
loss: [457845.46875]
Exploring
Selected action  [1 3]
[1 3]
Epoc

Epoch 1/1
rewards:  -166.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [122459.21875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -162.0 q-value:  0
loss: [261341.0]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -163.0 q-value:  0
loss: [338711.28125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -168.0 q-value:  0
loss: [203469.46875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -173.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [274881.0]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -176.0 q-value:  0
loss: [700743.75]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -160.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [219084.84375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -162.0 q-value:  0
loss: [350203.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1

rewards:  -170.0 q-value:  0
loss: [330279.65625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -182.0 q-value:  0
loss: [425277.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -187.0 q-value:  0
loss: [957602.5625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -183.0 q-value:  0
loss: [250573.984375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -190.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24197.53125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -194.0 q-value:  0
loss: [336532.21875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -197.0 q-value:  0
loss: [287276.09375]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -185.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [695784.0]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -187.0 q-value:  0
loss: [153484.0]
Exploring
Selected action  [0 3]
[0 3]

rewards:  17.0 q-value:  0
loss: [209486.46875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  41.0 q-value:  0
loss: [270763.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  36.0 q-value:  0
loss: [427984.59375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  40.0 q-value:  0
loss: [281886.46875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  63.0 q-value:  0
loss: [159489.78125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  27.0 q-value:  0
loss: [180508.09375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  32.0 q-value:  0
loss: [136904.84375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  29.0 q-value:  0
loss: [549666.25]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  33.0 q-value:  0
loss: [186582.5625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  45.0 q-value:  0
loss: [125732.84375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  14.0 q-value:  0
loss: [3

Epoch 1/1
rewards:  16.0 q-value:  0
loss: [22646.3125]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  20.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [348806.53125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [19379.03125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [1004859.875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [291803.28125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -33.0 q-value:  0
loss: [364744.625]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -63.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [362656.625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [449772.28125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -65.0 q-value:  0
loss: [164266.8125]
Exploring
Selected action  [3 1]
[3 1]
E

rewards:  37.0 q-value:  0
loss: [384746.46875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  57.0 q-value:  0
loss: [58670.28125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  60.0 q-value:  0
loss: [223569.78125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  59.0 q-value:  0
loss: [214233.3125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  65.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [411218.21875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  81.0 q-value:  0
loss: [61313.96875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  89.0 q-value:  0
loss: [35760.90625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  84.0 q-value:  0
loss: [52723.8125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  67.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [61653.15625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
re

Epoch 1/1
rewards:  69.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [28333.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  85.0 q-value:  0
loss: [63107.34375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  93.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [28446.375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  97.0 q-value:  0
loss: [21638.03125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  90.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [40659.85546875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  83.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [47393.0]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  78.0 q-value:  0
loss: [74734.59375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  85.0 q-value:  0
loss: [44546.46875]
Exploring
S

rewards:  -13.0 q-value:  0
loss: [44033.4375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -24.0 q-value:  0
loss: [47028.4375]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -39.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [29849.46875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [38531.71875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [55951.59375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -54.0 q-value:  0
loss: [44369.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [57103.875]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -59.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [52490.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -64.0 q-value:  0
loss: [31728.6875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1


Epoch 1/1
rewards:  -230.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [66360.34375]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -226.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [47049.46875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -231.0 q-value:  0
loss: [64587.90625]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -223.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [48039.875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -219.0 q-value:  0
loss: [57105.09375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -215.0 q-value:  0
loss: [28737.34375]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -216.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [33247.6875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -217.0 q-value:  0
loss: [33137.31

rewards:  -318.0 q-value:  0
loss: [74921.65625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -319.0 q-value:  0
loss: [52000.125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -331.0 q-value:  0
loss: [45468.0]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -308.0 q-value:  0
loss: [39008.84375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -326.0 q-value:  0
loss: [24636.25]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -314.0 q-value:  0
loss: [37181.75]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -316.0 q-value:  0
loss: [37314.625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -309.0 q-value:  0
loss: [33569.96875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -314.0 q-value:  0
loss: [44977.78125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -282.0 q-value:  0
loss: [52684.1875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -286.0 q-value:  0
loss: 

Epoch 1/1
rewards:  -373.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [56353.90625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -378.0 q-value:  0
loss: [76096.25]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -398.0 q-value:  0
loss: [34384.0625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -398.0 q-value:  0
loss: [48393.21875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -398.0 q-value:  0
loss: [39767.28125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -398.0 q-value:  0
loss: [36991.90625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -398.0 q-value:  0
loss: [44770.53125]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -398.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [76856.21875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -403.0 q-value:  0
loss: [51018.21875]
Exploring
Selected action  [0 

Epoch 1/1
rewards:  27.0 q-value:  0
loss: [38622.125]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  35.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [70217.875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  9.0 q-value:  0
loss: [66261.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [33436.125]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  12.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [36255.25]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [79304.875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  1.0 q-value:  0
loss: [46050.09375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -3.0 q-value:  0
loss: [21216.34375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  9.0 q-value:  0
loss: [32823.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  4.

rewards:  -53.0 q-value:  0
loss: [72552.46875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -33.0 q-value:  0
loss: [74295.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [48103.0]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -19.0 q-value:  0
loss: [34793.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -24.0 q-value:  0
loss: [55645.0]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -23.0 q-value:  0
loss: [62713.8125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [36093.53125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -74.0 q-value:  0
loss: [65071.625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -70.0 q-value:  0
loss: [41786.78125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -62.0 q-value:  0
loss: [93768.6875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -67.0 q-value:  [[0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  -117.0 q-value:  0
loss: [54063.53125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -89.0 q-value:  0
loss: [28948.78125]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -89.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [59349.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -94.0 q-value:  0
loss: [84655.59375]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -94.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [71641.46875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [56187.8125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -103.0 q-value:  0
loss: [54886.75]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -91.0 q-value:  0
loss: [80624.0625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -96.0 q-value:  0
loss: [51833.1875]
Exploring
Selected action  [4 0]
[4 0]
E

Epoch 1/1
rewards:  -221.0 q-value:  0
loss: [15810.40625]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -221.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [74838.71875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -221.0 q-value:  0
loss: [44553.5625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -221.0 q-value:  0
loss: [82683.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -226.0 q-value:  0
loss: [56483.53125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -226.0 q-value:  0
loss: [51365.09375]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -234.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [52539.1875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -234.0 q-value:  0
loss: [48053.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -239.0 q-value:  0
loss: [25476.84375]
Exploiting
Selected action  [0

Epoch 1/1
rewards:  7.0 q-value:  0
loss: [44114.40625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  1.0 q-value:  0
loss: [29409.84375]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  0.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [48559.4375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [49234.84375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [65674.40625]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -42.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [79474.53125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [22989.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [29322.875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -22.0 q-value:  0
loss: [71809.5625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1

Epoch 1/1
rewards:  -45.0 q-value:  0
loss: [51901.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -50.0 q-value:  0
loss: [34478.0]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -52.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [64134.46875]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -44.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [36584.15625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -49.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [79895.0625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [75014.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -61.0 q-value:  0
loss: [40895.78125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -68.0 q-value:  0
loss: [37212.1875]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -70.0 q-value: 

rewards:  -62.0 q-value:  0
loss: [69158.8125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -67.0 q-value:  0
loss: [78267.125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -87.0 q-value:  0
loss: [59509.09375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -79.0 q-value:  0
loss: [63068.15625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [41316.09375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -72.0 q-value:  0
loss: [31646.75]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -89.0 q-value:  0
loss: [55060.96875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [59996.21875]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -55.0 q-value:  0
loss: [39846.65625]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -95.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [99883.21875]
Exploring
Selected 

Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  79.0 q-value:  0
loss: [25320.875]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  74.0 q-value:  0
loss: [69239.03125]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  73.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [64523.0625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  71.0 q-value:  0
loss: [45708.375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  70.0 q-value:  0
loss: [63936.4375]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  94.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [52878.78125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  99.0 q-value:  0
loss: [16114.0625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  96.0 q-value:  0
loss: [60282.0]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  101.0 q-value:  0
loss: [27473.6875]
Exploring
Selec

Epoch 1/1
rewards:  220.0 q-value:  0
loss: [43050.25]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  215.0 q-value:  0
loss: [90242.28125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  237.0 q-value:  0
loss: [47262.71875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  219.0 q-value:  0
loss: [50008.0]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  206.0 q-value:  0
loss: [52003.34375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  203.0 q-value:  0
loss: [17513.25]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  202.0 q-value:  0
loss: [43536.21875]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  182.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [45041.5625]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  206.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [47595.59375]
Exploiting
Selected action  [3 4]
[3 4]
Epoch

Epoch 1/1
rewards:  -51.0 q-value:  0
loss: [34045.34375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [63721.6875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [80159.21875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -93.0 q-value:  0
loss: [28867.96875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [77471.125]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -104.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [47630.9375]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -88.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [65239.0625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -69.0 q-value:  0
loss: [61506.25]
Exploiting
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -96.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

Epoch 1/1
rewards:  -234.0 q-value:  0
loss: [38829.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -239.0 q-value:  0
loss: [40219.28125]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -219.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [50857.28125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -223.0 q-value:  0
loss: [37035.0625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -233.0 q-value:  0
loss: [29400.90625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -233.0 q-value:  0
loss: [32363.34375]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -225.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [77105.5]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -225.0 q-value:  0
loss: [56396.8125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -229.0 q-value:  0
loss: [69982.40625]
Exploring
Selected action  [0 3]

Epoch 1/1
rewards:  -304.0 q-value:  0
loss: [52732.6875]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -304.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [47703.34375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -304.0 q-value:  0
loss: [37568.09375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -326.0 q-value:  0
loss: [47886.5]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -322.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [40936.03125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -319.0 q-value:  0
loss: [69390.0625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -321.0 q-value:  0
loss: [34283.21875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -313.0 q-value:  0
loss: [27434.6875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -318.0 q-value:  0
loss: [50690.09375]
Exploring
Selected action  [3 0]

Epoch 1/1
rewards:  -7.0 q-value:  0
loss: [53213.5625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -12.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [35345.0625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  1.0 q-value:  0
loss: [37026.59375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [66946.46875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [48341.09375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [61454.84375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -35.0 q-value:  0
loss: [59065.71875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -3.0 q-value:  0
loss: [41701.9375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  15.0 q-value:  0
loss: [84517.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [43687.21875]
Exploiting
Se

Epoch 1/1
rewards:  47.0 q-value:  0
loss: [22966.125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  55.0 q-value:  0
loss: [41223.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  50.0 q-value:  0
loss: [65671.0625]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  61.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [67672.0625]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  57.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [49650.625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  68.0 q-value:  0
loss: [38455.59375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  84.0 q-value:  0
loss: [22417.375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  76.0 q-value:  0
loss: [43578.15625]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  76.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss:

Epoch 1/1
rewards:  25.0 q-value:  0
loss: [50309.28125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  24.0 q-value:  0
loss: [40938.6875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  27.0 q-value:  0
loss: [17286.875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  22.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [51374.625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  17.0 q-value:  0
loss: [39420.46875]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  31.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [31463.96875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  27.0 q-value:  0
loss: [29234.34375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  45.0 q-value:  0
loss: [38243.40625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  32.0 q-value:  0
loss: [40280.40625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1

rewards:  61.0 q-value:  0
loss: [36747.40625]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  91.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [38121.65625]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  89.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [23806.53125]
Number of actions available 6
Episode : 34
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -6.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [77959.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [31203.71875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  6.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [30707.375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  4.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [29825.21875]
Exploitin

Epoch 1/1
rewards:  65.0 q-value:  0
loss: [14315.65625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  55.0 q-value:  0
loss: [52538.78125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  55.0 q-value:  0
loss: [28718.53125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  55.0 q-value:  0
loss: [28781.125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  50.0 q-value:  0
loss: [40468.15625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  50.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [36671.96875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  50.0 q-value:  0
loss: [35000.375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  45.0 q-value:  0
loss: [57500.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  40.0 q-value:  0
loss: [39910.53125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  40.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

rewards:  -11.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [30768.0625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [46173.28125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -28.0 q-value:  0
loss: [48638.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -45.0 q-value:  0
loss: [74727.625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -9.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [66992.625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -13.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [54162.375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -15.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [72657.25]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -17.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  21.0 q-value:  0
loss: [36800.875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  29.0 q-value:  0
loss: [69963.90625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  22.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [28276.21875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  15.0 q-value:  0
loss: [49602.78125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  17.0 q-value:  0
loss: [40881.90625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -25.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [68707.5]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -27.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [64386.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [68079.96875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -8.0 q-value:  0
l

Epoch 1/1
rewards:  18.0 q-value:  0
loss: [57239.34375]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -12.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [31871.71875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [53210.9375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -10.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [39782.71875]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -6.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [46293.84375]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -28.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [26696.1875]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -5.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [40216.71875]
Exploring
Selected action  [0 3]
[0 

rewards:  -78.0 q-value:  0
loss: [48599.9375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [69281.84375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -86.0 q-value:  0
loss: [46611.84375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -79.0 q-value:  0
loss: [51789.0625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -82.0 q-value:  0
loss: [36817.40625]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -85.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [52300.21875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -73.0 q-value:  0
loss: [61454.78125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [25853.0625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -76.0 q-value:  0
loss: [50596.75]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -72.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Epoch 1/1
rewards:  -241.0 q-value:  0
loss: [29482.40625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -231.0 q-value:  0
loss: [51192.625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -242.0 q-value:  0
loss: [26923.59375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -247.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [44389.625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -243.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [58964.28125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -250.0 q-value:  0
loss: [66030.96875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -234.0 q-value:  0
loss: [24209.5]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -206.0 q-value:  0
loss: [35981.71875]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -221.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  -2.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [44945.625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  22.0 q-value:  0
loss: [42195.34375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  7.0 q-value:  0
loss: [33144.4375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  31.0 q-value:  0
loss: [91312.03125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  29.0 q-value:  0
loss: [37691.34375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  16.0 q-value:  0
loss: [37496.75]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [32839.84375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  20.0 q-value:  0
loss: [74718.125]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  36.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [30569.5]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards

Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [61233.875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -14.0 q-value:  0
loss: [35805.03125]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  10.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [40733.4375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  29.0 q-value:  0
loss: [69924.90625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  16.0 q-value:  0
loss: [19627.4375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  21.0 q-value:  0
loss: [61600.53125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  15.0 q-value:  0
loss: [34996.21875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [21426.96875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  4.0 q-value:  0
loss: [70305.21875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [52416.8125]
Exploring
Selected 

Epoch 1/1
rewards:  13.0 q-value:  0
loss: [18727.1875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  11.0 q-value:  0
loss: [52454.21875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  18.0 q-value:  0
loss: [25107.8125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  8.0 q-value:  0
loss: [31047.6875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  16.0 q-value:  0
loss: [36590.21875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -6.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17204.5625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -26.0 q-value:  0
loss: [24232.96875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [32885.46875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -23.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [33559.15625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 

Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [15034.59375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [11277.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -63.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [44434.8125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -67.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [102980.34375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -72.0 q-value:  0
loss: [17261.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -77.0 q-value:  0
loss: [34083.03125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [17058.59375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -86.0 q-value:  0
loss: [17626.4375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -62.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [32068.34375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [10998.09375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [50006.28125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -111.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [21954.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -118.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [22008.46875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [38813.65625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [27707.1875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -130.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [29596.15625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards: 

rewards:  2.0 q-value:  0
loss: [25261.4375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -6.0 q-value:  0
loss: [6692.6875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [23793.09375]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -58.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [9490.15625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [34674.3125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -39.0 q-value:  0
loss: [16854.375]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -31.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11010.71875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [28818.09375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -41.0 q-value:  0
loss: [42908.28125]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewa

Epoch 1/1
rewards:  -98.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17132.375]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -99.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [26947.0625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [15472.1875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -109.0 q-value:  0
loss: [23313.21875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -117.0 q-value:  0
loss: [9874.1875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -123.0 q-value:  0
loss: [22775.78125]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -147.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [14811.1875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [15378.28125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -132.0 q

Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [24947.1875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -15.0 q-value:  0
loss: [17429.09375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [17520.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -16.0 q-value:  0
loss: [20015.46875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -12.0 q-value:  0
loss: [24297.03125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [12661.65625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -15.0 q-value:  0
loss: [24776.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -47.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [8932.5625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [30194.84375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -51.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  64.0 q-value:  0
loss: [13919.84375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  59.0 q-value:  0
loss: [20489.375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  87.0 q-value:  0
loss: [10616.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  82.0 q-value:  0
loss: [15552.96875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  82.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19273.78125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  82.0 q-value:  0
loss: [23245.125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  82.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [22004.84375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  65.0 q-value:  0
loss: [19258.3125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  85.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
lo

rewards:  142.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [32069.15625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  140.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24750.75]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  117.0 q-value:  0
loss: [18791.75]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  121.0 q-value:  0
loss: [19060.34375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  144.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18675.0]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  104.0 q-value:  0
loss: [16492.3125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  136.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27689.90625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  152.0 q-value:  0
loss: [17748.90625]
Exploring
Selected ac

Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [10650.25]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [15638.5]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [16405.75]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -2.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13021.6875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [13111.1875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [14816.9375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [9758.6875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [22801.15625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -7.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [20089.0]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -14.

rewards:  -71.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [9154.34375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [23134.25]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [12908.4375]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -47.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19963.375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -87.0 q-value:  0
loss: [18620.3125]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -117.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27783.28125]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -117.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19318.75]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -93.0 q-value:  0
loss: [13427.21875]
Exploring
Selected 

Epoch 1/1
rewards:  -276.0 q-value:  0
loss: [18620.34375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -277.0 q-value:  0
loss: [12212.375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -282.0 q-value:  0
loss: [28790.46875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -278.0 q-value:  0
loss: [20956.53125]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -284.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [16894.78125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -274.0 q-value:  0
loss: [20760.65625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -271.0 q-value:  0
loss: [34260.28125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -273.0 q-value:  0
loss: [15366.8125]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -257.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [14349.71875]
Exploring
Selected action  [

Epoch 1/1
rewards:  -406.0 q-value:  0
loss: [16833.3125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -382.0 q-value:  0
loss: [23205.625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -384.0 q-value:  0
loss: [25533.75]
Number of actions available 7
Episode : 41
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [37870.75]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -8.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24048.9375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [22450.125]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -7.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27391.375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -10.0 q-value:  0
loss: [21277.9375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  2.0 q-value:  0
loss: [21800.3125]
Exploring

Epoch 1/1
rewards:  56.0 q-value:  0
loss: [29825.34375]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  80.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [15353.8125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  104.0 q-value:  0
loss: [19033.375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  99.0 q-value:  0
loss: [23623.0625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  94.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19455.5625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  89.0 q-value:  0
loss: [18308.09375]
Exploiting
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  95.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [30301.96875]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  119.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [15091.78125]
Exploring


rewards:  200.0 q-value:  0
loss: [22656.0625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  197.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [33081.6875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  192.0 q-value:  0
loss: [23047.46875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  187.0 q-value:  0
loss: [32507.96875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  199.0 q-value:  0
loss: [31175.4375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  194.0 q-value:  0
loss: [30178.8125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  189.0 q-value:  0
loss: [17441.15625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  193.0 q-value:  0
loss: [23994.34375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  209.0 q-value:  0
loss: [31273.09375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  204.0 q-value:  0
loss: [12537.59375]
Exploiting
Select

Epoch 1/1
rewards:  -2.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [14796.0625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -19.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [25044.5625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -15.0 q-value:  0
loss: [16870.875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -25.0 q-value:  0
loss: [25219.15625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -33.0 q-value:  0
loss: [36372.15625]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -25.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [20061.65625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -30.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [25335.25]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -62.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 

rewards:  -188.0 q-value:  0
loss: [30049.6875]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -171.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [21184.15625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -175.0 q-value:  0
loss: [17030.3125]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -178.0 q-value:  0
loss: [18850.5625]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -150.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [28099.75]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -147.0 q-value:  0
loss: [12493.5625]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -125.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [25156.25]
Exploiting
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -101.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19042.15625]
Exploiting


Epoch 1/1
rewards:  -42.0 q-value:  0
loss: [18410.46875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -54.0 q-value:  0
loss: [12913.15625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [20696.09375]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [25021.1875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -46.0 q-value:  0
loss: [24837.09375]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -18.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [29580.75]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -25.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19078.84375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -52.0 q-value:  0
loss: [20570.78125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -57.0 q-value:  0
loss: [10997.6875]
Exploring
Selected action  [2 4]
[2 4]


Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [22489.28125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -58.0 q-value:  0
loss: [24990.875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -60.0 q-value:  0
loss: [26254.40625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -71.0 q-value:  0
loss: [14498.3125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -65.0 q-value:  0
loss: [18668.90625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -70.0 q-value:  0
loss: [18850.5]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -46.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13786.25]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [25926.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -21.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [28407.09375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch

Epoch 1/1
rewards:  -97.0 q-value:  0
loss: [38699.5]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -119.0 q-value:  0
loss: [17354.0]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -121.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19683.53125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -122.0 q-value:  0
loss: [30842.15625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [25128.8125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -119.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [30382.28125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -122.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13589.96875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -133.0 q-value:  0
loss: [21627.125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -126.0 q-

Epoch 1/1
rewards:  -73.0 q-value:  0
loss: [27615.34375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [26161.375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -82.0 q-value:  0
loss: [35912.3125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -84.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [46019.71875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [21234.46875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -91.0 q-value:  0
loss: [31476.96875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -60.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [31106.0]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -40.0 q-value:  0
loss: [35187.3125]
Number of actions available 4
Episode : 44
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [26833.625]
Ex

rewards:  -156.0 q-value:  0
loss: [10518.46875]
Exploiting
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -162.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18490.90625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -140.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12918.78125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -157.0 q-value:  0
loss: [36536.125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -149.0 q-value:  0
loss: [15324.34375]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -154.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18673.21875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -159.0 q-value:  0
loss: [20338.15625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -151.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18467.8125]
Expl

rewards:  -183.0 q-value:  0
loss: [15629.28125]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -176.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [20984.34375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -173.0 q-value:  0
loss: [21914.21875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -175.0 q-value:  0
loss: [19151.53125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -180.0 q-value:  0
loss: [12349.75]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -192.0 q-value:  0
loss: [14537.03125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -203.0 q-value:  0
loss: [18160.9375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -204.0 q-value:  0
loss: [32679.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -209.0 q-value:  0
loss: [22260.65625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -210.0 q-value:  0
loss: [13274.34375]
Exploit

Epoch 1/1
rewards:  -123.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [7025.03125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -128.0 q-value:  0
loss: [19173.40625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -115.0 q-value:  0
loss: [20900.9375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -117.0 q-value:  0
loss: [19044.90625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -119.0 q-value:  0
loss: [9047.4375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -124.0 q-value:  0
loss: [9155.28125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  -98.0 q-value:  0
loss: [17945.9375]
Number of actions available 9
Episode : 45
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [31071.25]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  4.0 q-value:  0
loss: [14299.46875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -1.0 

Epoch 1/1
rewards:  -77.0 q-value:  0
loss: [17317.59375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -103.0 q-value:  0
loss: [26899.34375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [18327.4375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -110.0 q-value:  0
loss: [12998.3125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -106.0 q-value:  0
loss: [30510.21875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -108.0 q-value:  0
loss: [18108.78125]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -110.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11672.5]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -98.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11809.46875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -111.0 q-value:  0
loss: [15971.9375]
Exploring
Selected action  [1 3]
[

Epoch 1/1
rewards:  -75.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13826.15625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [17963.71875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [16680.375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -86.0 q-value:  0
loss: [18493.0625]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -68.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [19490.09375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -71.0 q-value:  0
loss: [20237.03125]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -73.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [27931.375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -75.0 q-value:  0
loss: [12298.96875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -72.0 q-valu

rewards:  -164.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13201.28125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -155.0 q-value:  0
loss: [13002.5625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -170.0 q-value:  0
loss: [22028.8125]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -170.0 q-value:  0
loss: [16549.96875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -170.0 q-value:  0
loss: [15553.8125]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -170.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18589.0625]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -170.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12562.1875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -158.0 q-value:  0
loss: [15406.6875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -126.0 q-value:

rewards:  90.0 q-value:  0
loss: [17273.03125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  84.0 q-value:  0
loss: [16354.6875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  67.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12459.90625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  62.0 q-value:  0
loss: [12026.03125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  82.0 q-value:  0
loss: [10256.03125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  110.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [15038.65625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  134.0 q-value:  0
loss: [11426.625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  142.0 q-value:  0
loss: [13594.75]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  137.0 q-value:  0
loss: [23150.21875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewar

rewards:  189.0 q-value:  0
loss: [24052.0]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  169.0 q-value:  0
loss: [16915.3125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  185.0 q-value:  0
loss: [15870.34375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  183.0 q-value:  0
loss: [13947.3125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  183.0 q-value:  0
loss: [17256.375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  166.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12155.65625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  166.0 q-value:  0
loss: [14977.90625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  144.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17619.4375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  137.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [

rewards:  3.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13527.03125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [23425.0]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  11.0 q-value:  0
loss: [15215.0625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  24.0 q-value:  0
loss: [23660.625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  30.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24333.46875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  45.0 q-value:  0
loss: [24941.75]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  69.0 q-value:  0
loss: [14031.53125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  60.0 q-value:  0
loss: [19065.0]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  53.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18528.125]
Explo

rewards:  106.0 q-value:  0
loss: [24834.625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  132.0 q-value:  0
loss: [22796.4375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  126.0 q-value:  0
loss: [21970.03125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  121.0 q-value:  0
loss: [24750.90625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  145.0 q-value:  0
loss: [12675.5]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  158.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18786.9375]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  155.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17752.28125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  150.0 q-value:  0
loss: [17254.0625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  162.0 q-value:  0
loss: [14535.625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
reward

rewards:  116.0 q-value:  0
loss: [26217.40625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  111.0 q-value:  0
loss: [22280.96875]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  147.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [18053.75]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  178.0 q-value:  0
loss: [13565.65625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  160.0 q-value:  0
loss: [24894.4375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  184.0 q-value:  0
loss: [15872.65625]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  182.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17600.03125]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  175.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [20862.75]
Exploiting
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  172.0 q-value:  [[0. 0

Epoch 1/1
rewards:  -37.0 q-value:  0
loss: [18844.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -42.0 q-value:  0
loss: [14180.75]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -42.0 q-value:  0
loss: [25200.9375]
Exploiting
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -34.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11875.25]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [20382.75]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  10.0 q-value:  0
loss: [9542.375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  14.0 q-value:  0
loss: [10227.84375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  6.0 q-value:  0
loss: [18233.90625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  1.0 q-value:  0
loss: [19536.9375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -10.0 q-value:  0
loss: [14655.65625]
Exploring
Selected action

Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [22233.90625]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -8.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12186.9375]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  23.0 q-value:  0
loss: [10781.5625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  23.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [14510.03125]
Exploiting
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  -3.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24331.8125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  16.0 q-value:  0
loss: [29958.59375]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  52.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [29882.28125]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  96.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Epoch 1/1
rewards:  280.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [11341.875]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  269.0 q-value:  0
loss: [14451.84375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  264.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17851.4375]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  259.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [15738.0625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  281.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [24677.1875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  285.0 q-value:  0
loss: [15124.0]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  284.0 q-value:  0
loss: [14642.59375]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  283.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Epoch 1/1
rewards:  24.0 q-value:  0
loss: [18165.1875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  20.0 q-value:  0
loss: [16954.28125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  7.0 q-value:  0
loss: [7137.78125]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -5.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17323.21875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  7.0 q-value:  0
loss: [19685.4375]
Exploiting
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  5.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [9752.21875]
Exploiting
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -22.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [17919.15625]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -12.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [23129.8125]
Exploring
Sel

Epoch 1/1
rewards:  10.0 q-value:  0
loss: [15201.3125]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [24263.71875]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  16.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [5072.71875]
Exploiting
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  44.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [9551.90625]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  60.0 q-value:  0
loss: [12428.4375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  52.0 q-value:  0
loss: [12386.90625]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  84.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [5957.03125]
Exploiting
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  82.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12138.78125]
Exploring
Sel

Epoch 1/1
rewards:  12.0 q-value:  0
loss: [11122.1875]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  7.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [15729.59375]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  7.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [12709.90625]
Exploiting
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  7.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [9501.34375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  15.0 q-value:  0
loss: [11317.5]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  19.0 q-value:  0
loss: [13718.28125]
Exploiting
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  51.0 q-value:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
loss: [13639.4375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  45.0 q-value:  0
loss: [11037.53125]
Exploiting
Select

In [None]:
# make directory
if not os.path.exists("saved_pickle_files"):
    os.mkdir("saved_pickle_files")

# save rewards_per_episode
save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")


# plot results
with open('saved_pickle_files/rewards_per_episode.pkl', 'rb') as f:
    rewards_per_episode = pickle.load(f)

plt.plot(list(range(len(rewards_per_episode))), rewards_per_episode)
plt.xlabel("episode number")
plt.ylabel("reward per episode")

# save plots in saved_plots/ directory
plt.savefig('rewards.png')

print("Average reward of last 100 episodes is {0}".format(np.mean(rewards_per_episode[-100:])))

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()